def lassolarscv():
    """Fit LassoLarsCV on the module-level training data and write test predictions.

    Reads the globals ``base_X``/``base_Y`` (training data) and ``X_test``,
    then dumps the predictions via ``write_to_file``. Returns nothing.
    """
    print("Doing cross-validated LassoLars")
    # Shuffle-split CV over the training rows (old sklearn API: n_iter).
    splitter = cross_validation.ShuffleSplit(
        len(base_X), n_iter=5, test_size=0.2, random_state=0)
    model = LassoLarsCV(cv=splitter)
    model.fit(base_X, base_Y)
    # NOTE: this is the training-set score, not a held-out score.
    print("Score = %f" % model.score(base_X, base_Y))
    predictions = model.predict(X_test)
    write_to_file("lassolars.csv", predictions)
def lasso_regr(wine_set):
    """Train a cross-validated LassoLars model on wine chemistry features.

    Standardizes the predictors, fits LassoLarsCV predicting ``quality``,
    prints coefficients and train/test MSE and R^2, and shows two plots
    (coefficient paths and per-fold CV error). Returns nothing.
    """
    pred = wine_set[["density", 'alcohol', 'sulphates', 'pH', 'volatile_acidity',
                     'chlorides', 'fixed_acidity', 'citric_acid', 'residual_sugar',
                     'free_sulfur_dioxide', 'total_sulfur_dioxide']]
    predictors = pred.copy()
    targets = wine_set.quality

    # Standardize predictors to have mean=0 and sd=1; restore column labels
    # lost by preprocessing.scale (which returns a bare ndarray).
    predictors = pd.DataFrame(preprocessing.scale(predictors))
    predictors.columns = pred.columns

    # Hold out 30% of the rows for testing.
    pred_train, pred_test, tar_train, tar_test = train_test_split(
        predictors, targets, test_size=.3, random_state=123)

    # Fit the lasso regression model with 10-fold CV over alpha.
    model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

    print('Predictors and their regression coefficients:')
    coef_by_name = dict(zip(predictors.columns, model.coef_))
    for name, coef in coef_by_name.items():
        print(name, ':', coef)

    # Coefficient progression along the regularization path.
    m_log_alphas = -np.log10(model.alphas_)
    plt.plot(m_log_alphas, model.coef_path_.T)
    print('\nAlpha:', model.alpha_)
    plt.axvline(-np.log10(model.alpha_), linestyle="dashed", color='k',
                label='alpha CV')
    plt.ylabel("Regression coefficients")
    plt.xlabel("-log(alpha)")
    plt.title('Regression coefficients progression for Lasso paths')
    plt.show()

    # Mean squared error across each CV fold versus alpha.
    m_log_alphascv = -np.log10(model.cv_alphas_)
    plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
    plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.legend()
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.show()

    # Final train/test diagnostics.
    train_error = mean_squared_error(tar_train, model.predict(pred_train))
    test_error = mean_squared_error(tar_test, model.predict(pred_test))
    print('\nMean squared error for training data:', train_error)
    print('Mean squared error for test data:', test_error)

    rsquared_train = model.score(pred_train, tar_train)
    rsquared_test = model.score(pred_test, tar_test)
    print('\nR-square for training data:', rsquared_train)
    print('R-square for test data:', rsquared_test)
def lasso(X, y, value):
    """Fit a 10-fold cross-validated LassoLars on (X, y) and predict `value`.

    Returns the array of predictions for `value`.
    """
    model = LassoLarsCV(cv=10, precompute=False)
    model.fit(X, y)
    return model.predict(value)
# Tail of the coordinate-descent (LassoCV) plot set up earlier; `t_lasso_cv`,
# `ymin`, `ymax`, `X` and `y` come from code above this excerpt.
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent '
          '(train time: %.2fs)' % t_lasso_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)

##############################################################################
# LassoLarsCV: least angle regression

# Compute the regularization path with 20-fold CV, timing the fit.
print("Computing regularization path using the Lars lasso...")
t1 = time.time()
model = LassoLarsCV(cv=20).fit(X, y)
t_lasso_lars_cv = time.time() - t1

# Display results: per-fold CV error versus -log10(alpha), plus the fold
# average and a vertical line at the alpha chosen by cross-validation.
m_log_alphas = -np.log10(model.cv_alphas_)

plt.figure()
plt.plot(m_log_alphas, model.cv_mse_path_, ':')
plt.plot(m_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
def _quickml_score(preds, y_test, metric):
    """Score `preds` against `y_test` with `metric` (rmse or accu), averaged.

    Returns 0 when `y_test` is the '' sentinel meaning "no labels available".
    """
    if not isinstance(y_test, str):
        return metric(preds, y_test).mean()
    return 0


def QuickML_Ensembling(X_train, y_train, X_test, y_test='', modeltype='Regression',
                       Boosting_Flag=False, scoring='', verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set(only numerics).

    Fits four complementary estimators (chosen by `modeltype` and
    `Boosting_Flag`), scores each on `y_test` when labels are supplied
    (rmse for regression, accu for classification; 0 otherwise), and
    returns (estimator_names, stacks) where `stacks` is the column-stacked
    test-set predictions of the four models.

    `Boosting_Flag` semantics: None -> include a bagging model in slot 1;
    truthy -> CV linear model in slot 1 and bagging in slot 4;
    falsy -> CV linear model in slot 1 and boosting in slot 4.
    """
    start_time = time.time()
    seed = 99
    # Heuristic sizing: smaller/narrower data gets fewer estimators and folds.
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    ## create Voting models
    estimators = []
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            model5 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS, random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            metrics1 = _quickml_score(results1, y_test, rmse)
            estimators.append(('Bagging1', model5, metrics1))
        else:
            model5 = LassoLarsCV(cv=scv)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            metrics1 = _quickml_score(results1, y_test, rmse)
            estimators.append(('LassoLarsCV Regression', model5, metrics1))
        model6 = LassoCV(alphas=np.logspace(-10, -1, 50), cv=scv, random_state=seed)
        results2 = model6.fit(X_train, y_train).predict(X_test)
        metrics2 = _quickml_score(results2, y_test, rmse)
        estimators.append(('LassoCV Regularization', model6, metrics2))
        model7 = RidgeCV(alphas=np.logspace(-10, -1, 50), cv=scv)
        results3 = model7.fit(X_train, y_train).predict(X_test)
        metrics3 = _quickml_score(results3, y_test, rmse)
        estimators.append(('RidgeCV Regression', model7, metrics3))
        ## Create an ensemble model ####
        if Boosting_Flag:
            # Boosting already present upstream, so add a bagging model here.
            model8 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS, random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            metrics4 = _quickml_score(results4, y_test, rmse)
            estimators.append(('Bagging2', model8, metrics4))
        else:
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                min_samples_leaf=2, max_depth=1, random_state=seed),
                n_estimators=NUMS, random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            metrics4 = _quickml_score(results4, y_test, rmse)
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if verbose >= 2:
            print('QuickML_Ensembling Model results:')
            print(' %s = %0.4f \n %s = %0.4f\n %s = %0.4f \n %s = %0.4f'
                  % (estimator_names[0], metrics1, estimator_names[1], metrics2,
                     estimator_names[2], metrics3, estimator_names[3], metrics4))
    else:
        if scoring == '':
            scoring = 'accuracy'
        # NOTE(review): newer sklearn raises when random_state is set without
        # shuffle=True on StratifiedKFold; kept as-is to preserve behavior.
        scv = StratifiedKFold(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            model5 = ExtraTreesClassifier(n_estimators=NUMS, min_samples_leaf=2,
                                          random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            metrics1 = _quickml_score(results1, y_test, accu)
            estimators.append(('Bagging', model5, metrics1))
        else:
            model5 = LogisticRegressionCV(Cs=np.linspace(0.01, 100, 20), cv=scv,
                                          scoring=scoring, random_state=seed)
            results1 = model5.fit(X_train, y_train).predict(X_test)
            metrics1 = _quickml_score(results1, y_test, accu)
            estimators.append(('Logistic Regression', model5, metrics1))
        model6 = LinearDiscriminantAnalysis()
        results2 = model6.fit(X_train, y_train).predict(X_test)
        metrics2 = _quickml_score(results2, y_test, accu)
        estimators.append(('Linear Discriminant', model6, metrics2))
        # Naive Bayes variants cannot handle negative features, so fall back
        # to a shallow tree when any negative value is present.
        if modeltype == 'Binary_Classification':
            if (X_train < 0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = GaussianNB()
        else:
            if (X_train < 0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = MultinomialNB()
        results3 = model7.fit(X_train, y_train).predict(X_test)
        metrics3 = _quickml_score(results3, y_test, accu)
        estimators.append(('Naive Bayes', model7, metrics3))
        if Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            #### So choose a Bagging here.
            model8 = ExtraTreesClassifier(n_estimators=NUMS, min_samples_leaf=2,
                                          random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            metrics4 = _quickml_score(results4, y_test, accu)
            estimators.append(('Bagging', model8, metrics4))
        else:
            ## Create an ensemble model ####
            model8 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                n_estimators=NUMS, random_state=seed)
            results4 = model8.fit(X_train, y_train).predict(X_test)
            metrics4 = _quickml_score(results4, y_test, accu)
            estimators.append(('Boosting', model8, metrics4))
        estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if not isinstance(y_test, str):
            if verbose >= 2:
                print('QuickML_Ensembling Model results:')
                print(' %s = %0.4f \n %s = %0.4f\n %s = %0.4f \n %s = %0.4f'
                      % (estimator_names[0], metrics1, estimator_names[1], metrics2,
                         estimator_names[2], metrics3, estimator_names[3], metrics4))
        else:
            if verbose >= 1:
                print('QuickML_Ensembling completed.')
    # Column-stack the four prediction vectors for downstream stacking.
    stacks = np.c_[results1, results2, results3, results4]
    if verbose == 1:
        print(' Time taken for Ensembling: %0.1f seconds' % (time.time() - start_time))
    return estimator_names, stacks
#########################################################
def main():
    u"""Main function for assignment 03.

    Loads the prepared diamonds data set, scales it, fits a LassoLarsCV
    model predicting price, saves two diagnostic plots (result00.png,
    result01.png), prints MSE/R^2 diagnostics, and returns
    {'model': fitted model, 'dataframe': scaled dataframe}.
    """
    # Load prepared data.
    df = return_proc_and_transf_data_set()

    # Mass is already included as mass in SI units.
    df.drop(['carat'], inplace=True, axis=1)
    # Those are dummy variables not needed in our data set anymore.
    df.drop(['price_expensive', 'price_expensive_binary'], inplace=True, axis=1)

    # A bit of error checking.
    if df.isnull().sum().sum() != 0:
        raise ValueError('Your data has unintended nulls.')

    # Cast our dataframe into float type.
    df = df.astype('float64')

    # Scale our dataframe to avoid the sparsity control of our dataframe
    # being biased against some variables.
    print('Prior to scaling:')
    print(df.describe())
    df = df.apply(preprocessing.scale)
    print('After scaling:')
    print(df.describe())
    print_separator()
    # After scaling every column mean should be ~0.
    if (df.mean().abs() > 1e-3).sum() > 0:
        raise ValueError('Scaling of your dataframe went wrong.')

    # Split into training and testing sets.
    # The predictors should not include any price variable since price was
    # used to create the output variable.
    predictors = [x for x in df.columns.tolist() if 'price' not in x]
    print('Input variables:')
    pprint(predictors, indent=4)
    input_variables = df[predictors].copy()
    output_variable = df.price.copy()  # Categorized price
    print_separator()
    input_training, input_test, output_training, output_test = train_test_split(
        input_variables, output_variable, test_size=0.3, random_state=0)

    # A few words about the LassoLarsCV:
    # LASSO: least absolute shrinkage and selection operator (discussed in
    # the course material).
    # LARS: least angle regression: algorithm for linear regression models
    # to high-dimensional data (aka 'a lot of categories').
    # Compared to simple LASSO this model uses the LARS algorithm instead of
    # the 'vanilla' 'coordinate_descent' of simple LASSO.
    # CV: cross validation: this sets the alpha parameter (referred to as
    # lambda parameter in the course video) by cross validation.
    # The alpha parameter controls the degree of sparsity of the estimated
    # coefficients; if alpha = zero then the method is the same as OLS.
    model = LassoLarsCV(
        cv=10,             # Number of folds.
        precompute=False,  # Do not precompute Gram matrix.
    ).fit(input_training, output_training)

    dict_var_lin_coefs = dict(zip(
        predictors,
        model.coef_))
    print('Result of linear model:')
    # Sorted by absolute coefficient magnitude (least important first).
    pprint(sorted([(k, v) for k, v in dict_var_lin_coefs.items()],
                  key=lambda x: abs(x[1]))
           )
    print_separator()

    # Plot coefficient progression.
    # TODO: plot those on 4 different subplots.
    model_log_alphas = -np.log10(model.alphas_)
    ax = plt.gca()
    plt.plot(model_log_alphas, model.coef_path_.T)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.ylabel('Regression Coefficients')
    plt.xlabel('-log(alpha)')
    plt.title('Regression Coefficients Progression for Lasso Paths')
    plt.legend(predictors, loc='best',)
    plt.tight_layout()
    plt.savefig('result00.png', dpi=600)
    plt.close()
    # TODO: why are the coefficients in the result very different than the
    # coefficient path?
    # There seems to be a scaling of the coefficient paths with an arbitrary,
    # almost the same, constant (194 in this case).

    # Plot mean square error for each fold.
    # NOTE(review): the comment below says "extremely low value" but zeros are
    # mapped to np.inf, so -log10 yields -inf for those entries — confirm the
    # intended behavior (a tiny positive value may have been meant).
    # To avoid getting dividebyzero warning map zero to an extremely low value.
    model.cv_alphas_ = list(
        map(lambda x: x if x != 0 else np.inf,
            model.cv_alphas_))
    model_log_alphas = -np.log10(model.cv_alphas_)
    plt.figure()
    plt.plot(model_log_alphas, model.cv_mse_path_, ':')
    plt.plot(model_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                label='alpha CV')
    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean squared error')
    plt.title('Mean squared error on each fold')
    plt.legend()
    plt.tight_layout()
    plt.savefig('result01.png', dpi=600)
    plt.close()

    # Mean squared error of our model.
    train_error = mean_squared_error(output_training,
                                     model.predict(input_training))
    test_error = mean_squared_error(output_test,
                                    model.predict(input_test))
    print('Training data MSE')
    print(train_error)
    print('Test data MSE')
    print(test_error)
    print_separator()

    # R-square from training and test data.
    rsquared_train = model.score(
        input_training,
        output_training)
    rsquared_test = model.score(
        input_test,
        output_test)
    print('Training data R-square')
    print(rsquared_train)
    print('Test data R-square')
    print(rsquared_test)
    print_separator()

    return {'model': model, 'dataframe': df}
if "Auto" in datasets: build_auto(AdaBoostRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), random_state = 13, n_estimators = 17), "AdaBoostAuto") build_auto(ARDRegression(normalize = True), "BayesianARDAuto") build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto") build_auto(DecisionTreeRegressor(min_samples_leaf = 2, random_state = 13), "DecisionTreeAuto", compact = False) build_auto(BaggingRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAuto") build_auto(DummyRegressor(strategy = "median"), "DummyAuto") build_auto(ElasticNetCV(cv = 3, random_state = 13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(n_estimators = 10, min_samples_leaf = 5, random_state = 13), "ExtraTreesAuto") build_auto(GBDTLMRegressor(RandomForestRegressor(n_estimators = 7, max_depth = 6, random_state = 13), LinearRegression()), "GBDTLMAuto") build_auto(GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 6, random_state = 13), ElasticNet(random_state = 13)), "XGBRFLMAuto") build_auto(GradientBoostingRegressor(init = None, random_state = 13), "GradientBoostingAuto") build_auto(HuberRegressor(), "HuberAuto") build_auto(LarsCV(cv = 3), "LarsAuto") build_auto(LassoCV(cv = 3, random_state = 13), "LassoAuto") build_auto(LassoLarsCV(cv = 3), "LassoLarsAuto") build_auto(LinearRegression(), "LinearRegressionAuto") build_auto(BaggingRegressor(LinearRegression(), max_features = 0.75, random_state = 13), "LinearRegressionEnsembleAuto") build_auto(OrthogonalMatchingPursuitCV(cv = 3), "OMPAuto") build_auto(RandomForestRegressor(n_estimators = 10, min_samples_leaf = 3, random_state = 13), "RandomForestAuto", flat = True) build_auto(RidgeCV(), "RidgeAuto") build_auto(StackingRegressor([("ridge", Ridge(random_state = 13)), ("lasso", Lasso(random_state = 13))], final_estimator = GradientBoostingRegressor(n_estimators = 7, random_state = 13)), 
"StackingEnsembleAuto") build_auto(TheilSenRegressor(n_subsamples = 31, random_state = 13), "TheilSenAuto") build_auto(VotingRegressor([("dt", DecisionTreeRegressor(random_state = 13)), ("knn", KNeighborsRegressor()), ("lr", LinearRegression())], weights = [3, 1, 2]), "VotingEnsembleAuto") build_auto(XGBRFRegressor(n_estimators = 31, max_depth = 6, random_state = 13), "XGBRFAuto") if "Auto" in datasets: build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto") build_auto(TransformedTargetRegressor(LinearRegression(), func = numpy.log, inverse_func = numpy.exp), "TransformedLinearRegressionAuto") def build_auto_hist(regressor, name):
def fit(self):
    """Run the solar (subsample LARS) variable-selection procedure.

    On each of ``self.n_repeat`` subsamples, fits LARS and converts the
    chronological active set into a rank score q_hat^k (Algorithm 2); the
    averaged scores define nested candidate sets Q(c), tuned by grid search
    on a held-out test set. Optionally (``self.lasso``) also runs
    CV-lars-lasso (LassoLarsCV) and CV-cd (LassoCV) on the full sample for
    comparison.

    Returns:
        (solar_coef, opt_c, test_error, Qc_list,
         la_list, la_vari_list, cd_list, cd_vari_list)
    """
    # 1. placeholder for the list of all q_hat^k (Algorithm 2), one per subsample
    qhat_k_container = list()

    # 2. estimate q_hat^k (the solution path) on each subsample
    for j in range(self.n_repeat):
        # a. randomly choose the sample points of this subsample
        index_subsample = np.random.choice(
            self.train_size, self.subsample_size, replace=False)
        # b/c. slice the corresponding rows of X and Y
        X_subsample = self.X_so[index_subsample]
        y_subsample = self.y_so[index_subsample]
        # d. scikit-learn requires 'y_subsample' to be a one-dimensional array
        y_subsample.shape = (y_subsample.shape[0], )
        # e. compute the solution path on this subsample using LARS
        trial_1 = Lars(n_nonzero_coefs=min(
            X_subsample.shape[1] + 1, X_subsample.shape[0] + 1))
        trial_1.fit(X_subsample, y_subsample)
        # the active set, ordered by the stage at which each variable entered
        active = trial_1.active_
        # f. turn the inclusion order into scores: earlier inclusion -> score
        # closer to 1; variables never included keep score 0
        qhat_k = np.zeros((1, self.n_dim))
        for i in active:
            qhat_k[0, i] = 1 - \
                (np.where(np.array(active) == i)[0][0]) / (self.n_dim)
        qhat_k_container.append(qhat_k)

    # 3. optionally compute CV-lars-lasso and CV-cd on the full sample
    if (self.lasso == True):
        # CV-lars-lasso with 10-fold CV
        trial_2 = LassoLarsCV(cv=10)
        # scikit-learn requires a one-dimensional y
        yy = self.y
        yy.shape = (self.sample_size, )
        trial_2.fit(self.X, yy)
        # size and content of the CV-lars-lasso active set
        la_list = len(trial_2.active_)
        la_vari_list = trial_2.active_
        # CV-cd with 10-fold CV; fixed random_state for reproducibility
        trial_3 = LassoCV(cv=10, random_state=0)
        trial_3.fit(self.X, yy)
        # size and content of the CV-cd active set
        cd_list = np.count_nonzero(trial_3.coef_)
        cd_vari_list = np.nonzero(trial_3.coef_)[0]

    # 4. average the per-subsample scores into q_hat (rows: subsamples,
    # columns: variables)
    qhat_k_container_matrix = np.concatenate(qhat_k_container, axis=0)
    qhat_value = np.mean(qhat_k_container_matrix, axis=0)

    # c/d. build Q(c) for every c on the grid from max(q_hat) down to 0.1
    Qc_list = list()
    c_seq = np.arange(max(qhat_value), 0.1, self.step_size)
    for j in c_seq:
        # Q(c) = all variables whose q_hat value is at least c
        container = list()
        for i in range(self.X.shape[1]):
            if (qhat_value[i] >= j):
                container.append(i)
        Qc_list.append(container)

    # 5. evaluate each Q(c): OLS of Y_so on the Q(c) columns of X_so,
    # scored by L2 prediction error on the test set
    test_error = list()
    for i in Qc_list:
        OLS_1 = LinearRegression()
        OLS_1.fit(self.X_so[:, i], self.y_so)
        s1 = costs_com(self.X_test[:, i], self.y_test, OLS_1)
        loss_test_1, _ = s1.L2()
        test_error.append(loss_test_1)

    # 6. pick c* as the c minimizing the test error (largest index on ties)
    test_error = np.asarray(test_error)
    min_loc_val = np.where(test_error == min(test_error))[0]
    opt_c = c_seq[min_loc_val]
    Q_opt_c = Qc_list[max(min_loc_val)]

    # 7. final OLS of Y on the selected variables Q(c*); scatter the fitted
    # coefficients into a full-length coefficient vector
    OLS_2 = LinearRegression()
    OLS_2.fit(self.X[:, Qc_list[max(min_loc_val)]], self.y)
    solar_coef = np.zeros([self.n_dim, 1])
    solar_coef[Q_opt_c, 0] = OLS_2.coef_

    # 8. empty comparison outputs when the lasso baselines were skipped
    if (self.lasso != True):
        la_list = []
        la_vari_list = []
        cd_list = []
        cd_vari_list = []

    return solar_coef, opt_c, test_error, Qc_list, la_list, la_vari_list, cd_list, cd_vari_list
# Read the test-target column from the first sheet of the xlrd workbook.
targets_test_data = []
s = wbtesttarget.sheet_by_index(0)
testtarget = s.col(0)
for row in range(0, s.nrows):
    value = (s.cell(row, 0).value)
    targets_test_data.append(value)
test_targets = targets_test_data

###############################################################################
# LassoLarsCV: least angle regression

# Compute the regularization path with 30-fold CV, timing the fit.
print("Computing regularization path using the Lars lasso...")
t1 = time.time()
model = LassoLarsCV(cv=30).fit(train_features, train_targets)
t_lasso_lars_cv = time.time() - t1

# Display results: per-fold CV error vs -log10(alpha), the fold average, and
# a vertical line at the CV-chosen alpha.
m_log_alphas = -np.log10(model.cv_alphas_)

plt.figure()
#plt.figure(figsize=(32,18), dpi=1200) # used to expose the figure at higher resolution
plt.plot(m_log_alphas, model.cv_mse_path_, ':')
plt.plot(m_log_alphas, model.cv_mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
# (Tail of high_income_flag(); its `def` line lies outside this excerpt.)
else: return 0
# Flag each row as high income based on its absolute deviation.
subset['High Income'] = subset['absolute_deviations'].apply(high_income_flag)

""" " ========================== Build LASSO Regression ========================== """
predictors = subset[variables]
targets = subset['High Income']

# Split into training and testing sets (no random_state: split varies per run).
training_data, test_data, training_target, test_target = train_test_split(
    predictors, targets, test_size=.3)

# Build the LASSO regression model with 10-fold CV.
model=LassoLarsCV(cv=10, precompute=False).fit(training_data, training_target)

""" " ========================== Evaluate LASSO Model ============================ """
# Print variable names and regression coefficients, largest coefficient first.
feature_name = list(predictors.columns.values)
feature_coefficient = list(model.coef_)
features = pd.DataFrame({'Variable':feature_name,
                         'Regression Coefficients':feature_coefficient}).sort_values(
    by='Regression Coefficients', ascending=False)
print(features.head(len(feature_name)))
#print(dict(zip(predictors.columns, model.coef_)))

# Plot coefficient progression along the regularization path.
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
# Assemble the regression matrices: one row per player-year, one column per
# projection system; the dependent variable is the actual playing time.
ivars = []
ivars2 = []
depvars = []
columns = []
for pyear in player_years:
    ivars.append([pt_projs[pyear][system] for system in proj_systems])
    depvars.append(pt_actuals[pyear]['actual'])
# Same feature layout for the current-season player-years to be projected.
for pyear in pt_projs_curr.keys():
    ivars2.append([pt_projs_curr[pyear][system] for system in proj_systems])
x = numpy.array(ivars)
x2 = numpy.array(ivars2)
y = numpy.array(depvars)

# Cross-validated LassoLars blend of the projection systems.
model_pt = LassoLarsCV(cv=cv_num)
model_pt.fit(x,y)
print("Rough PT model, to choose sample")
for system, coef in zip(proj_systems, model_pt.coef_):
    print("%40s : %f" % (system, coef))
print("%40s : %f" % ('intercept', model_pt.intercept_))

# Blended playing-time projections for the historical sample and current year.
sample_proj_pt_arr = model_pt.predict(x)
curr_proj_pt_arr = model_pt.predict(x2)
sample_proj_pt = dict(zip(player_years,sample_proj_pt_arr))
curr_proj_pt = dict(zip(pt_projs_curr.keys(),curr_proj_pt_arr))

models = {}
# Per-segment lasso fit: for each boolean mask, select rows of X_all and the
# kept columns, CV-fit LassoLars, then refit at the chosen alpha to read off
# the active coefficients. (Loop body continues beyond this excerpt.)
for i, mask in enumerate(MASK):
    X = X_all[mask,:][:,keep]
    y = y_train[mask]
    N_SEG.append(X.shape[0])

    # parameters search range
    #param_ridge_post = list(np.arange(200,400,10))
    #param_ridge_post.append(0.5)
    param_ridge_post= np.concatenate((np.arange(0.1,1,0.1),np.arange(3,5,0.1)))
    #param_ridge_post = [330, 0.5] #p=24489
    #param_ridge_post = [3.7, 0.5] #p=303

    # fit
    from sklearn.linear_model import LassoLarsCV
    from sklearn import linear_model
    # eps is machine epsilon for float64; generous iteration/alpha caps.
    lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                           max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000,
                           eps= 2.2204460492503131e-16,copy_X=True,
                           cv=5, n_jobs=2)
    lasso_cv.fit(X, y)
    """ normalize=True, lasso seems to be able to handle itself """

    # Refit a plain LassoLars at the CV-selected alpha to get a single
    # coefficient vector (fit_path=False).
    lasso_refit = linear_model.LassoLars(alpha=lasso_cv.alpha_,
                                         fit_intercept=True, normalize=True, precompute='auto',
                                         max_iter=X.shape[1]+1000,
                                         eps=2.2204460492503131e-16, copy_X=True,
                                         fit_path=False)
    lasso_refit.fit(X, y)
    active = lasso_refit.coef_
    # NOTE(review): this inner `i` shadows the outer loop index — confirm intended.
    for i, x in enumerate(active[0]):
        # (body of this conditional continues beyond this excerpt)
        if x != 0 and i > main.shape[1] - 1:
def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, _larch=None, **kws):
    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      alpha       alpha parameter for LassoLars (See Note 5) [None]
      use_lars    bool to use LassoLars instead of Lasso [True]
      cv_folds    None or number of Cross-Validation folds (See Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (See Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [None]

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  all groups must have an attribute (scalar value) for `varname`
     3.  arrayname can be one of `norm` or `dmude`
     4.  Cross-Validation:  if cv_folds is None, sqrt(len(groups)) will be
         used (rounded to integer).  if cv_repeats is None,
         sqrt(len(groups))-1 will be used (rounded).
     5.  alpha is the regularization parameter. if alpha is None it will
         be set using LassoLarsCV
    """
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            # BUG FIX: was `raise Value(...)`, a NameError at runtime.
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)
    nvals = len(groups)

    kws.update(dict(fit_intercept=fit_intercept, normalize=normalize))
    creator = LassoLars if use_lars else Lasso
    model = None

    rmse_cv = None
    if not skip_cv:
        # Default CV geometry derived from the number of groups (see Note 4).
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        if alpha is None:
            # BUG FIX: max_iter/max_n_alphas were floats (1e7); scikit-learn
            # validates these as integers.
            lcvmod = LassoLarsCV(cv=cv, max_n_alphas=int(1e7),
                                 max_iter=int(1e7), eps=1.e-12, **kws)
            lcvmod.fit(spectra, ydat)
            alpha = lcvmod.alpha_

        model = creator(alpha=alpha, **kws)
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    if alpha is None:
        # NOTE(review): reached only when skip_cv is true and alpha is None;
        # plain Lasso/LassoLars models do not expose `alpha_` after fit, so
        # this path looks broken upstream — confirm against larch usage.
        cvmod = creator(**kws)
        cvmod.fit(spectra, ydat)
        alpha = cvmod.alpha_

    if model is None:
        model = creator(alpha=alpha, **kws)

    # final fit without cross-validation
    out = model.fit(spectra, ydat)
    ypred = model.predict(spectra)
    rmse = np.sqrt(((ydat - ypred)**2).mean())

    # NOTE(review): `model.active_` exists for LassoLars but not for Lasso
    # (use_lars=False) — confirm intended support for the Lasso branch.
    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred, alpha=alpha,
                 active=model.active_, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats,
                 rmse_cv=rmse_cv, rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, fit_intercept=fit_intercept,
                 normalize=normalize, groupnames=groupnames, keywords=kws)
class LassoPredictor(Persistent):
    """Persistent wrapper around a LassoLarsCV model.

    Builds the model from a hyper-parameter dict (only the keys listed in
    extract_model_hypers are forwarded), and exposes fit/predict plus a
    cached learning-curve computation.
    """

    @contract(hypers='dict')
    def __init__(self, hypers):
        # Keep only the model-relevant hyper-parameters.
        modelHypers = self.extract_model_hypers(hypers)
        self.model = LassoLarsCV(**modelHypers)

    @timing
    def fit(self, df, features, targetCol, validationSplit=0.2):
        """Fit the model on df; returns True on success, False otherwise.

        Refuses to fit on fewer than 3 rows. A ValueError during fitting is
        logged (traceback) and reported as failure rather than raised.
        """
        print("Running fit function:")
        print(df)
        XTrain, yTrain = df2xy(df, features, targetCol)
        if XTrain.shape[0] < 3:
            print("not enough data to form a model!")
            return False
        success = True
        try:
            self.model.fit(XTrain, yTrain)
        except ValueError:
            traceback.print_exc()
            success = False
        return success

    def predict(self, df, features, targetCol):
        """Predict targetCol for df; returns the predictions or None on error."""
        XPred, _ = df2xy(df, features, targetCol)
        try:
            yPred = self.model.predict(XPred)
        except ValueError:
            traceback.print_exc()
            return None
        #df['pred' + targetCol] = yPred
        return yPred

    def lc(self):
        """Make (and cache) the learning curve for a player.

        Uses expanding 7-row splits of the most recent training set; the
        serialized (trainSizes, trainScores, testScores) tuple is cached in
        self.lcScores and returned on subsequent calls.
        """
        if self.lcScores is None:
            self.lcModel = LassoLarsCV()
            lastDate = self.dates[-1]
            X = self.XTrains[lastDate]
            y = self.yTrains[lastDate]
            # Trim to a multiple of 7 rows so every split is full-sized.
            N = len(X)
            chopOff = N - (N % 7)
            X = X.iloc[:chopOff]
            y = y.iloc[:chopOff]
            idxs = np.arange(chopOff)
            # Expanding-window splits: first i rows train, the rest test.
            cvSplits = [(idxs[:i], idxs[i:]) for i in range(7, chopOff, 7)]
            trainSizes, trainScores, testScores = \
                learning_curve(estimator=self.lcModel,
                               X=X.as_matrix(),
                               y=np.array(y),
                               cv=cvSplits,
                               train_sizes=[7],
                               n_jobs=2,
                               )
            trainSizes = [len(t[0]) for t in cvSplits]
            self.lcScores = dumps((trainSizes, trainScores, testScores))
            result = self.lcScores
        else:
            result = self.lcScores
        return result

    def get_params(self):
        """Return the (ordered) params of the first model in self.models."""
        for i, model in self.models.items():
            params = order_dict(model.get_params())
            break
        return params

    def extract_model_hypers(self, hypers):
        """Extract the parameters relevant to the model, dropping meta params."""
        params = ['verbose']
        modelHypers = {}
        for param in params:
            paramVal = hypers.get(param)
            if paramVal is not None:
                modelHypers[param] = paramVal
        modelHypers = order_dict(modelHypers)
        return modelHypers
# Standardize each predictor column to mean 0 / sd 1 (as float64).
predictors['DAYWED']=preprocessing.scale(predictors['DAYWED'].astype('float64'))
predictors['FFMC']=preprocessing.scale(predictors['FFMC'].astype('float64'))
predictors['DMC']=preprocessing.scale(predictors['DMC'].astype('float64'))
predictors['DC']=preprocessing.scale(predictors['DC'].astype('float64'))
predictors['ISI']=preprocessing.scale(predictors['ISI'].astype('float64'))
predictors['TEMP']=preprocessing.scale(predictors['TEMP'].astype('float64'))
predictors['RH']=preprocessing.scale(predictors['RH'].astype('float64'))
predictors['WIND']=preprocessing.scale(predictors['WIND'].astype('float64'))
predictors['RAIN']=preprocessing.scale(predictors['RAIN'].astype('float64'))

# split data into train and test sets
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, target, test_size=.3, random_state=123)

# specify the lasso regression model (10-fold CV over alpha)
model=LassoLarsCV(cv=10, precompute=False).fit(pred_train,tar_train)

# print variable names and regression coefficients
# NOTE(review): the result of dict(...) is discarded here — presumably this
# was a notebook cell where the value displayed implicitly; confirm.
dict(zip(predictors.columns, model.coef_))

# plot coefficient progression along the regularization path
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')

# plot mean square error for each fold
from sklearn.linear_model import LinearRegression from sklearn.linear_model import LarsCV, Lasso, LassoCV, ElasticNet, ElasticNetCV from sklearn.linear_model import LassoLars, LassoLarsCV, Ridge, RidgeCV from sklearn.model_selection import cross_val_score, KFold, GridSearchCV import xgboost as xgb models = [] models.append(("LrE", LinearRegression())) models.append(("RidCV", RidgeCV())) models.append(("LarCV", LarsCV())) models.append(("LasCV", LassoCV())) models.append(("ElNCV", ElasticNetCV())) models.append(("LaLaCV", LassoLarsCV())) models.append(("XGB", xgb.XGBRegressor())) kfold = KFold(n_splits=10) def getCVResult(models, X_learning, Y_learning): for name, model in models: cv_results = cross_val_score(model, X_learning, Y_learning, scoring='neg_mean_squared_error', cv=kfold) rmsd_scores = np.sqrt(-cv_results) print("\n[%s] Mean: %.8f Std. Dev.: %8f" %
ax2 = pl.axes([.08, .5, .05, .47]) cb = pl.colorbar(cax=ax2, ax=ax1) cb.ax.yaxis.set_ticks_position('left') cb.ax.yaxis.set_tick_params(labelcolor='white') cb.ax.yaxis.set_tick_params(labelsize=20) cb.set_ticks(np.arange(0., .8, .2)) pl.savefig(os.path.join('miyawaki', 'encoding_scores.pdf')) pl.savefig(os.path.join('miyawaki', 'encoding_scores.png')) pl.savefig(os.path.join('miyawaki', 'encoding_scores.eps')) pl.clf() ### Compute receptive fields from sklearn.linear_model import LassoLarsCV lasso = LassoLarsCV(max_iter=10,) p = (4, 2) # Mask for chosen pixel pixmask = np.zeros((10, 10), dtype=bool) pixmask[p] = 1 for index in [1780, 1951, 2131, 1935]: rf = lasso.fit(y_train, X_train[:, index]).coef_.reshape(10, 10) pl.figure(figsize=(8, 8)) # Black background pl.imshow(np.zeros_like(rf), vmin=0., vmax=1., cmap='gray') pl.imshow(np.ma.masked_equal(rf, 0.), vmin=0., vmax=0.75, interpolation="nearest", cmap=cm.bluegreen) plot_lines(pixmask, linewidth=6, color='r') pl.axis('off')
def _fit_model(x, y, names, operators, **kw):
    """Fit a LibTrafo -> LassoLarsCV pipeline on (x, y).

    Any extra keyword arguments are forwarded to LassoLarsCV.
    Returns the fitted pipeline together with its training R^2 score.
    """
    pipeline = Pipeline([
        ("trafo", LibTrafo(names, operators)),
        ("lasso", LassoLarsCV(**kw)),
    ])
    fitted = pipeline.fit(x, y)
    return fitted, fitted.score(x, y)
from sklearn.grid_search import GridSearchCV  # NOTE(review): module renamed to sklearn.model_selection in sklearn >= 0.18
from sklearn.feature_selection import SelectFromModel
from scipy.stats import gmean
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import DMatrix

# Load the pre-processed training data, indexed by row ID.
df = pd.read_csv("processed.csv", header=0, index_col="ID")
#df.TARGET.describe()
y = df["TARGET"].values

# Feature block is the contiguous label range var3..var38 (inclusive).
# DataFrame.ix was removed in pandas 1.0; .loc gives the same
# label-based slicing, and computing the slice once avoids doing it twice.
feature_block = df.loc[:, "var3":"var38"]
X = feature_block.values
X_labels = feature_block.columns.values

# L1-regularized linear model used both for feature selection and a baseline fit.
lr = LassoLarsCV()
sfm = SelectFromModel(lr, threshold=1e-3)
X_std = StandardScaler().fit_transform(X, y)
sfm.fit(X_std, y)
lr.fit(X_std, y)
#feat_imp = pd.DataFrame(lr.coef_, index=X_labels)
#feat_imp.plot(kind="bar", title="Feature Importance", use_index=False)

# Keep only features whose coefficient cleared the SelectFromModel threshold.
# get_support() is hoisted out of the comprehension: one call, not one per feature.
support = sfm.get_support()
chosen_feat = [f for f, keep in zip(X_labels, support) if keep]
#chosen_feat = pickle.load(open("feat", "rb"))
print(len(chosen_feat))
chosen_feat

# kaggle forum: -999999 is a sentinel for missing var3; remap it to 2.
df.var3 = df.var3.replace(-999999, 2)
# --------------------------- # Now we take a closer look at the receptive fields of the four marked voxels. # A voxel's `receptive field <http://en.wikipedia.org/wiki/Receptive_field>`_ # is the region of a stimulus (like an image) where the presence of an object, # like a white instead of a black pixel, results in a change in activity # in the voxel. In our case the receptive field is just the vector of 100 # regression coefficients (one for each pixel) reshaped into the 10x10 # form of the original images. Some voxels are receptive to only very few # pixels, so we use `Lasso regression # <http://en.wikipedia.org/wiki/Lasso_(statistics)>`_ to estimate a sparse # set of regression coefficients. from sklearn.linear_model import LassoLarsCV # automatically estimate the sparsity by cross-validation lasso = LassoLarsCV(max_iter=10) # Mark the same pixel in each receptive field marked_pixel = (4, 2) from matplotlib import gridspec from matplotlib.patches import Rectangle fig = plt.figure(figsize=(12, 8)) fig.suptitle('Receptive fields of the marked voxels', fontsize=25) # GridSpec allows us to do subplots with more control of the spacing gs1 = gridspec.GridSpec(2, 3) # we fit the Lasso for each of the three voxels of the upper row for i, index in enumerate([1780, 1951, 2131]):
lambda_ = datas['cat_data']['lambda'] shift = datas['cat_data']['shift'] # models models = {} models["RF"] = GridSearchCV( RFR(n_jobs=-1), param_grid={ "n_estimators": [10, 100, 1000, 10000], "max_features": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] }, cv=5, n_jobs=20) models["LASSO"] = LassoCV(max_iter=100000, cv=5, n_jobs=20) models["RIDGE"] = RidgeCV(cv=5) models["LASSOLARS"] = LassoLarsCV(max_iter=5000, cv=5, n_jobs=-1) models["SVR_POLY2"] = GridSearchCV( SVR(kernel='poly', degree=2), param_grid={ "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], "gamma": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], "epsilon": [0.01, 0.1, 0.5, 1, 2, 4] }, cv=5, n_jobs=20) models["SVR_RBF"] = GridSearchCV( SVR(kernel='rbf'), param_grid={ "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], "gamma": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], "epsilon": [0.01, 0.1, 0.5, 1, 2, 4]
def fit(self, X, y):
    """
    Variable Selection and Prediction.

    Variable Selection Model: lasso
    Prediction Models: see self.predict()

    Parameters
    ----------
    X : numpy array or sparse matrix of shape [n_samples,n_features]
        Training data
    y : numpy array of shape [n_samples, n_targets]
        Target values

    Returns
    -------
    self : returns an instance of self.
    """
    ##################################
    ## OLS Train
    ##################################
    # Kept for reference: plain OLS residual sum of squares on the
    # training data (currently disabled).
    #ols_train = linear_model.LinearRegression(fit_intercept=True,
    #                                          normalize=False,
    #                                          copy_X=True)
    #ols_train.fit(X, y)
    #self.rss_ols_train = np.sum((ols_train.predict(X) - y) ** 2)
    """
    fit_intercept=True, center the data
    copy=True, because centering data invovles X -= X_mean
    CAUTION: normalization=False, otherwise involves taking squares of X, lose precision
    self.rss_ols_train.shape = (1,1)
    """

    ##################################
    ## Pre Variable Selection Predictions
    ##################################
    # Hard-coded off: the pre-selection PLS/ridge pass is skipped entirely.
    self.pre_pred = False
    if self.pre_pred:
        print "Computing ... "
        # Very large ridge penalties (1e9..2e9) probed before selection.
        param_ridge_pre = list(np.arange(1e9,2e9,1e8))
        self.pls_pre, self.ridge_pre = \
            self.run_models(X, y, param_ridge_pre)

    ##################################
    ## Lasso Variable Selection
    ##################################
    # Cross-validated LassoLars chooses the regularization strength alpha_.
    # max_iter/max_n_alphas scale with the feature count so wide inputs
    # still converge; eps is machine epsilon for float64.
    self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto',
                                max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000,
                                eps= 2.2204460492503131e-16,copy_X=True,
                                cv=self.cv, n_jobs=self.n_jobs)
    self.lasso_cv.fit(X, y)
    """
    normalize=True, lasso seems to be able to handle itself
    """

    if self.rlasso_selection_threshold == 0:
        # Plain refit at the CV-selected alpha; the nonzero coefficients
        # define the active (selected) feature set.
        self.lasso_refit = linear_model.LassoLars(alpha=self.lasso_cv.alpha_,
                                                  fit_intercept=True, normalize=True, precompute='auto',
                                                  max_iter=X.shape[1]+1000,
                                                  eps=2.2204460492503131e-16, copy_X=True,
                                                  fit_path=False)
        self.lasso_refit.fit(X, y)
        # fit_path=False yields a 2-D coef_ array; take its single row.
        self.active = self.lasso_refit.coef_ != 0
        self.active = self.active[0,:]
        X_selected = X[:, self.active]
    else:
        # Stability selection: resampled randomized lasso at the CV alpha,
        # keeping features selected more often than the threshold.
        self.rlasso = RandomizedLasso(alpha=self.lasso_cv.alpha_,
                                      scaling=0.5, sample_fraction=0.75,
                                      n_resampling=200,
                                      selection_threshold=self.rlasso_selection_threshold,
                                      fit_intercept=True, verbose=False,
                                      normalize=True, precompute='auto',
                                      max_iter=500,
                                      eps=2.2204460492503131e-16, random_state=None,
                                      n_jobs=self.n_jobs, pre_dispatch='3*n_jobs',)
        self.rlasso.fit(X, y)
        X_selected = self.rlasso.transform(X)

    ##################################
    ## Post Variable Selection Predictions
    ##################################
    # Final PLS/ridge models are trained on the selected features only.
    self.pls_post, self.ridge_post = \
        self.run_models(X_selected, y, self.param_ridge_post)

    return self
#!/usr/bin/env python
"""Fit a cross-validated LassoLars model on the Boston housing data
and report its R^2 score on a 10% held-out tail of the shuffled rows."""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

# Load the dataset and shuffle it deterministically.
boston = datasets.load_boston()
features, targets = shuffle(boston.data, boston.target, random_state=13)
features = features.astype(np.float32)

# Hold out the final 10% of rows for evaluation.
split_at = int(features.shape[0] * 0.9)
X_train, X_test = features[:split_at], features[split_at:]
Y_train, Y_test = targets[:split_at], targets[split_at:]

regressor = LassoLarsCV(cv=15)
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
prediction = grid_search.predict(validation_features) dio.save_prediction(model_name, prediction, type_v) parameters = { "alpha": [0.1, 1, 10], } make_grid_search(Ridge(tol=1e-2, solver="lsqr"), parameters, "Ridge_tfidf_05d", param) #make_grid_search(Lasso(), parameters, "Lasso_tfidf_05d", param) #make_grid_search(LassoLars(), parameters, "LassoLars_tfidf_05d", param) a = 5 / 0 benchmark(LassoCV(max_iter=100, verbose=1)) benchmark(LassoLarsCV(n_jobs=-1, max_iter=100, max_n_alphas=50, verbose=1)) n_trees = 20 min_samples_split = 2 name = "ExtraTrees_min_sample%d_%dtrees_tfidf-05d_BoW-titleFullRaw-AllColumns_new_log" % ( min_samples_split, n_trees) classifier = ExtraTreesRegressor( n_estimators=n_trees, #classifier = RandomForestRegressor(n_estimators=n_trees, verbose=2, n_jobs=4, # 2 jobs on submission / 4 on valid test oob_score=True, min_samples_split=min_samples_split, random_state=3465343) classifier.fit(features, salaries) #classifier = dio.load_model(name)
# Evaluate the plain Ridge model: R^2 on train/validation sets plus MSE.
model_ridge.fit(train_X, train_y)
print('训练集预测的确定系数R ^ 2: ', model_ridge.score(train_X, train_y))
print('验证集预测的确定系数R ^ 2: ', model_ridge.score(test_X, test_y))
pred_1 = model_ridge.predict(test_X)
print('模型误差: ', mean_squared_error(test_y, pred_1))

# RidgeCV accepts several candidate alpha values and uses
# cross-validation to pick the best one.
model = RidgeCV(alphas=[0.001, 0.01, 0.1, 1.0])
model.fit(train_X, train_y)
print("模型参数:", model.get_params())
print("模型详情:", model)
print('最佳alpha', model.alpha_)  # Ridge() has no alpha_ attribute; only RidgeCV does
print('训练集预测的确定系数R ^ 2: ', model.score(train_X, train_y))
print('验证集预测的确定系数R ^ 2: ', model.score(test_X, test_y))
pred_2 = model.predict(test_X)
print('Ridge模型误差: ', mean_squared_error(test_y, pred_2))

# Lasso regression.  The three assignments below overwrite each other:
# only the last one (LassoLarsCV, which picks alpha automatically via
# least-angle-regression cross-validation) is actually fitted.
model_lasso = Lasso(alpha=0.01)
model_lasso = LassoCV()
model_lasso = LassoLarsCV()
model_lasso.fit(train_X, train_y)
print("模型参数:", model_lasso.get_params())
print("模型详情:", model_lasso)
#print('最佳alpha',model_lasso.alpha_)
print('训练集预测的确定系数R ^ 2: ', model_lasso.score(train_X, train_y))
print('验证集预测的确定系数R ^ 2: ', model_lasso.score(test_X, test_y))
pred_3 = model_lasso.predict(test_X)
print('Lasso模型误差: ', mean_squared_error(test_y, pred_3))
predictors['ESTEEM1']=preprocessing.scale(predictors['ESTEEM1'].astype('float64')) predictors['VIOL1']=preprocessing.scale(predictors['VIOL1'].astype('float64')) predictors['PASSIST']=preprocessing.scale(predictors['PASSIST'].astype('float64')) predictors['DEVIANT1']=preprocessing.scale(predictors['DEVIANT1'].astype('float64')) predictors['GPA1']=preprocessing.scale(predictors['GPA1'].astype('float64')) predictors['EXPEL1']=preprocessing.scale(predictors['EXPEL1'].astype('float64')) predictors['FAMCONCT']=preprocessing.scale(predictors['FAMCONCT'].astype('float64')) predictors['PARACTV']=preprocessing.scale(predictors['PARACTV'].astype('float64')) predictors['PARPRES']=preprocessing.scale(predictors['PARPRES'].astype('float64')) # split data into train and test sets pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, target, test_size=.3, random_state=123) # specify the lasso regression model model=LassoLarsCV(cv=10, precompute=False).fit(pred_train,tar_train) # print variable names and regression coefficients coef = dict(zip(predictors.columns, model.coef_)) #sort by value import operator sorted(coef.items(), key=operator.itemgetter(1), reverse=True) #most significants + are ESTEEM1, GPA1, FAMCONCT #most significants - are DEP1, BLACK, VIOL1 # plot coefficient progression # show the order of selected cofficient and its value when new predictors are added m_log_alphas = -np.log10(model.alphas_) #alpha = penalty parameter = lambda through the model selection process ax = plt.gca() plt.plot(m_log_alphas, model.coef_path_.T) #.T = transpose
models_out = pd.DataFrame(columns=columns) # Fit models for each season separately for l, ssn in enumerate(data['seasons']): print('Training years:', data['trn_yrs']) n_estimators = data['n_smpls'] # Indexes for selecting data from the training period trn_idx = fcts.bool_index_to_int_index( np.isin(data['Y']['time.season'], ssn) & np.isin(data['Y']['time.year'], data['trn_yrs'])) # Fit LassoLarsCV models using the handy BaggingRegressor meta-estimator cv = KFold(n_splits=5, shuffle=True) base_estimator = LassoLarsCV(eps=2e-10, max_iter=200, cv=cv, n_jobs=1) ensemble = fcts.bagging_metaestimator( data['X'].values[trn_idx], data['Y'][data['y_var']].values[trn_idx], data['vrbl_names'], data['n_smpls'], data['p_smpl'], data['p_feat'], data['n_jobs'], base_estimator) # Append the models to the output table, including also the season information for i, mdl in enumerate(ensemble.estimators_[:n_estimators]): feature_idxs = ensemble.estimators_features_[i] posit_features = np.abs(mdl.coef_) > 0 feature_names = list(data['vrbl_names'][feature_idxs][posit_features]) n_features = len(feature_names) fcs = mdl.predict(data['X'].values[trn_idx][:, feature_idxs]) obs = data['Y'][data['y_var']].values[trn_idx] train_period_acc = fcts.calc_corr(fcs, obs) df = pd.DataFrame([[
mu = np.repeat(0, 100) dists = np.arange(100) powers = [[np.abs(i-j) for j in dists] for i in dists] r = np.power(.5, powers) X = np.random.multivariate_normal(mu, r, size=50) y = 7*X[:, 0] + \ 5*X[:, 10] + \ 3*X[:, 20] + \ 1*X[:, 30] + \ .5*X[:, 40] + \ .2*X[:, 50] + \ np.random.normal(0, 2, 50) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20) lasso = LassoLarsCV(cv=5).fit(X_train, y_train) alpha = lasso.alpha_ # For testing when X input has a single feature Xa, ya = make_regression(n_samples=50, n_features=1, random_state=0, coef=False) # For testing when y output vector is multidimensionnal Xb, yb = make_regression(n_samples=50, n_features=10, n_informative=3, n_targets=2, noise=2, random_state=0,
value: float Returns a float in the range (0., 1.) """ try: value = float(value) except: raise argparse.ArgumentTypeError( 'Invalid float value: \'{}\''.format(value)) if value < 0.0 or value > 1.0: raise argparse.ArgumentTypeError( 'Invalid float value: \'{}\''.format(value)) return value # dictionary of ml options ml_dict = { 'lasso': LassoLarsCV(), 'svr': SVR(), 'lsvr': LinearSVR(), 'lr': LogisticRegression(solver='sag'), 'sgd': SGDClassifier(loss='log',penalty='l1'), 'svc': SVC(), 'lsvc': LinearSVC(), 'rfc': RandomForestClassifier(), 'rfr': RandomForestRegressor(), 'dtc': DecisionTreeClassifier(), 'dtr': DecisionTreeRegressor(), 'dc': DistanceClassifier(), 'knc': KNeighborsClassifier(), 'knr': KNeighborsRegressor(), None: None }
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount

# NOTE: Make sure that the class is labeled 'class' in the data file
# Load the CSV as a structured record array, one named field per column.
tpot_data = np.recfromcsv('../../input/train.csv', delimiter=',', dtype=np.float64)
# Flatten the record array into a plain 2-D float matrix, then drop the
# 'class' column so only predictor columns remain.
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

# TPOT-exported pipeline: a ZeroCount transformer (presumably appends
# zero/non-zero count features -- see tpot.builtins docs) feeding a
# cross-validated LassoLars model.
exported_pipeline = make_pipeline(
    ZeroCount(),
    LassoLarsCV(normalize=True)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def __init__(self, population_size=50, generations=100,
             mutation_rate=0.5, crossover_rate=0.5, ml=None,
             min_depth=1, max_depth=2, max_depth_init=2,
             sel='epsilon_lexicase', tourn_size=2, fit_choice=None,
             op_weight=False, max_stall=100, seed_with_ml=True, erc=False,
             random_state=None, verbosity=0, scoring_function=None,
             disable_update_check=False, elitism=True, boolean=False,
             classification=False, clean=False, track_diversity=False,
             mdr=False, otype='f', c=True, weight_parents=True):
    """Set up the GP-based learner: store evolutionary-search settings,
    wrap the supplied ML estimator in a standardizing pipeline (with a
    LassoLarsCV / LogisticRegression default), and choose default
    scoring and fitness metrics."""
    # sets up GP.

    # Save params to be recalled later by get_params()
    self.params = locals()  # placed before any local variable definitions
    self.params.pop('self')

    # # Do not prompt the user to update during this session if they
    # # ever disabled the update check
    # if disable_update_check:
    #     FEW.update_checked = True
    #
    # # Prompt the user if their version is out of date
    # if not disable_update_check and not FEW.update_checked:
    #     update_check('FEW', __version__)
    #     FEW.update_checked = True

    # Results of the evolutionary search; populated later (e.g. by fit).
    self._best_estimator = None
    self._training_features = None
    self._training_labels = None
    self._best_inds = None

    # Evolutionary-search settings, copied verbatim from the arguments.
    self.population_size = population_size
    self.generations = generations
    self.mutation_rate = mutation_rate
    self.crossover_rate = crossover_rate
    self.min_depth = min_depth
    self.max_depth = max_depth
    self.max_depth_init = max_depth_init
    self.sel = sel
    self.tourn_size = tourn_size
    self.fit_choice = fit_choice
    self.op_weight = op_weight
    self.max_stall = max_stall
    self.weight_parents = weight_parents
    self.seed_with_ml = seed_with_ml
    self.erc = erc
    self.random_state = check_random_state(random_state)
    self.verbosity = verbosity
    self.scoring_function = scoring_function
    self.gp_generation = 0
    self.elitism = elitism
    self.max_fit = 99999999.666  # sentinel "worst possible" fitness value
    self.boolean = boolean
    self.classification = classification
    self.clean = clean
    # Wrap the user-supplied learner in a standardizing pipeline.
    self.ml = Pipeline([('standardScaler', StandardScaler()), ('ml', ml)])
    self.ml_type = type(self.ml.named_steps['ml']).__name__
    self.track_diversity = track_diversity
    self.mdr = mdr
    self.otype = otype

    # if otype is b, boolean functions must be turned on
    if self.otype == 'b':
        self.boolean = True

    # instantiate sklearn estimator according to specified machine learner
    if self.ml.named_steps['ml'] is None:
        if self.classification:
            self.ml = Pipeline([('standardScaler', StandardScaler()),
                                ('ml', LogisticRegression(solver='sag'))])
        else:
            self.ml = Pipeline([('standardScaler', StandardScaler()),
                                ('ml', LassoLarsCV())])
    if not self.scoring_function:
        if self.classification:
            self.scoring_function = accuracy_score
        else:
            self.scoring_function = r2_score

    # set default fitness metrics for various learners; anything not
    # listed falls back to 'r2' via the defaultdict factory
    if not self.fit_choice:
        tmp_dict = defaultdict(lambda: 'r2', {
            # regression
            type(LassoLarsCV()): 'mse',
            type(SVR()): 'mae',
            type(LinearSVR()): 'mae',
            type(KNeighborsRegressor()): 'mse',
            type(DecisionTreeRegressor()): 'mse',
            type(RandomForestRegressor()): 'mse',
            # classification
            type(DistanceClassifier()): 'silhouette',
        })
        self.fit_choice = tmp_dict[type(self.ml.named_steps['ml'])]

    # Columns to always ignore when in an operator
    self.non_feature_columns = ['label', 'group', 'guess']

    # function set
    self.func_set = [node('+'), node('-'), node('*'), node('/'),
                     node('sin'), node('cos'), node('exp'), node('log'),
                     node('^2'), node('^3'), node('sqrt')]

    # terminal set
    self.term_set = []
    # diversity
    self.diversity = []
    # use cython
    self.c = c
def __init__(self, hypers):
    """Build the underlying LassoLarsCV model from a hyperparameter dict.

    Only the keys that extract_model_hypers() deems model-relevant are
    forwarded to the estimator constructor.
    """
    relevant_hypers = self.extract_model_hypers(hypers)
    self.model = LassoLarsCV(**relevant_hypers)
def fit(self, X, y):
    """Fit a 5-fold cross-validated LassoLars model on (X, y).

    Stores the fitted estimator on self.estimator and returns self so
    calls can be chained.
    """
    from sklearn.linear_model import LassoLarsCV

    estimator = LassoLarsCV(cv=5)
    self.estimator = estimator
    estimator.fit(X, y)
    return self
dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Score on the training set was:-17.53982123860686 exported_pipeline = make_pipeline( make_union( StackingEstimator(estimator=make_pipeline( StackingEstimator( estimator=ElasticNetCV(l1_ratio=0.55, tol=0.001)), StackingEstimator(estimator=GradientBoostingRegressor( alpha=0.8, learning_rate=1.0, loss="lad", max_depth=4, max_features=0.7000000000000001, min_samples_leaf=11, min_samples_split=20, n_estimators=100, subsample=0.1)), LassoLarsCV(normalize=True))), FunctionTransformer(copy)), LinearSVR(C=5.0, dual=True, epsilon=1.0, loss="epsilon_insensitive", tol=0.1)) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
def QuickML_Ensembling(X_train, y_train, X_test, y_test='', modeltype='Regression',
                       Boosting_Flag=False, scoring='', verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set(only numerics).

    Four complementary models (model5..model8) are picked per modeltype
    depending on Boosting_Flag (None / False / True), then handed to
    run_ensemble_models.  Returns that helper's (models, results) pair.
    """
    start_time = time.time()
    seed = 99
    FOLDS = 5
    model_dict = {}
    model_tuples = []
    # For big wide datasets, subsample 30% of rows (when possible) and
    # allow more estimators in the ensemble members.
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
    else:
        try:
            X_train = X_train.sample(frac=0.30, random_state=99)
            y_train = y_train[X_train.index]
        except:
            # Non-pandas inputs have no .sample(); keep the full data.
            pass
        NUMS = 200
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                random_state=seed, max_depth=1, min_samples_leaf=2
            ), n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost', model5))
        elif not Boosting_Flag:
            model5 = LassoLarsCV(cv=scv)
            model_tuples.append(('LassoLarsCV', model5))
        else:
            model5 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Bagging', model5))
        if Boosting_Flag is None:
            model6 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        elif not Boosting_Flag:
            model6 = LinearSVR()
            model_tuples.append(('Linear_SVR', model6))
        else:
            model6 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        # NOTE(review): loss='squared_loss' was renamed 'squared_error' in
        # newer sklearn -- confirm against the pinned sklearn version.
        sgd_best_model = SGDRegressor(alpha=1e-06, loss='squared_loss', max_iter=1000,
                                      penalty='l2', learning_rate='constant',
                                      eta0=.1, random_state=3, tol=None)
        model7 = BaggingRegressor(sgd_best_model)
        model_tuples.append(('SGD_Regressor', model7))
        if Boosting_Flag is None:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ### So choose a different kind of classifier here
            model8 = RandomForestRegressor(bootstrap=False, max_depth=10,
                                           max_features='auto', min_samples_leaf=2,
                                           n_estimators=200, random_state=99)
            model_tuples.append(('Bagging_Regressor', model8))
        elif not Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ### So choose a different kind of classifier here
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                random_state=seed, max_depth=1, min_samples_leaf=2
            ), n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost', model8))
        else:
            model8 = RandomForestRegressor(bootstrap=False, max_depth=10,
                                           max_features='auto', min_samples_leaf=2,
                                           n_estimators=200, random_state=99)
            model_tuples.append(('Bagging_Regressor', model8))
    else:
        # Classification branch mirrors the regression one with
        # classifier counterparts of each model slot.
        if scoring == '':
            scoring = 'accuracy'
        scv = StratifiedKFold(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                random_state=seed, max_depth=1, min_samples_leaf=2
            ), n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost', model5))
        elif not Boosting_Flag:
            model5 = LinearDiscriminantAnalysis()
            model_tuples.append(('Linear_Discriminant', model5))
        else:
            model5 = LogisticRegression(C=0.01, solver='liblinear', random_state=seed)
            model_tuples.append(('Logistic_Regression_Model', model5))
        if Boosting_Flag is None:
            model6 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        elif not Boosting_Flag:
            model6 = LinearSVC()
            model_tuples.append(('Linear_SVC', model6))
        else:
            model6 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        # MultinomialNB requires non-negative features; GaussianNB is the
        # binary-classification choice here.
        if modeltype == 'Binary_Classification':
            model7 = GaussianNB()
        else:
            model7 = MultinomialNB()
        model_tuples.append(('Naive_Bayes', model7))
        if Boosting_Flag is None:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ### So choose a different kind of classifier here
            model8 = RandomForestClassifier(bootstrap=False, max_depth=10,
                                            max_features='auto', min_samples_leaf=2,
                                            n_estimators=200, random_state=99)
            model_tuples.append(('Bagging_Classifier', model8))
        elif not Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ### So choose a different kind of classifier here
            sgd_best_model = SGDClassifier(alpha=1e-06, loss='log', max_iter=1000,
                                           penalty='l2', learning_rate='constant',
                                           eta0=.1, random_state=3, tol=None)
            model8 = OneVsRestClassifier(sgd_best_model)
            model_tuples.append(('One_vs_Rest_Classifier', model8))
        else:
            model8 = RandomForestClassifier(bootstrap=False, max_depth=10,
                                            max_features='auto', min_samples_leaf=2,
                                            n_estimators=200, random_state=99)
            model_tuples.append(('Bagging_Classifier', model8))
    model_dict = dict(model_tuples)
    models, results = run_ensemble_models(model_dict, X_train, y_train,
                                          X_test, y_test, scoring, modeltype)
    return models, results
RobustScaler(), MinMaxScaler(), StackingEstimator(estimator=LinearSVR(C=25.0, dual=True, epsilon=0.01, loss="epsilon_insensitive", tol=0.0001)), StackingEstimator(estimator=DecisionTreeRegressor( max_depth=8, min_samples_leaf=17, min_samples_split=9)), FeatureAgglomeration(affinity="l2", linkage="average"), RBFSampler(gamma=0.75), StackingEstimator(estimator=LinearSVR(C=1.0, dual=True, epsilon=1.0, loss="squared_epsilon_insensitive", tol=0.1)), StackingEstimator( estimator=KNeighborsRegressor(n_neighbors=9, p=1, weights="uniform")), StackingEstimator(estimator=LassoLarsCV(normalize=True)), SelectPercentile(score_func=f_regression, percentile=26), StandardScaler(), PCA(iterated_power=7, svd_solver="randomized"), StackingEstimator(estimator=LinearSVR(C=10.0, dual=True, epsilon=0.01, loss="squared_epsilon_insensitive", tol=1e-05)), ZeroCount(), SelectFwe(score_func=f_regression, alpha=0.039), PCA(iterated_power=5, svd_solver="randomized"), RidgeCV()) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
#分别计算其均方根误差和拟合优度 y_train_rmse = sqrt(metrics.mean_squared_error(y_train, y_train_pred)) y_train_score = rd.score(x_train, y_train) y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred)) y_test_score = rd.score(x_test, y_test) print('训练集RMSE: {0}, R方: {1}'.format(y_train_rmse, y_train_score)) print('测试集RMSE: {0}, R方: {1}'.format(y_test_rmse, y_test_score)) '''========9.Lasso回归========''' import numpy as np import matplotlib.pyplot as plt # 可视化绘制 from sklearn.linear_model import Lasso, LassoCV, LassoLarsCV # Lasso回归,LassoCV交叉验证实现alpha的选取,LassoLarsCV基于最小角回归交叉验证实现alpha的选取 #model = Lasso(alpha=0.01) # 调节alpha可以实现对拟合的程度 # model = LassoCV() # LassoCV自动调节alpha可以实现选择最佳的alpha。 model = LassoLarsCV() # LassoLarsCV自动调节alpha可以实现选择最佳的alpha model.fit(x_train, y_train) # 线性回归建模 print('系数矩阵:\n', model.coef_, model.intercept_) print('线性回归模型:\n', model) print('最佳的alpha:', model.alpha_) # 只有在使用LassoCV、LassoLarsCV时才有效 # 使用模型预测 #分别预测训练数据和测试数据 y_train_pred = model.predict(x_train) y_test_pred = model.predict(x_test) #分别计算其均方根误差和拟合优度 y_train_rmse = sqrt(metrics.mean_squared_error(y_train, y_train_pred)) y_train_score = model.score(x_train, y_train) y_test_rmse = sqrt(metrics.mean_squared_error(y_test, y_test_pred))
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-1.758311648032997e-26
# TPOT-exported pipeline: min-max scale, then standardize (the duplicated
# StandardScaler is TPOT's raw output; the second pass is effectively a
# no-op on already-standardized data), then a cross-validated LassoLars model.
exported_pipeline = make_pipeline(
    MinMaxScaler(),
    StandardScaler(),
    StandardScaler(),
    LassoLarsCV(normalize=True)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
X_trainset_001.append(X_trainset[i]) y_trainset_001.append(1.0) num_2 += 1 print num_1, num_2 classify_model_001 = RandomForestClassifier(n_estimators=55, random_state=1) classify_model_001.fit(X_trainset_001, y_trainset_001) ### 构建0.003的回归模型 from sklearn.linear_model import LassoLarsCV, BayesianRidge X_trainset_0003 = [] y_trainset_0003 = [] for i in range(0, y_trainset.__len__(), 1): if y_trainset[i] < 0.003: X_trainset_0003.append(X_trainset[i]) y_trainset_0003.append(y_trainset[i]) reg_0003 = LassoLarsCV(max_n_alphas=100, positive=True) reg_0003.fit(X_trainset_0003, y_trainset_0003) ### 构建0.003-0.01的回归模型 from sklearn.linear_model import LassoLarsCV X_trainset_001 = [] y_trainset_001 = [] for i in range(0, y_trainset.__len__(), 1): if y_trainset[i] >= 0.003 and y_trainset[i] < 0.015: X_trainset_001.append(X_trainset[i]) y_trainset_001.append(y_trainset[i]) reg_001 = LassoLarsCV(max_n_alphas=100, cv=10) reg_001.fit(X_trainset_001, y_trainset_001) ### 构建大于0.01的回归模型 from sklearn.linear_model import BayesianRidge, RANSACRegressor, RidgeCV, Ridge, LassoLarsCV
#print(LogReg.coef_) #print(icu.head()) ############# LASSO ############## predvar = icu.copy() target = predvar.STA predictors = predvar[[ 'AGE', 'SYS', 'HRA', 'RACE_1', 'RACE_2', 'RACE_3', 'CPR_1', 'TYP_1' ]].copy() for i in list(predictors.columns.values): predictors[i] = preprocessing.scale(predictors[i].astype('float64')) pred_train, pred_test, resp_train, resp_test = train_test_split( predictors, target, test_size=.3, random_state=123) model = LassoLarsCV(cv=10, precompute=True).fit(pred_train, resp_train) dict(zip(predictors.columns, model.coef_)) m_log_alphascv = -np.log10(model.cv_alphas_) plt.figure() plt.plot(m_log_alphascv, model.mse_path_, ':') plt.plot(m_log_alphascv, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV') plt.legend()
model.fit(X, y) # 线性回归建模 print('系数:\n', model.coef_) print('线性回归模型详情:\n', model) print('交叉验证最佳alpha值', model.alpha_) # Ridge()无这个方法,只有RidgeCV算法有 pred_2 = model.predict(X) # 绘制散点图 plt.scatter(X, y, marker='x') plt.plot(X, pred_1, c='r') plt.xlabel("x") plt.ylabel("y") plt.show() # Lasso回归 model = Lasso(alpha=0.01) # 调节alpha可以实现对拟合的程度 model = LassoCV() # LassoCV自动调节alpha可以实现选择最佳的alpha。 model = LassoLarsCV() # LassoLarsCV自动调节alpha可以实现选择最佳的alpha model.fit(X, y) # 线性回归建模 print('系数:\n', model.coef_) print('线性回归模型详情:\n', model) # print('最佳的alpha:',model.alpha_) # 只有在使用LassoCV、LassoLarsCV时才有效 pred = model.predict(X) # 绘制散点图 plt.scatter(X, y, marker='x') plt.plot(X, pred, c='r') plt.xlabel("x") plt.ylabel("y") plt.show()
hg = pl.plot(alpha_grid[1:]**.333, scores_path[coef != 0].T[1:], 'r') hb = pl.plot(alpha_grid[1:]**.333, scores_path[coef == 0].T[1:], 'k') ymin, ymax = pl.ylim() pl.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$') pl.ylabel('Stability score: proportion of times selected') pl.title('Stability Scores Path - Mutual incoherence: %.1f' % mi) pl.axis('tight') pl.legend((hg[0], hb[0]), ('relevant features', 'irrelevant features'), loc='best') ########################################################################### # Plot the estimated stability scores for a given alpha # Use 6-fold cross-validation rather than the default 3-fold: it leads to # a better choice of alpha: lars_cv = LassoLarsCV(cv=6).fit(X, y) # Run the RandomizedLasso: we use a paths going down to .1*alpha_max # to avoid exploring the regime in which very noisy variables enter # the model alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6) clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y) trees = ExtraTreesRegressor(100, compute_importances=True).fit(X, y) # Compare with F-score F, _ = f_regression(X, y) pl.figure() for name, score in [ ('F-test', F), ('Stability selection', clf.scores_), ('Lasso coefs', np.abs(lars_cv.coef_)),
parser = argparse.ArgumentParser() parser.add_argument("--lat", help="Training Latitude", type=float) parser.add_argument("--lon", help="Training Longitude", type=float) args = parser.parse_args() train_data = load_data.load_supervised(1950, 1985, args.lat, args.lon, 50, which='train') test_data = load_data.load_supervised(1986, 1999, args.lat, args.lon, 50, which='test') lasso_file = os.path.join(os.path.dirname(__file__), "models/lasso_%2.2f_%2.2f.pkl" % (args.lat, args.lon)) if os.path.exists(lasso_file): print "Reading PCA from file" L = pickle.load(open(lasso_file, 'r')) else: print "Fitting Lasso" L = LassoLarsCV(cv=5) L.fit(train_data.X, train_data.y[:,0]) pickle.dump(L, open(lasso_file, 'w')) ## Print Fit stats print "Alpha", L.alpha_ print "Training Pearson Corr:", pearsonr(train_data.y[:,0], L.predict(train_data.X)) print "Training Spearman Corr:", spearmanr(train_data.y[:,0], L.predict(train_data.X)) yhat = L.predict(test_data.X) print "Pearson Corr", pearsonr(test_data.y[:,0], yhat) print "Spearman Corr", spearmanr(test_data.y[:,0], yhat) print "SSE", sum((yhat - test_data.y[:,0])**2)
# Fragment: fit LassoLarsCV on the whole "aging" cohort (columns `col`,
# target `interest`) and plot the coefficient path. `superagers`, `mci`,
# `train_set`, `test_set` splits are prepared here but used outside this view.
X_aging, y_aging = aging[col], aging[interest]
X_sa, y_sa = superagers[col], superagers[interest]
X_mci, y_mci = mci[col], mci[interest]
X_train, y_train = train_set[col], train_set[interest]
X_test, y_test = test_set[col], test_set[interest]

score = 'mean_squared_error'
# NOTE(review): this grid is unused in the visible code, and negative alphas
# are not valid Lasso penalties — confirm whether it is dead code.
tuned_params_lasso = [{'alpha': np.linspace(-1, 1, 100), 'normalize': [True, False]}]

### ACROSS WHOLE DATASET
### With StratifiedKFold, we're stratifying according to the interest variable.
### This ensures that there will be an even proportion of RAVLT_DEL (or whatever
### the interest variable is) values across all folds.
# NOTE(review): `cross_validation` is the pre-0.18 sklearn module; stratifying
# a continuous regression target only works if `interest` takes discrete values.
skf = cross_validation.StratifiedKFold(y_aging, n_folds=6)
model = LassoLarsCV(max_iter=100000, cv=skf).fit(X_aging, y_aging)
# print("Best estimator for WHOLE DATASET: \n{0}\n".format(model.best_estimator_))
# R^2 on the data the model was fit on (in-sample, optimistic).
print("Percent variance explained: {0}".format(model.score(X_aging, y_aging)))
print("Coefficients found: \n{0}\n".format(prettyprint(model.coef_, col, sort=True)))

# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
# Dashed vertical line marks the alpha chosen by cross-validation.
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
    # NOTE(review): tail of a function whose `def` line is outside this view.
    store_csv(mpg, name)

# Batch-build every regressor variant for the "Auto" dataset; build_auto
# fits the estimator and persists it under the given name. random_state=13
# throughout keeps the generated models reproducible.
if "Auto" in datasets:
    build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto")
    build_auto(ARDRegression(normalize = True), "BayesianARDAuto")
    build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto")
    build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = False)
    build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto")
    build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
    build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto")
    build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto")
    build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto")
    build_auto(HuberRegressor(), "HuberAuto")
    build_auto(LarsCV(), "LarsAuto")
    build_auto(LassoCV(random_state = 13), "LassoAuto")
    build_auto(LassoLarsCV(), "LassoLarsAuto")
    build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", num_iteration = 11)
    build_auto(LinearRegression(), "LinearRegressionAuto")
    build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto")
    build_auto(OrthogonalMatchingPursuitCV(), "OMPAuto")
    build_auto(RandomForestRegressor(random_state = 13, min_samples_leaf = 3), "RandomForestAuto", flat = True)
    build_auto(RidgeCV(), "RidgeAuto")
    build_auto(TheilSenRegressor(n_subsamples = 15, random_state = 13), "TheilSenAuto")
    build_auto(OptimalXGBRegressor(objective = "reg:linear", ntree_limit = 31), "XGBAuto", ntree_limit = 31)

# Target-transformed variants: the second one regresses on log(y) and maps
# predictions back with exp.
if "Auto" in datasets:
    build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto")
    build_auto(TransformedTargetRegressor(LinearRegression(), func = numpy.log, inverse_func = numpy.exp), "TransformedLinearRegressionAuto")

# NOTE(review): truncated in this view — the ColumnTransformer argument list
# and the rest of build_auto_h2o continue outside this chunk.
def build_auto_h2o(regressor, name):
    transformer = ColumnTransformer(
class LinearAll:
    """
    A repertoire of Linear Variable Selection and Prediction Models.

    Pipeline (see fit()): cross-validated LassoLars picks the penalty, then
    either a LassoLars refit or a RandomizedLasso selects variables, and
    finally PLS and Ridge models are grid-searched on the selected columns.

    Parameters
    ----------
    n_jobs : int, optional
        Number of jobs to run in parallel (default 1). If -1 all CPUs are used.
        This will only provide speedup for n_targets > 1 and sufficiently
        large problems.
    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an explosion
        of memory consumption when more jobs get dispatched than CPUs can
        process. This parameter can be:
        None, in which case all the jobs are immediately created and spawned.
        Use this for lightweight and fast-running jobs, to avoid delays due
        to on-demand spawning of the jobs.
        An int, giving the exact number of total jobs that are spawned.
        A string, giving an expression as a function of n_jobs, as in
        '2*n_jobs'.
    refit : boolean
        Refit the best estimator with the entire dataset. If "False", it is
        impossible to make predictions using this GridSearchCV instance
        after fitting.
    iid : boolean, optional
        If True, the data is assumed to be identically distributed across
        the folds, and the score is computed from all samples individually,
        and not the mean loss across the folds. (If the number of data
        points is the same across folds, either returns the same thing.)

    Attributes
    ----------
    ols_train,
    predictions models before variable selection
    predictions models after variable selection
    """

    # NOTE(review): `param_ridge_post=list(np.arange(1,3,0.1))` is a mutable
    # default argument evaluated once at class-definition time and shared by
    # every instance that does not pass its own list — confirm intended.
    def __init__ (self, cv=20, scoring = 'mean_squared_error',
                  n_jobs=1, refit=False, iid=False, pre_pred=True,
                  param_ridge_post=list(np.arange(1,3,0.1)),
                  rlasso_selection_threshold = 0.5):
        #self.__name__ = '__main__'
        """
        CAUTION: we changed to __main__ so that parallelization works
        """
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.iid = iid
        self.pre_pred =pre_pred
        self.param_ridge_post = param_ridge_post
        self.rlasso_selection_threshold = rlasso_selection_threshold

    def run_models(self, X, y, param_ridge):
        """
        Prediction Models.
        OLS, PLS, Ridge

        Grid-searches PLS over n_components in 1..4 and Ridge over the
        supplied alpha grid; returns the two fitted GridSearchCV objects
        as (pls_cv, ridge_cv).
        """
        ##################################
        ## OLS CV
        ##################################
        # OLS branch kept for reference; it was disabled in favor of the
        # regularized models below.
        #ols = linear_model.LinearRegression(fit_intercept=True,
        #                                    normalize=False,
        #                                    copy_X=True)
        #ols_cv_score = cross_validation.cross_val_score(
        #    ols, X, y,
        #    cv=self.cv, scoring=self.scoring,
        #    n_jobs=self.n_jobs)
        """
        self.ols_cv_score.shape = (cv,)
        """

        ##################################
        ## PLS CV
        ##################################
        tuned_parameters = [{'n_components': range(1, 5)}]
        pls = PLSRegression()
        pls_cv = GridSearchCV(pls, tuned_parameters,
                              cv=self.cv, scoring=self.scoring,
                              n_jobs=self.n_jobs,
                              refit=self.refit, iid=self.iid)
        pls_cv.fit(X, y)

        ##################################
        ## Ridge CV
        ##################################
        # alpha=1 is only a placeholder; GridSearchCV overrides it from
        # `param_ridge`.
        tuned_parameters = [{'alpha': param_ridge}]
        ridge = linear_model.Ridge(alpha = 1)
        ridge_cv = GridSearchCV(ridge, tuned_parameters,
                                cv=self.cv, scoring=self.scoring,
                                n_jobs=self.n_jobs,
                                refit=self.refit, iid=self.iid)
        ridge_cv.fit(X, y)

        return (pls_cv, ridge_cv)

    def fit(self, X, y):
        """
        Variable Selection and Prediction.

        Variable Selection Model: lasso
        Prediction Models: see self.predict()

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        y : numpy array of shape [n_samples, n_targets]
            Target values

        Returns
        -------
        self : returns an instance of self.
        """
        ##################################
        ## OLS Train
        ##################################
        #ols_train = linear_model.LinearRegression(fit_intercept=True,
        #                                          normalize=False,
        #                                          copy_X=True)
        #ols_train.fit(X, y)
        #self.rss_ols_train = np.sum((ols_train.predict(X) - y) ** 2)
        """
        fit_intercept=True, center the data
        copy=True, because centering data invovles X -= X_mean

        CAUTION: normalization=False, otherwise involves taking squares of X,
        lose precision

        self.rss_ols_train.shape = (1,1)
        """

        ##################################
        ## Pre Variable Selection Predictions
        ##################################
        # NOTE(review): hard-coded override — the constructor's `pre_pred`
        # argument is always discarded, so the branch below never runs.
        self.pre_pred = False
        if self.pre_pred:
            print "Computing ... "
            param_ridge_pre = list(np.arange(1e9,2e9,1e8))
            self.pls_pre, self.ridge_pre = \
                self.run_models(X, y, param_ridge_pre)

        ##################################
        ## Lasso Variable Selection
        ##################################
        # eps is machine epsilon; max_iter/max_n_alphas scale with the
        # feature count so wide problems still converge.
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True,
                                    precompute='auto',
                                    max_iter=X.shape[1]+1000,
                                    max_n_alphas=X.shape[1]+1000,
                                    eps= 2.2204460492503131e-16,copy_X=True,
                                    cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)
        """
        normalize=True, lasso seems to be able to handle itself
        """

        if self.rlasso_selection_threshold == 0:
            # Plain refit at the CV-chosen alpha; keep the columns whose
            # coefficient is nonzero.
            self.lasso_refit = linear_model.LassoLars(alpha=self.lasso_cv.alpha_,
                                                      fit_intercept=True,
                                                      normalize=True,
                                                      precompute='auto',
                                                      max_iter=X.shape[1]+1000,
                                                      eps=2.2204460492503131e-16,
                                                      copy_X=True,
                                                      fit_path=False)
            self.lasso_refit.fit(X, y)
            # fit_path=False gives coef_ shape (1, n_features); flatten the mask.
            self.active = self.lasso_refit.coef_ != 0
            self.active = self.active[0,:]
            X_selected = X[:, self.active]
        else:
            # Stability selection: resample, refit, and keep features selected
            # in at least `rlasso_selection_threshold` of the resamplings.
            # NOTE(review): RandomizedLasso was removed from modern sklearn.
            self.rlasso = RandomizedLasso(alpha=self.lasso_cv.alpha_,
                                          scaling=0.5, sample_fraction=0.75,
                                          n_resampling=200,
                                          selection_threshold=self.rlasso_selection_threshold,
                                          fit_intercept=True, verbose=False,
                                          normalize=True, precompute='auto',
                                          max_iter=500,
                                          eps=2.2204460492503131e-16,
                                          random_state=None,
                                          n_jobs=self.n_jobs,
                                          pre_dispatch='3*n_jobs',)
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)

        ##################################
        ## Post Variable Selection Predictions
        ##################################
        self.pls_post, self.ridge_post = \
            self.run_models(X_selected, y, self.param_ridge_post)

        return self

    def predict(self, X_test):
        # Prediction requires refit=True so the GridSearchCV objects hold a
        # usable best_estimator_.
        assert(self.refit == True)
        # Pick whichever post-selection model cross-validated better.
        if self.pls_post.best_score_ > self.ridge_post.best_score_:
            self.best_model = self.pls_post
            print "Chosen Model: pls"
        else:
            self.best_model = self.ridge_post
            print "Chosen Model: ridge"
        # Apply the same column selection that fit() applied to X.
        if self.rlasso_selection_threshold == 0:
            X_test_selected = X_test[:, self.active]
        else:
            X_test_selected = self.rlasso.transform(X_test)
        return self.best_model.best_estimator_.predict(X_test_selected)
    # NOTE(review): tail of a helper (evidently `num2info`, called below) whose
    # `def` line is outside this view. It maps a flattened feature index `num`
    # back to (channel, filter, delay) coordinates. `/` here must be integer
    # division — Python 2 semantics assumed; under Python 3 this would need `//`.
    block = delay * num_filter
    chan = num / block
    f = (num % block) / delay
    t = (num % block) % delay
    return (chan, f, t)


if __name__ == "__main__":
    os.chdir(os.path.dirname(__file__))
    subj = 'sub1'
    finger = 1
    # Load ECoG training data, center it, and apply the stored unmixing matrix.
    with h5py.File('ECoG_data.h5', 'r+') as f:
        u = f[subj]['unmixing_matrix'][:]
        X = f[subj]['train_data'][:]
        X -= X.mean(0)
        X = X.dot(u)
        Y = f[subj]['cleaned_train_dg'][:]
    # NOTE(review): original indentation is lost; the statements below are
    # placed outside the `with` (they only use the arrays read above) — confirm.
    # Stage 1: linear model on the chosen finger's trace.
    X1, y1, _ = preprocessing(X, Y[:, finger])
    ls = LassoLarsCV()
    ls.fit(X1, y1[:, 0])
    pickle.dump(ls, open('linear_model_'+subj+'_'+str(finger), 'wb'))
    # Count how often each channel contributes a nonzero lasso coefficient.
    channel_count = Counter([num2info(c)[0] for c in ls.coef_.nonzero()[0]])
    # Stage 2: logistic model restricted to the channels selected above.
    # NOTE(review): `list(set(...))` has nondeterministic order — the column
    # order of X2 and the saved channel list may vary between runs.
    X2, _, yb = preprocessing(X[:, list(set(channel_count.keys()))], Y[:, finger])
    ls2 = LogisticRegressionCV()
    ls2.fit(X2, yb[:, 0])
    pickle.dump(ls2, open('logistic_model_'+subj+'_'+str(finger), 'wb'))
    with h5py.File('selected_channel.h5', 'w') as f:
        f.create_dataset('selected_channel', data=list(set(channel_count.keys())))