def train_model_cv(self, train_file, normalize, is_bool_value, is_percentage, cv=10, save_model=False):
    # training
    features_array, label_array, feature_names = self.get_features_array_label_array_from_file(
        train_file, normalize=normalize, is_bool_value=is_bool_value, is_percentage=is_percentage)
    # TODO: you can change the model here. Currently we use 10-fold cross-validation for the model.
    # self.model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
    # self.model = linear_model.Lasso(alpha=0.1)
    self.model = linear_model.LassoCV(cv=cv, normalize=False, verbose=True, max_iter=10000)
    print("Model Settings:", self.model)
    self.model.fit(features_array, label_array)
    self.print_linear_regression_formular(feature_names)
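# A minimal alternative sketch for newer scikit-learn releases, where the normalize keyword
# has been removed from LassoCV: the scaling step moves into a Pipeline instead.
# features_array, label_array, and cv are assumed to be the same objects as in the method above.
from sklearn.linear_model import LassoCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), LassoCV(cv=cv, max_iter=10000))
pipe.fit(features_array, label_array)
print("Selected alpha:", pipe.named_steps["lassocv"].alpha_)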
def model_lasso(s, t, s_, t_, flagCV, nFeat): if flagCV: #bad r2 #http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV clf = sklm.LassoCV(eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, \ precompute='auto', max_iter=1000, tol=0.0001, copy_X=True, cv=None, \ verbose=False, n_jobs=1, positive=False, random_state=None, \ selection='cyclic') else: #http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso clf = sklm.Lasso(alpha=1.0, fit_intercept=True, normalize=False, precompute=False, \ copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False,\ random_state=None, selection='cyclic') clf.fit(s, t) print 'coeffs = ', clf.coef_, ' intercept = ', clf.intercept_ feature_imp = clf.coef_ feature_imp_ind_neg = feature_imp.argsort()[0:nFeat] feature_imp_ind_pos = feature_imp.argsort()[-nFeat:][::-1] print 'feature_imp_ind_neg=', feature_imp_ind_neg, 'feature_imp_ind_pos=', feature_imp_ind_pos r2_train = clf.score(s, t) r2_test = clf.score(s_, t_) print 'r2_train=', r2_train, ' r2_test=', r2_test
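# A hypothetical usage sketch for model_lasso on synthetic data. It assumes Python 2 (the
# function uses print statements) and an older scikit-learn in which Lasso/LassoCV still
# accept the normalize keyword; all data below is made up for illustration.
import numpy as np

rng = np.random.RandomState(0)
w = rng.randn(10)
s = rng.randn(80, 10);  t = s.dot(w) + 0.1 * rng.randn(80)     # training block
s_ = rng.randn(20, 10); t_ = s_.dot(w) + 0.1 * rng.randn(20)   # held-out block
model_lasso(s, t, s_, t_, flagCV=True, nFeat=3)  # report the 3 most negative / positive coefficients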
def franke_lasso(n=10000, eps=0.0):
    max_order = 5
    x = rng.uniform(size=n)
    y = rng.uniform(size=n)
    err = eps * rng.normal(size=n)
    x_train = x[:int(n / 2)]
    y_train = y[:int(n / 2)]
    err_train = err[:int(n / 2)]
    x_valid = x[int(n / 2):]
    y_valid = y[int(n / 2):]
    err_valid = err[int(n / 2):]  # use the second half of the noise for the validation set
    z_train = FrankeFunction(x_train, y_train) + err_train
    z_valid = FrankeFunction(x_valid, y_valid) + err_valid
    xb = np.column_stack(sol_tup(x_train, y_train, max_order))
    lasso = linear_model.LassoCV(max_iter=100000, cv=5)
    lasso.fit(xb, z_train)
    predl = lasso.predict(np.column_stack(sol_tup(x_valid, y_valid, max_order)))
    return lasso.coef_, mean_squared_error(z_valid, predl), r2_score(z_valid, predl)
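# sol_tup above is assumed to be a helper returning the tuple of two-variable polynomial
# terms up to max_order; a rough scikit-learn equivalent (bias column dropped, column
# ordering may differ) is sketched here for reference.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

def poly_design(x, y, max_order=5):
    # All monomials x**i * y**j with 1 <= i + j <= max_order.
    return PolynomialFeatures(degree=max_order, include_bias=False).fit_transform(
        np.column_stack((x, y)))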
def fs_lasso_cv(X, y, feat_list, n_alphas=1000, cv=10, tol=0.00001, max_iter=10000, hard_shrink=None): '''Wrapper function to build a LassoCV model from sklearn and return important features''' lcv = linear_model.LassoCV(n_jobs=max(1, mp.cpu_count() - 1), n_alphas=n_alphas, cv=cv, tol=tol, max_iter=max_iter) coefs = lcv.fit(X, y).coef_ # force shrinkage to zero if hard_shrink is provided if hard_shrink is not None: np.place(coefs, np.abs(coefs) < hard_shrink, 0) selected_feats = list(it.compress(feat_list, coefs)) return selected_feats
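# A hypothetical usage sketch for fs_lasso_cv; the data, feature names, and the hard_shrink
# cutoff below are made up for illustration.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = 3 * X[:, 0] - 2 * X[:, 3] + 0.1 * rng.randn(200)
feat_list = ["f0", "f1", "f2", "f3", "f4"]
print(fs_lasso_cv(X, y, feat_list, hard_shrink=0.05))  # expected to keep roughly f0 and f3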
def lasso_regression(X_train, y_train, X_test, y_test, normalize=False): print("-------------------------- Lasso Regression") clf = linear_model.LassoCV(alphas=np.arange(0.1, 2, 0.1), max_iter=5000) clf.fit(X_train, y_train) # Make predictions using the testing set y_pred = clf.predict(X_test) # The intercept print("Intercept: %.4f" % clf.intercept_) # The mean squared error print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred)) # Explained variance score: 1 is perfect prediction print('Coefficient of determination(R^2): %.2f' % r2_score(y_test, y_pred)) # The coefficients cols = X_train.columns.tolist() coef = clf.coef_.tolist() coef = list(zip(cols, coef)) df_coef = pd.DataFrame.from_records(coef) print('Coefficients: \n', df_coef.T) print('Alpha: \n', clf.alpha_) return clf
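# As a sketch of an alternative way to tune the same alpha grid, GridSearchCV over a plain
# Lasso makes the scorer explicit; X_train/y_train are assumed to be the arguments passed to
# lasso_regression above, and grid.best_params_["alpha"] is comparable to clf.alpha_.
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(Lasso(max_iter=5000),
                    {"alpha": np.arange(0.1, 2, 0.1)},
                    scoring="neg_mean_squared_error", cv=5)
# grid.fit(X_train, y_train); grid.best_params_["alpha"]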
import numpy as np
from sklearn import datasets, linear_model
from sklearn.model_selection import KFold

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

alphas = np.logspace(-4, -.5, 30)
lasso_cv = linear_model.LassoCV(alphas=alphas)
k_fold = KFold(5)

for k, (train, test) in enumerate(k_fold.split(X)):
    lasso_cv.fit(X[train], y[train])
    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
          format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))
speed_corr_neurons = template_info.loc[speed_corr_neurons_index]

# Standardize X, Y
Y = preprocessing.StandardScaler().\
    fit_transform(np.reshape(speeds_0p25[:-1], (-1, 1))).reshape(-1)
X = preprocessing.StandardScaler().\
    fit_transform(spike_rates_0p25[speed_corr_neurons_index].transpose())
# Or not
Y = np.array(speeds_0p25)
X = spike_rates_0p25[speed_corr_neurons_index].transpose()

# Set up the regressors
model_linear = linear_model.LinearRegression(fit_intercept=True)
model_lassoCV = linear_model.LassoCV(cv=5, fit_intercept=True)
model_lasso = linear_model.Lasso(alpha=0.02, fit_intercept=True, max_iter=10000, normalize=False)
model_gam = pg.LinearGAM()
Y = np.exp(Y) / (np.exp(Y) + 1)
model_gam = pg.GammaGAM()

# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=0)

# Fit
model = model_gam
# boston_X_test_scaled = boston_X_scaled[-20:]
# boston_y_train = boston.target[:-20]
# boston_y_test = boston.target[-20:]

# Prepare ensemble regressors
regressors = (
    linear_model.LinearRegression(fit_intercept=True),
    Pipeline(
        [('poly', PolynomialFeatures(degree=2)),
         ('linear', linear_model.LinearRegression(fit_intercept=False))]
    ),
    linear_model.Ridge(alpha=.1, fit_intercept=True),
    linear_model.RidgeCV(alphas=[.01, .1, .3, .5, 1], fit_intercept=True),
    linear_model.Lasso(alpha=1, fit_intercept=True),
    linear_model.LassoCV(n_alphas=100, fit_intercept=True),
    linear_model.ElasticNet(alpha=1),
    linear_model.ElasticNetCV(n_alphas=100, l1_ratio=.5),
    linear_model.OrthogonalMatchingPursuit(),
    linear_model.BayesianRidge(),
    linear_model.ARDRegression(),
    linear_model.SGDRegressor(),
    linear_model.PassiveAggressiveRegressor(loss='squared_epsilon_insensitive'),
    linear_model.RANSACRegressor(),
    LinearSVR(max_iter=1e4, fit_intercept=True, loss='squared_epsilon_insensitive', C=0.5),
    SVR(max_iter=1e4, kernel='poly', C=1, degree=4),
    SVR(max_iter=1e4, kernel='rbf', C=1, gamma=0.1),
    SVR(kernel='linear', C=1),
    SVR(kernel='linear', C=0.5),
    SVR(kernel='linear', C=0.1),
    DecisionTreeRegressor(max_depth=5),
l_eye = l[idx] * np.eye(X.shape[1]) H_ridge = np.linalg.inv(X.T.dot(X) + l_eye) beta_ridge = H_ridge.dot(X.T).dot(z_flat) z_tilde_ridge = X @ beta_ridge plt.figure() plt.title('Terrain data from Norway after ridge regression') plt.imshow(np.reshape(z_tilde_ridge, np.shape(z)), cmap='gray') plt.xlabel('X') plt.ylabel('Y') save_fig('SRTM_data_Norway_1_ridge_regression') # lasso lasso=linear_model.LassoCV(max_iter=1000000, cv=5) lasso.fit(X[:,1:], z_flat) predl=lasso.predict(X[:,1:]) plt.figure() plt.title('Terrain data from Norway after lasso') plt.imshow(np.reshape(predl, np.shape(z)), cmap='gray') plt.xlabel('X') plt.ylabel('Y') save_fig('SRTM_data_Norway_1_lasso') ## compressing image data (unused) r = 20 U, S, V = np.linalg.svd(z) #using SVD method to decompose image
def __init__( self, method, yrange, params, i=0 ): #TODO: yrange doesn't currently do anything. Remove or do something with it! self.algorithm_list = [ 'PLS', 'GP', 'OLS', 'OMP', 'Lasso', 'Elastic Net', 'Ridge', 'Bayesian Ridge', 'ARD', 'LARS', 'LASSO LARS', 'SVR', 'KRR', ] self.method = method self.outliers = None self.ransac = False print(params) if self.method[i] == 'PLS': self.model = PLSRegression(**params[i]) if self.method[i] == 'OLS': self.model = linear.LinearRegression(**params[i]) if self.method[i] == 'OMP': # check whether to do CV or not self.do_cv = params[i]['CV'] # create a temporary set of parameters params_temp = copy.copy(params[i]) # Remove CV parameter params_temp.pop('CV') if self.do_cv is False: self.model = linear.OrthogonalMatchingPursuit(**params_temp) else: params_temp.pop('precompute') self.model = linear.OrthogonalMatchingPursuitCV(**params_temp) if self.method[i] == 'LASSO': # create a temporary set of parameters params_temp = copy.copy(params[i]) # check whether to do CV or not try: self.do_cv = params[i]['CV'] # Remove CV parameter params_temp.pop('CV') except: self.do_cv = False if self.do_cv is False: self.model = linear.Lasso(**params_temp) else: params_temp.pop('alpha') self.model = linear.LassoCV(**params_temp) if self.method[i] == 'Elastic Net': params_temp = copy.copy(params[i]) try: self.do_cv = params[i]['CV'] params_temp.pop('CV') except: self.do_cv = False if self.do_cv is False: self.model = linear.ElasticNet(**params_temp) else: params_temp['l1_ratio'] = [.1, .5, .7, .9, .95, .99, 1] self.model = linear.ElasticNetCV(**params_temp) if self.method[i] == 'Ridge': # create a temporary set of parameters params_temp = copy.copy(params[i]) try: # check whether to do CV or not self.do_cv = params[i]['CV'] # Remove CV parameter params_temp.pop('CV') except: self.do_cv = False if self.do_cv: self.model = linear.RidgeCV(**params_temp) else: self.model = linear.Ridge(**params_temp) if self.method[i] == 'BRR': self.model = linear.BayesianRidge(**params[i]) if self.method[i] == 'ARD': self.model = linear.ARDRegression(**params[i]) if self.method[i] == 'LARS': # create a temporary set of parameters params_temp = copy.copy(params[i]) try: # check whether to do CV or not self.do_cv = params[i]['CV'] # Remove CV parameter params_temp.pop('CV') except: self.do_cv = False if self.do_cv is False: self.model = linear.Lars(**params_temp) else: self.model = linear.LarsCV(**params_temp) if self.method[i] == 'LASSO LARS': model = params[i]['model'] params_temp = copy.copy(params[i]) params_temp.pop('model') if model == 0: self.model = linear.LassoLars(**params_temp) elif model == 1: self.model = linear.LassoLarsCV(**params_temp) elif model == 2: self.model = linear.LassoLarsIC(**params_temp) else: print("Something went wrong, \'model\' should be 0, 1, or 2") if self.method[i] == 'SVR': self.model = svm.SVR(**params[i]) if self.method[i] == 'KRR': self.model = kernel_ridge.KernelRidge(**params[i]) if self.method[i] == 'GP': # get the method for dimensionality reduction and the number of components self.reduce_dim = params[i]['reduce_dim'] self.n_components = params[i]['n_components'] # create a temporary set of parameters params_temp = copy.copy(params[i]) # Remove parameters not accepted by Gaussian Process params_temp.pop('reduce_dim') params_temp.pop('n_components') self.model = GaussianProcess(**params_temp)
# print('Final prediction score with optimal: [%.8f](use RF)' % mean_absolute_error(y_test, y_pred)) # # '''5. just use the EXTRA''' # model = ExtraTreesRegressor(random_state=0, n_estimators=100) # model.fit(X_train, y_train) # y_pred = model.predict(X_test) # print('Final prediction score with optimal: [%.8f](use EXTRA)' % mean_absolute_error(y_test, y_pred)) # # '''6. just use the XGB''' # model = XGBRegressor(random_state=0,n_estimators=100, n_jobs=-1) # model.fit(X_train, y_train) # y_pred = model.predict(X_test) # print('Final prediction score with optimal: [%.8f](use XGB)' % mean_absolute_error(y_test, y_pred)) models = [ linear_model.LassoCV(random_state=0, n_jobs=1), GradientBoostingRegressor(random_state=0), SVR(), RandomForestRegressor(random_state=0, n_jobs=1, n_estimators=100, max_depth=3), ExtraTreesRegressor(random_state=0, n_estimators=100), XGBRegressor(random_state=0, n_estimators=100, n_jobs=1) ] # # '''With optimal''' model = Superknn(models=models, metric=mean_absolute_error, n_jobs=10, random_state=0, folds=5)
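# Superknn appears to be a custom stacking helper; a rough scikit-learn-only sketch of the
# same idea using the models list above (requires a scikit-learn release that provides
# StackingRegressor; the estimator names and the final estimator are illustrative choices):
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

stack = StackingRegressor(
    estimators=[("m%d" % i, m) for i, m in enumerate(models)],
    final_estimator=RidgeCV(), cv=5, n_jobs=1)
# stack.fit(X_train, y_train); mean_absolute_error(y_test, stack.predict(X_test))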
def selectByLassoL1(X, y, value, diagnose=False):
    '''
    Select the features using lasso regression

    Parameters
    ----------
    X : TYPE pandas dataframe
        DESCRIPTION. dataframe of the attributes
    y : TYPE pandas dataframe
        DESCRIPTION. dataframe with the target variable
    value : TYPE float
        DESCRIPTION. coefficient threshold 0->1
    diagnose : TYPE, optional
        if true, generates a chart of the number of selected features as a function of the
        coefficient threshold
        DESCRIPTION. The default is False.

    Returns
    -------
    res : TYPE pandas dataframe
        DESCRIPTION. output dataframe with selected variables
    output_figure : TYPE dictionary
        DESCRIPTION. dictionary containing the figures with the plot of the selected
        features depending on the threshold
    '''
    output_figure = {}
    # lasso feature selection works for regression models
    Q = dummyColumns(X)
    clf = linear_model.LassoCV(cv=5)
    if diagnose:
        numFeatSelected = []
        for i in range(1, 101):
            val = i / 100
            sfm = SelectFromModel(clf, threshold=val)
            sfm.fit(Q, y)
            nn = len(Q.columns[sfm.get_support()])
            #print(nn)
            numFeatSelected.append(nn)
        fig1 = plt.figure()
        plt.plot(range(1, 101), numFeatSelected)
        plt.title('Lasso graph')
        plt.xlabel('Coeff threshold %')
        plt.ylabel('Num. selected features')
        output_figure['LassoChart'] = fig1
        plt.close('all')
    #_, _, _, alphaValue, _= LassoRegressionCV(Q,y,'',nFolds=5, saveFig=False) #previous version with lasso from ZO_RegressionLinearModel
    #las=LassoRegressionComplete(Q,y,alphaValue)
    sfm = SelectFromModel(clf, threshold=value)
    sfm.fit(Q, y)
    #n_features = sfm.transform(X).shape[1]
    #model = SelectFromModel(las, prefit=True,threshold=0.25)
    Z = sfm.transform(Q)
    feature_idx = sfm.get_support()
    feature_name = Q.columns[feature_idx]
    res = pd.DataFrame(Z, columns=feature_name)
    return res, output_figure
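# A hypothetical call, assuming X is a pandas DataFrame of predictors (dummyColumns is the
# helper used above to one-hot encode any categorical columns) and y the target column;
# the 0.25 threshold is illustrative.
selected_df, figures = selectByLassoL1(X, y, value=0.25, diagnose=True)
print(selected_df.columns.tolist())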
def organize_data(ManagerID, data_mktbeta, data_indubeta, data_FAndMdata, startdatestr, enddatestr, ob_win): """ give ManagerID, organize data to store in sql ob_win: ob window length return result_df """ # print(datetime.datetime.now()) # print('Start organize data of: ' + ManagerID) # read mng record data data_allrecord = data_FAndMdata[data_FAndMdata.ManagerID == ManagerID] # store cols = ['ID', 'EndDate', 'InvestAdvisor', 'ManagerID', 'Ret'] result_df = pd.DataFrame(columns=cols) # store here # 先算出复合收益率 time_array = data_allrecord.EndDate.unique() for date in time_array: data_subrecord = data_allrecord[data_allrecord.EndDate == date] wgted_ret = 0 wgt = 0 InAdv = '' for index, row in data_subrecord.iterrows(): wgt += 1 / row.ManagersofFund wgted_ret += row.dailyreturn / row.ManagersofFund InAdv = row.InvestAdvisorAbbrName ret = wgted_ret / wgt if np.isnan(ret): continue else: IDstr = ManagerID + pd.to_datetime(date).strftime('%Y%m%d') result_df = result_df.append(dict( zip(cols, [IDstr, date, InAdv, ManagerID, ret])), ignore_index=True) # 业绩分解 # fetch index data and construct np.array addcols = [ 'beta_mkt1', 'beta_mkt2', 'beta_mkt3', 'name_mkt1', 'name_mkt2', 'name_mkt3', 'intercept_mkt', 'score_mkt', 'bias_ret_mkt', 'bias_var_mkt', 'bias_score_mkt', # bias mkt: 风格的偏离 'beta_indu1', 'beta_indu2', 'beta_indu3', 'name_indu1', 'name_indu2', 'name_indu3', 'intercept_indu', 'score_indu', 'bias_ret_indu', 'bias_var_indu', 'bias_score_indu' ] # bias indu:行业的偏离 newcols = cols + addcols result_df = result_df.reindex(columns=newcols) idx = 0 # total = len(result_df) while idx < len(result_df): if idx < ob_win - 1: # no enough data # print('line:' + str(idx) + '/' + str(total) + ', skip') idx += 1 continue else: # get date obdates = result_df.iloc[(idx - ob_win + 1):(idx + 1)].EndDate timegap = (obdates.iloc[-1] - obdates.iloc[0]).days if timegap / ob_win > 9 / 5: # dates not continuous # print('line:' + str(idx) + '/' + str(total) + ', skip') idx += 1 continue else: # calc mng_ret = result_df.iloc[(idx - ob_win + 1):(idx + 1)].Ret mkt_ret = data_mktbeta.loc[obdates] indu_ret = data_indubeta.loc[obdates] mng_ret = mng_ret.values mkt_ret = mkt_ret.values indu_ret = indu_ret.values # remove NaN rows isnanrow = np.isnan(mkt_ret[:, 1]) mng_ret = mng_ret[~isnanrow] mkt_ret = mkt_ret[~isnanrow] indu_ret = indu_ret[~isnanrow] # define mkt model model = linear_model.LassoCV( positive=True, cv=int(ob_win / 30), # subsample size = 30 selection='random', fit_intercept=True, normalize=False) # mkt model.fit(mkt_ret, mng_ret) # fit(X, y) beta_mkt = model.coef_ name_mkt = data_mktbeta.columns.values sortedidx = np.argsort(beta_mkt) result_df.ix[idx, 'beta_mkt1'] = beta_mkt[sortedidx[-1]] result_df.ix[idx, 'name_mkt1'] = name_mkt[sortedidx[-1]] result_df.ix[idx, 'beta_mkt2'] = beta_mkt[sortedidx[-2]] result_df.ix[idx, 'name_mkt2'] = name_mkt[sortedidx[-2]] result_df.ix[idx, 'beta_mkt3'] = beta_mkt[sortedidx[-3]] result_df.ix[idx, 'name_mkt3'] = name_mkt[sortedidx[-3]] result_df.ix[idx, 'intercept_mkt'] = model.intercept_ result_df.ix[idx, 'score_mkt'] = model.score(mkt_ret, mng_ret) # bias mkt # calc ret b_avg = np.mean(beta_mkt) b_adj = beta_mkt - b_avg ct = 1 / np.sum(b_adj[b_adj > 0]) # scale factor b_adj = b_adj * ct bias_retts_mkt = np.dot(mkt_ret, b_adj) # dot成个加权的收益率 temp = np.mean(bias_retts_mkt) * 250 # daily ret 的年化 if np.isnan(temp): temp = 0 result_df.ix[idx, 'bias_ret_mkt'] = temp temp = np.std(bias_retts_mkt) * 250**0.5 # daily ret std 的年化 if np.isnan(temp): temp = 0 result_df.ix[idx, 
'bias_var_mkt'] = temp # calc score # std coef result_df.ix[idx, 'bias_score_mkt'] = np.std(beta_mkt) # define indu model model = linear_model.LassoCV( positive=True, cv=int(ob_win / 30), # subsample size = 30 selection='random', fit_intercept=True, normalize=False) # indu model.fit(indu_ret, mng_ret) beta_indu = model.coef_ name_indu = data_indubeta.columns.values sortedidx = np.argsort(beta_indu) result_df.ix[idx, 'beta_indu1'] = beta_indu[sortedidx[-1]] result_df.ix[idx, 'name_indu1'] = name_indu[sortedidx[-1]] result_df.ix[idx, 'beta_indu2'] = beta_indu[sortedidx[-2]] result_df.ix[idx, 'name_indu2'] = name_indu[sortedidx[-2]] result_df.ix[idx, 'beta_indu3'] = beta_indu[sortedidx[-3]] result_df.ix[idx, 'name_indu3'] = name_indu[sortedidx[-3]] result_df.ix[idx, 'intercept_indu'] = model.intercept_ result_df.ix[idx, 'score_indu'] = model.score(indu_ret, mng_ret) # bias indu # calc ret b_avg = np.mean(beta_indu) b_adj = beta_indu - b_avg ct = 1 / np.sum(b_adj[b_adj > 0]) # scale factor b_adj = b_adj * ct bias_retts_indu = np.dot(indu_ret, b_adj) # dot成个加权的收益率 temp = np.mean(bias_retts_indu) * 250 # daily ret 的年化 if np.isnan(temp): temp = 0 result_df.ix[idx, 'bias_ret_indu'] = temp temp = np.std(bias_retts_indu) * 250**0.5 # daily ret std 的年化 if np.isnan(temp): temp = 0 result_df.ix[idx, 'bias_var_indu'] = temp # calc score # std coef result_df.ix[idx, 'bias_score_indu'] = np.std(beta_indu) # end of calc idx += 1 # print('line:' + str(idx) + '/' + str(total) + ', done') # end of while loop # 截取starttime和endtime之间的result sdtime = datetime.datetime.strptime(startdatestr, '%Y-%m-%d') edtime = datetime.datetime.strptime(enddatestr, '%Y-%m-%d') result_df = result_df[(result_df.EndDate >= sdtime) & (result_df.EndDate <= edtime)] # print(datetime.datetime.now()) print('End organize data of: ' + ManagerID) return result_df
def computeRscores(product_features_list, product_ratings_list, num_features, file_name):
    from sklearn import linear_model
    result = []
    # create a folder for the output
    import os
    new_file_dir = path + "data\\feature_coefficient_list_" + file_name.split(".")[0] + "\\"
    isExists = os.path.exists(new_file_dir)
    if not isExists:
        os.makedirs(new_file_dir)
    new_file_name = new_file_dir + file_name.split(".")[0] + "_"

    # Method 1: plain linear regression
    print "start to linear regression"
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    reg = linear_model.LinearRegression()
    reg.fit(copy_product_features_list, copy_product_ratings_list)
    del copy_product_features_list
    del copy_product_ratings_list
    file_path = new_file_name + "linear_regression_" + str(num_features) + ".xls"
    save_coefficients_new(reg.coef_, reg.intercept_, file_path)
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    r2_linearregression = reg.score(copy_product_features_list, copy_product_ratings_list)
    print "r2 score: ", r2_linearregression
    del copy_product_features_list
    del copy_product_ratings_list

    # Method 2: lasso
    print "start to lasso regression"
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    reg = linear_model.LassoCV(cv=5, random_state=0)
    reg.fit(copy_product_features_list, copy_product_ratings_list)
    del copy_product_features_list
    del copy_product_ratings_list
    file_path = new_file_name + "linear_lassocv_regression_" + str(num_features) + ".xls"
    save_coefficients_new(reg.coef_, reg.intercept_, file_path)
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    r2_lasso = reg.score(copy_product_features_list, copy_product_ratings_list)
    print "r2 score: ", r2_lasso
    del copy_product_features_list
    del copy_product_ratings_list
    result = reg.coef_

    # Method 3: ridge
    print "start to ridge regression"
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    reg = linear_model.RidgeCV(cv=5)
    reg.fit(copy_product_features_list, copy_product_ratings_list)
    del copy_product_features_list
    del copy_product_ratings_list
    file_path = new_file_name + "linear_ridge_regression_" + str(num_features) + ".xls"
    save_coefficients_new(reg.coef_, reg.intercept_, file_path)
    copy_product_features_list = copy.deepcopy(product_features_list)
    copy_product_ratings_list = copy.deepcopy(product_ratings_list)
    r2_ridge = reg.score(copy_product_features_list, copy_product_ratings_list)
    print "r2 score: ", r2_ridge
    del copy_product_features_list
    del copy_product_ratings_list
    return result
#%% import sklearn.linear_model as sk_lm import sklearn.ensemble as sk_ens import xgboost as xgb import sklearn.neural_network as sk_nn from sklearn.metrics import r2_score from sklearn.model_selection import StratifiedKFold, KFold df_X_train = df_non_obj_feats[:train_len] df_X_test = df_non_obj_feats[train_len:] y = target_log1p df_importance = pd.DataFrame(data=None, index=df_non_obj_feats.columns) #%% model_lasso = sk_lm.LassoCV(alphas=[3e-4, 3e-3, 3e-2, 3e-1, 3, 30]) model_lasso.fit(df_X_train, y) model_lasso.score(df_X_train, y) r2_score_lasso = r2_score(y, model_lasso.predict(df_X_train)) print('r2_score of Lasso:', r2_score_lasso) lasso_importance = pd.DataFrame(model_lasso.coef_, df_X_train.columns,['LS_feat_importance']) lasso_importance.plot(); plt.show() df_importance['LassoCV'] = r2_score_lasso*lasso_importance/np.max(np.abs(lasso_importance)) #%% model_elen = sk_lm.ElasticNetCV(alphas=[3e-4, 3e-3, 3e-2, 3e-1, 3, 30]) model_elen.fit(df_X_train, y) model_elen.score(df_X_train, y) r2_score_elen = r2_score(y, model_elen.predict(df_X_train)) print('r2_score of ElasticNet:', r2_score_elen)
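# Since y here is target_log1p, predictions should be mapped back with expm1 before being
# reported in the original units; a minimal sketch (df_X_test is the held-out frame built above):
import numpy as np

pred_log = model_lasso.predict(df_X_test)
pred = np.expm1(pred_log)  # invert the log1p transform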
def main(): pd.set_option('display.max_columns', None) from sklearn.neighbors import KNeighborsClassifier data = pd.read_csv('Life Expectancy Data.csv') # print(data.columns) data.rename(columns={'Life expectancy ': "Life_expectancy"}, inplace=True) data.rename(columns={'Adult Mortality': "Adult_Mortality"}, inplace=True) data.rename(columns={'infant deaths': "infant_deaths"}, inplace=True) data.rename(columns={'percentage expenditure': 'percentage_expenditure'}, inplace=True) data.rename(columns={'Hepatitis B': "Hepatitis_B"}, inplace=True) data.rename(columns={' BMI ': "BMI"}, inplace=True) data.rename(columns={'under-five deaths ': "under-five_deaths"}, inplace=True) data.rename(columns={'Total expenditure': "Total_expenditure"}, inplace=True) data.rename(columns={' HIV/AIDS': "HIV/AIDS"}, inplace=True) data.rename(columns={' thinness 1-19 years': "thinness_1-19_years"}, inplace=True) data.rename(columns={' thinness 5-9 years': "thinness_5-9_years"}, inplace=True) data.rename(columns={'Income composition of resources': "Income_composition_of_resources"}, inplace=True) data.rename(columns={'HIV/AIDS': "HIV_AIDS"}, inplace=True) data.rename(columns={'Measles ': "Measles"}, inplace=True) data.rename(columns={'Diphtheria ': "Diphtheria"}, inplace=True) # delet the data with null life expectancy value data['Life_expectancy'] = data['Life_expectancy'].fillna(999) drop_index = data[(data.Life_expectancy == 999)].index.tolist() data = data.drop(drop_index) # print(data['Life_expectancy']) # make life_expectancy as our output labels = data.loc[:, ['Life_expectancy']] # del data['Life_expectancy'] # deal with categorical data "status" since it contains numerical quality data['Status'] = data['Status'].str.replace('Developed', '2', case=False) data['Status'] = data['Status'].str.replace('Developing', '1', case=False) # print('data=',data) # data=pd.get_dummies(data, prefix=['Country'], columns=['Country']) # print('size after dummy', data.shape) # Separate the data to train, val and test data x, x_test, y, y_test = train_test_split(data, labels, test_size=0.2, train_size=0.8, random_state=1) x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, train_size=0.8, random_state=50) # print(x_train) # calculate the null value in each column NoNull_train = x_train.isnull().sum(axis=0) # print('--------Null data in train----------') # print(NoNull_train) # print('--------Null data in train----------') ''' sns.distplot(y_train['Life_expectancy']) mu=y_train['Life_expectancy'].mean() sigma=y_train['Life_expectancy'].std() #Now plot the distribution plt.legend(['Original dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],loc='best') plt.ylabel('Frequency') plt.title('Life_expectancy') #Get also the QQ-plot fig = plt.figure() res = stats.probplot(y_train['Life_expectancy'], plot=plt) plt.show() ''' # make the target_train data become more closed to the normal distribution # ---------------------------------------------- # We use the numpy fuction log1p which applies log(1+x) to all elements of the column ''' #Check the new distribution sns.distplot(y_train['Life_expectancy'] , fit=norm); # Get the fitted parameters used by the function (mu, sigma) = norm.fit(y_train['Life_expectancy']) print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma)) #Now plot the distribution plt.legend(['Normal dist. 
($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],loc='best') plt.ylabel('Frequency') plt.title('Life Expectancy') #Get also the QQ-plot fig = plt.figure() res = stats.probplot(y_train['Life_expectancy'], plot=plt) plt.show() ''' Null_train_ratio = (x_train.isnull().sum() / len(x_train)) * 100 Null_train_ratio = Null_train_ratio.sort_values(ascending=False) AllNull_train_ratio = Null_train_ratio.drop(Null_train_ratio[Null_train_ratio == 0].index) missing_train_ratio = pd.DataFrame({'Missing train data ratio': AllNull_train_ratio}) # print(missing_train_ratio) f, ax = plt.subplots(figsize=(15, 12)) plt.xticks(rotation='90') # ratate direction of words for each feature sns.barplot(x=Null_train_ratio.index, y=Null_train_ratio) plt.xlabel('Features', fontsize=15) plt.ylabel('Percent of missing values', fontsize=15) plt.title('missing data percentage by feature', fontsize=15) plt.show() # print("--------------train data description-------------") # print(x_train.describe()) # draw to recognize the outliers # Show the boxplot before dealing with outliers # feature "Adult_Mortality" # showplot(x_train,'before') # try to remove the outliers # print('len of x train=',len(x_train)) Q1 = x_train.quantile(0.25) Q3 = x_train.quantile(0.75) IQR = Q3 - Q1 x_train = outlier_remove_traindata(x_train, Q1, Q3, IQR) x_val = outlier_remove_traindata(x_val, Q1, Q3, IQR) # print('finish removing the outliers') # ----------finished dealing with the outlier in each features------------- # Show the boxplot after dealing with outliers # feature "Adult_Mortality" # showplot(x_train,'after') ####fill the missing data with mean value x_train = x_train.fillna(x_train.mean()) x_val = x_val.fillna(x_train.mean()) x = x.fillna(x.mean()) x_test = x_test.fillna(x.mean()) ####fill the missing data with median # x_train=x_train.fillna(x_train.median()) # x_val=x_val.fillna(x_train.median()) ####fill the missing data with -1 # x_train=x_train.fillna(-1) # x_val=x_val.fillna(-1) ####fill the missing data with 0 # x_train=x_train.fillna(0) # x_val=x_val.fillna(0) NoNull_train = x_train.isnull().sum(axis=0) # print('--------Null data in train----------') # print('NoNull_train',NoNull_train) # separate the string and numerical data x_train_str = x_train.loc[:, ["Country"]] x_train_num = x_train.loc[:, ['Year', 'Status', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths', 'Alcohol', 'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria', 'HIV_AIDS', 'GDP', 'Population', 'thinness_1-19_years', 'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling']] x_val_str = x_val.loc[:, ["Country"]] x_val_num = x_val.loc[:, ['Year', 'Status', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths', 'Alcohol', 'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria', 'HIV_AIDS', 'GDP', 'Population', 'thinness_1-19_years', 'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling']] x_str = x.loc[:, ["Country"]] x_num = x.loc[:, ['Year', 'Status', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths', 'Alcohol', 'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria', 'HIV_AIDS', 'GDP', 'Population', 'thinness_1-19_years', 'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling']] x_test_num = x_test.loc[:, ['Year', 'Status', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths', 'Alcohol', 
'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria', 'HIV_AIDS', 'GDP', 'Population', 'thinness_1-19_years', 'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling']] x_train_str = pd.get_dummies(x_train_str) # Try to see the correlation between country(after get dummy) and life expectancy country_col = x_train_str.columns # print('country column =',len(country_col)) # Try to see the correlation between country(after get dummy) and life expectancy x_train_str["Life_expectancy"] = y_train country_corrmat = x_train_str.corr() cols = abs(country_corrmat).nlargest(10, 'Life_expectancy')['Life_expectancy'].index cm = np.corrcoef(x_train_str[cols].values.T) sns.set(font_scale=1.25) plt.subplots(figsize=(15, 12)) hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values) bottom, top = hm.get_ylim() hm.set_ylim(bottom + 0.5, top - 0.5) plt.title('The country that is most related to life expectancy') plt.show() # Since the highest correlation between country and life expectancy is 0.17, we decide not to use the feature "country" # standardize the data standar = preprocessing.StandardScaler().fit(x_train_num) x_train = standar.transform(x_train_num) x_val = standar.transform(x_val_num) x_train1 = pd.DataFrame(data=x_train, columns=x_train_num.columns) x_val1 = pd.DataFrame(data=x_val, columns=x_train_num.columns) # Correlation map to see how features are correlated with LifeExpectancy corrmat = x_train1.corr() plt.subplots(figsize=(18, 15)) ax = sns.heatmap(corrmat, vmax=1, annot=True, square=True, vmin=0) bottom, top = ax.get_ylim() ax.set_ylim(bottom + 0.5, top - 0.5) plt.title('Correlation Heatmap Between Each Feature') plt.show() cols = abs(corrmat).nlargest(19, 'Life_expectancy')['Life_expectancy'].index cm = np.corrcoef(x_train1[cols].values.T) sns.set(font_scale=1.25) plt.subplots(figsize=(15, 12)) plt.title('18 Features that are most related to Life Expectancy') hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values) bottom, top = hm.get_ylim() hm.set_ylim(bottom + 0.5, top - 0.5) plt.show() cols = abs(corrmat).nlargest(21, 'Life_expectancy')['Life_expectancy'].index related_col = cols.drop(['Life_expectancy']).drop(['Status']).drop(['Hepatitis_B']).drop(['infant_deaths']).drop( ['GDP']).drop(['Measles']).drop(['Population']).drop(['percentage_expenditure']).drop(['Diphtheria']) # related_col = related_col.drop(['Status']) # related_col = related_col.drop(['under-five_deaths']) # print("The columns most related to Life expectancy=", related_col) # transform the test data standar_all = preprocessing.StandardScaler().fit(x_num) x = standar_all.transform(x_num) x_test = standar_all.transform(x_test_num) x1 = pd.DataFrame(data=x, columns=x_train_num.columns) x_test1 = pd.DataFrame(data=x_test, columns=x_train_num.columns) x_train = x_train1[related_col] x_val = x_val1[related_col] x = x1[related_col] x_test = x_test1[related_col] ''' # Choose the optimal no of features => 18 features (k=19, we need to deduct 'life expectancy') meanerror_NoFeature_val = [] mse_NoFeature_val = [] for k in range(5, 21): cols = abs(corrmat).nlargest(k, 'Life_expectancy')['Life_expectancy'].index related_col = cols.drop(['Life_expectancy']) x_train = x_train1[related_col] x_val = x_val1[related_col] mean_train = np.mean(y_train) mean_train_array = [x * mean_train for x 
in np.ones(y_train.shape[0])] np.ones(y_train.shape) # We found that the min MSE happened when n_estimators=25 error_train = [] error_val = [] mse_train = [] mse_val = [] for j in range(10): pre_train, pre_X_train_pick, pre_y_train, pre_y_train_pick = train_test_split(x_train, y_train, test_size=1 / 3) RF = RandomForestRegressor(n_estimators=25, bootstrap=True, max_features=3) RF.fit(pre_X_train_pick, pre_y_train_pick.values.ravel()) predict_train = RF.predict(x_train) predict_val = RF.predict(x_val) acc_train = RF.score(x_train, y_train) acc_val = RF.score(x_val, y_val) error_train = np.append(error_train, 1 - acc_train) error_val = np.append(error_val, 1 - acc_val) mse_train = np.append(mse_train, mean_squared_error(predict_train, y_train)) mse_val = np.append(mse_val, mean_squared_error(predict_val, y_val)) meanerror_val = np.mean(error_val) mean_mse_val = np.mean(mse_val) meanerror_NoFeature_val = np.append(meanerror_NoFeature_val, meanerror_val) mse_NoFeature_val = np.append(mse_NoFeature_val, mean_mse_val) # print('mean error in validation set when 4~19 features') # print(meanerror_NoFeature_val) # print(' ') # print('mse in validation set when 4~19 features') # print(mse_NoFeature_val) print('When we set the number of features from 4-19,') print('min mean error = %.2f and min MSE = %.2f in validation set when we chose %.0f correlated features' % ( min(meanerror_NoFeature_val), min(mse_NoFeature_val), np.argmin(meanerror_NoFeature_val) + 4)) X = np.arange(4, 20) plt.plot(X, meanerror_NoFeature_val, label='Mean Error') plt.title('Number of Features vs Mean Error') plt.ylabel('Mean Error') plt.xlabel('Number of features') plt.show() plt.plot(X, mse_NoFeature_val, label='MSE') plt.title('Number of features vs MSE') plt.ylabel('MSE') plt.xlabel('Number of features') plt.show() ''' ################Regression Tree######################### # Find the best way to fill the missing data print('###################Random Forest Regression########################') meanerror_train = [] meanerror_val = [] mean_mse_train = [] mean_mse_val = [] for i in range(50): error_train = [] error_val = [] mse_train = [] mse_val = [] for j in range(10): pre_train, pre_X_train_pick, pre_y_train, pre_y_train_pick = train_test_split(x_train, y_train, test_size=1 / 3) RF = RandomForestRegressor(n_estimators=i + 1, bootstrap=True, random_state=0) RF.fit(pre_X_train_pick, pre_y_train_pick.values.ravel()) predict_train = RF.predict(x_train) predict_val = RF.predict(x_val) acc_train = RF.score(x_train, y_train) acc_val = RF.score(x_val, y_val) error_train = np.append(error_train, 1 - acc_train) error_val = np.append(error_val, 1 - acc_val) mse_train = np.append(mse_train, mean_squared_error(predict_train, y_train)) mse_val = np.append(mse_val, mean_squared_error(predict_val, y_val)) meanerror_train = np.append(meanerror_train, np.mean(error_train)) meanerror_val = np.append(meanerror_val, np.mean(error_val)) mean_mse_train = np.append(mean_mse_train, np.mean(mse_train)) mean_mse_val = np.append(mean_mse_val, np.mean(mse_val)) # print('When fill the missing data with mean:') # print('mean error in training set =', meanerror_train) # print('mean error in validation set =', meanerror_val) # print('MSE in training set =',mean_mse_train) # print('MSE in validation set =',mean_mse_val) print("we got the min MSE value=%.3f and error rate=%.3f in validation set when there are %.0f trees" % ( min(mean_mse_val), min(meanerror_val), np.argmin(mean_mse_val) + 1)) fi = pd.DataFrame({'feature': list(x_train.columns), 'importance': 
RF.feature_importances_}). \ sort_values('importance', ascending=False) #print('importance=', fi) # plot the figure X = np.arange(1, 51) fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1) fig.suptitle('Random Forest Regression') ax1.plot(X, meanerror_train, label='train data') ax1.plot(X, meanerror_val, color='r', label='val data') ax1.set_ylabel('Error Rate') ax1.plot(np.argmin(meanerror_val) + 1, min(meanerror_val), '*', label='minimum', color='b', markersize=15) ax1.legend(loc='best') ax2.plot(X, mean_mse_train, label='train data') ax2.plot(X, mean_mse_val, color='r', label='val data') ax2.set_ylabel('MSE') ax2.set_xlabel('Number of trees') ax2.plot(np.argmin(mean_mse_val) + 1, min(mean_mse_val), '*', label='minimum', color='b', markersize=15) ax2.legend(loc='best') plt.show() ''' #We found that the min MSE happened when n_estimators=25 error_train = [] error_val = [] mse_train = [] mse_val = [] for j in range(10): pre_train, pre_X_train_pick, pre_y_train, pre_y_train_pick = train_test_split(x_train, y_train, test_size=1 / 3) RF = RandomForestRegressor(n_estimators=25, bootstrap=True, max_features=3) RF.fit(pre_X_train_pick, pre_y_train_pick.values.ravel()) predict_train = RF.predict(x_train) predict_val = RF.predict(x_val) acc_train = RF.score(x_train, y_train) acc_val = RF.score(x_val, y_val) error_train = np.append(error_train, 1 - acc_train) error_val = np.append(error_val, 1 - acc_val) mse_train = np.append(mse_train, mean_squared_error(predict_train, y_train)) mse_val = np.append(mse_val, mean_squared_error(predict_val, y_val)) meanerror_train = np.mean(error_train) meanerror_val = np.mean(error_val) mean_mse_train = np.mean(mse_train) mean_mse_val = np.mean(mse_val) print('When fill the missing data with 0 and n_estimator = 25:') print('mean error in training set =', meanerror_train) print('mean error in validation set =', meanerror_val) print('MSE in training set =',mean_mse_train) print('MSE in validation set =',mean_mse_val) ''' #############Linear Regression################ # linear regression (general) print('###################linear regression (general)########################') lin_mse_val = [] lin_error_val = [] lin_mse_train = [] lin_error_train = [] corrmat = x_train1.corr() for k in range(5, 21): cols = abs(corrmat).nlargest(k, 'Life_expectancy')['Life_expectancy'].index related_col = cols.drop(['Life_expectancy']) x_train = x_train1[related_col] x_val = x_val1[related_col] reg = linear_model.LinearRegression() reg.fit(x_train, y_train) reg_predict_val = reg.predict(x_val) reg_predict_train = reg.predict(x_train) reg_acc_val = reg.score(x_val, y_val) reg_acc_train = reg.score(x_train, y_train) lin_mse_val = np.append(lin_mse_val, mean_squared_error(y_val, reg_predict_val)) lin_mse_train = np.append(lin_mse_train, mean_squared_error(y_train, reg_predict_train)) lin_error_val = np.append(lin_error_val, 1 - reg_acc_val) lin_error_train = np.append(lin_error_train, 1 - reg_acc_train) print('Linear regression when featrues=4-19') # print('error rate in validation set =') # print(lin_error_val) # print('MSE in validation set') # print(lin_mse_val) print('We can find the min error rate= %.4f and min MSE= %.4f when there are %.0f features' % ( min(lin_error_val), min(lin_mse_val), np.argmin(lin_mse_val) + 4)) X = np.arange(4, 20, 1) fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1) fig.suptitle('Linear Regression') ax1.plot(X, lin_error_val, color='r', label='Validation set') ax1.plot(X, lin_error_train, label='Training set') ax1.set_ylabel('Error Rate') 
ax1.plot(np.argmin(lin_error_val) + 4, min(lin_error_val), '*', label='minimum', color='b', markersize=15) ax1.legend(loc='best') ax2.plot(X, lin_mse_val, color='r', label='Validation set') ax2.plot(X, lin_mse_train, label='Training set') ax2.set_ylabel('MSE') ax2.set_xlabel('Number of Features') ax2.plot(np.argmin(lin_mse_val) + 4, min(lin_mse_val), '*', label='minimum', color='b', markersize=15) ax2.legend(loc='best') plt.show() ############Ridge Regression############### print('##############Ridge Regression(without CV)###############') X = np.linspace(-3, 1, 30) cols = abs(corrmat).nlargest(19, 'Life_expectancy')[ 'Life_expectancy'].index # Select 7 features that are the most related to life expectancy related_col = cols.drop(['Life_expectancy']) x_train = x_train1[related_col] x_val = x_val1[related_col] rid_mse_val = [] rid_mse_train = [] rid_error_val = [] rid_error_train = [] for i in X: ridge = linear_model.Ridge(alpha=10 ** i, normalize=True) ridge.fit(x_train, y_train) ridge_predict_val = ridge.predict(x_val) ridge_predict_train = ridge.predict(x_train) ridge_acc_val = ridge.score(x_val, y_val) ridge_acc_train = ridge.score(x_train, y_train) rid_mse_val = np.append(rid_mse_val, mean_squared_error(y_val, ridge_predict_val)) rid_mse_train = np.append(rid_mse_train, mean_squared_error(y_train, ridge_predict_train)) rid_error_val = np.append(rid_error_val, 1 - ridge_acc_val) rid_error_train = np.append(rid_error_train, 1 - ridge_acc_train) # print('error rate in validation set =') # print(rid_error_val) # print('MSE in validation set') # print(rid_mse_val) print('We can find the min error rate= %.4f and min MSE= %.4f when alpha= %.6f ' % ( min(rid_error_val), min(rid_mse_val), 10 ** X[np.argmin(rid_mse_val)])) # print('alpha=',X) fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1) fig.suptitle('Ridge Regression (Without CV)') ax1.plot(X, rid_error_train, label='Training set') ax1.plot(X, rid_error_val, color='r', label='Validation set') ax1.set_ylabel('Error Rate') ax1.set_xlabel('log(alpha)') ax1.plot(X[np.argmin(rid_error_val)], min(rid_error_val), '*', label='minimum', color='b', markersize=15) ax1.legend(loc='best') ax2.plot(X, rid_mse_train, label='Training set') ax2.plot(X, rid_mse_val, color='r', label='Validation set') ax2.set_ylabel('MSE') ax2.set_xlabel('log(alpha)') ax2.plot(X[np.argmin(rid_mse_val)], min(rid_mse_val), '*', label='minimum', color='b', markersize=15) ax2.legend(loc='best') plt.show() print('##############Ridge Regression(with CV)###############') X = np.linspace(-3, 1, 30) cols = abs(corrmat).nlargest(19, 'Life_expectancy')[ 'Life_expectancy'].index # Select 7 features that are the most related to life expectancy related_col = cols.drop(['Life_expectancy']).drop(['Status']) #print('in ridge regression') #print('related col=', related_col) xx = x1[related_col] ridCV_err = np.zeros([len(X), 6]) ridCV_mse = np.zeros([len(X), 6]) for i in range(5, 11): # column kfold = KFold(n_splits=i, shuffle=True) for j in range(len(X)): # row ridgeCV = linear_model.RidgeCV(alphas=10 ** X, normalize=True) ridCV_neg_mse = cross_val_score(ridgeCV, xx, y, cv=kfold, scoring='neg_mean_squared_error') ridCV_score = cross_val_score(ridgeCV, xx, y, cv=kfold, scoring='r2') # ridCV_err[j][i-5] = 1- np.mean(ridCV_score) ridCV_mse[j][i - 5] = np.mean(ridCV_neg_mse) * (-1) min_err_index = np.unravel_index(ridCV_err.argmin(), ridCV_err.shape) min_mse_index = np.unravel_index(ridCV_mse.argmin(), ridCV_mse.shape) print('When we use Ridge Regression with cross validation') print('We got the 
min MSE value= %.3f when we applied %.0f fold and alpha = %.5f' % ( ridCV_mse.min(), min_mse_index[1] + 5, 10 ** X[min_mse_index[0]])) print('') bestK_alpha_mse = ridCV_mse[:, min_mse_index[1]].reshape((ridCV_mse[:, min_mse_index[1]].shape[0], 1)) # bestK_alpha_err = ridCV_err[:,min_err_index[1]].reshape((ridCV_err[:,min_err_index[1]].shape[0],1)) fig, ax2 = plt.subplots(nrows=1, ncols=1) fig.suptitle('Ridge Regression (With CV when K= %.0f)' % (min_mse_index[1] + 5)) # ax1.plot(X, bestK_alpha_err) ax1.set_ylabel('Error Rate') ax1.set_xlabel('log(alpha)') # ax1.plot(X[min_err_index[0]],bestK_alpha_err.min(),'*', label='minimum',color='b',markersize=15) ax1.legend(loc='best') ax2.plot(X, bestK_alpha_mse) ax2.set_ylabel('MSE') ax2.set_xlabel('log(alpha)') ax2.plot(X[min_mse_index[0]], bestK_alpha_mse.min(), '*', label='minimum', color='b', markersize=15) ax2.legend(loc='best') # plt.show() ###########Lasso Regression############### print('##############Lasso Regression(without CV)###############') X = np.linspace(-3, 1, 30) x_train = x_train1[related_col] x_val = x_val1[related_col] lasso_mse_val = [] lasso_mse_train = [] lasso_error_val = [] lasso_error_train = [] for i in X: lasso = linear_model.Lasso(alpha=10 ** i, normalize=True) lasso.fit(x_train, y_train) lasso_predict_val = lasso.predict(x_val) lasso_predict_train = lasso.predict(x_train) lasso_acc_val = lasso.score(x_val, y_val) lasso_acc_train = lasso.score(x_train, y_train) lasso_mse_val = np.append(lasso_mse_val, mean_squared_error(y_val, lasso_predict_val)) lasso_mse_train = np.append(lasso_mse_train, mean_squared_error(y_train, lasso_predict_train)) lasso_error_val = np.append(lasso_error_val, 1 - lasso_acc_val) lasso_error_train = np.append(lasso_error_train, 1 - lasso_acc_train) # print('error rate in validation set =') # print(lasso_error_val) # print('MSE in validation set') # print(lasso_mse_val) print('We can find the min error rate= %.4f and min MSE= %.4f when alpha= %.6f ' % ( min(lasso_error_val), min(lasso_mse_val), 10 ** X[np.argmin(lasso_mse_val)])) fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1) fig.suptitle('Lasso Regression (Without CV)') ax1.plot(X, lasso_error_train, label='Training set') ax1.plot(X, lasso_error_val, color='r', label='Validation set') ax1.set_ylabel('Error Rate') ax1.plot(X[np.argmin(lasso_error_val)], min(lasso_error_val), '*', label='minimum', color='b', markersize=15) ax1.legend(loc='lower right') ax2.plot(X, lasso_mse_train, label='Training set') ax2.plot(X, lasso_mse_val, color='r', label='Validation set') ax2.set_ylabel('MSE') ax2.set_xlabel('log(alpha)') ax2.plot(X[np.argmin(lasso_mse_val)], min(lasso_mse_val), '*', label='minimum', color='b', markersize=15) plt.show() print('##############Lasso Regression(with CV)###############') X = np.linspace(-3, 1, 30) xx = x1[related_col] lassoCV_err = np.zeros([len(X), 6]) lassoCV_mse = np.zeros([len(X), 6]) for i in range(5, 11): # column kfold = KFold(n_splits=i, shuffle=True) for j in range(len(X)): # row lassoCV = linear_model.LassoCV(alphas=10 ** X, normalize=True) lassoCV_neg_mse = cross_val_score(lassoCV, xx, y, cv=kfold, scoring='neg_mean_squared_error') lassoCV_score = cross_val_score(lassoCV, xx, y, cv=kfold, scoring='r2') # lassoCV_err[j][i-5] = 1- np.mean(lassoCV_score) lassoCV_mse[j][i - 5] = np.mean(lassoCV_neg_mse) * (-1) # min_err_index=np.unravel_index(lassoCV_err.argmin(), lassoCV_err.shape) min_mse_index = np.unravel_index(lassoCV_mse.argmin(), lassoCV_mse.shape) print('When we use Lasso Regression with cross validation') 
print('We got the min MSE value= %.3f when we applied %.0f fold and alpha = %.5f' % ( lassoCV_mse.min(), min_mse_index[1] + 5, 10 ** X[min_mse_index[0]])) bestK_alpha_mse = lassoCV_mse[:, min_mse_index[1]].reshape((lassoCV_mse[:, min_mse_index[1]].shape[0], 1)) # bestK_alpha_err = lassoCV_err[:,min_err_index[1]].reshape((lassoCV_err[:,min_err_index[1]].shape[0],1)) fig, ax2 = plt.subplots(nrows=1, ncols=1) fig.suptitle('Lasso Regression (With CV when K= %.0f)' % (min_mse_index[1] + 5)) # ax1.plot(X, bestK_alpha_err) ax1.set_ylabel('Error Rate') ax1.set_xlabel('log(alpha)') # ax1.plot(X[min_err_index[0]],bestK_alpha_err.min(),'*', label='minimum',color='b',markersize=15) ax1.legend(loc='best') ax2.plot(X, bestK_alpha_mse) ax2.set_ylabel('MSE') ax2.set_xlabel('log(alpha)') ax2.plot(X[min_mse_index[0]], bestK_alpha_mse.min(), '*', label='minimum', color='b', markersize=15) ax2.legend(loc='best') # plt.show() avg_ytest = np.mean(y_test) one_array = np.ones([len(y_test), 1]) mean_arr = avg_ytest['Life_expectancy'] * one_array baseline_mse = mean_squared_error(y_test, mean_arr) baseline_err = 1 - r2_score(y_test, mean_arr) print('Number of training data (original):', len(y)) print('Number of test data:', len(y_test)) print('Number of training data (New):', len(y_train)) print('Number of validation data:', len(y_val)) print('###################Final model: Random Forest Regression########################') cols = abs(corrmat).nlargest(21, 'Life_expectancy')['Life_expectancy'].index related_col = cols.drop(['Life_expectancy']).drop(['Status']).drop(['Hepatitis_B']).drop(['infant_deaths']).drop( ['GDP']).drop(['Measles']).drop(['Population']).drop(['percentage_expenditure']).drop(['Diphtheria']) error_x = [] error_test = [] mse_x = [] mse_test = [] oob_error = [] for j in range(10): pre_train, pre_X_train_pick, pre_y_train, pre_y_train_pick = train_test_split(x, y, test_size=1 / 3) RF = RandomForestRegressor(n_estimators=38, bootstrap=True, random_state=0, oob_score=True) RF.fit(pre_X_train_pick, pre_y_train_pick.values.ravel()) predict_x = RF.predict(x) predict_test = RF.predict(x_test) acc_x = RF.score(x, y) acc_test = RF.score(x_test, y_test) error_x = np.append(error_x, 1 - acc_x) error_test = np.append(error_test, 1 - acc_test) oob_error = np.append(oob_error, 1 - RF.oob_score_) mse_x = np.append(mse_x, mean_squared_error(predict_x, y)) mse_test = np.append(mse_test, mean_squared_error(predict_test, y_test)) mean_oob_err = np.mean(oob_error) meanerror_x = np.mean(error_x) meanerror_test = np.mean(error_test) mean_mse_x = np.mean(mse_x) mean_mse_test = np.mean(mse_test) var_mse_test = np.var(mse_test) var_err_test = np.var(error_test) print('In our final model (38 trees):') print('In the whole training data') print('Mean MSE =', mean_mse_x) print('Mean Error Rate =', meanerror_x) print('In the test data') print('Mean MSE = %.3f with variance = %.3f ' % (mean_mse_test, var_mse_test)) print('Mean Error Rate = %.5f with variance = %.5f ' % (meanerror_test, var_err_test)) print('Out Of Sample Error = ', mean_oob_err) print('####################Baseline######################') print('The baseline for the test data:') print('MSE = ', baseline_mse) print('Error Rate=', baseline_err) #Draw the 2D plot feature = ['HIV_AIDS', 'Income_composition_of_resources', 'Adult_Mortality', 'Schooling'] for i in feature: for j in range(10): pre_train, pre_X_train_pick, pre_y_train, pre_y_train_pick = train_test_split(x, y, test_size=1 / 3) RF = RandomForestRegressor(n_estimators=38, bootstrap=True, random_state=0) 
RF.fit(pre_X_train_pick[[i]], pre_y_train_pick.values.ravel()) predict_x = RF.predict(x[[i]]) predict_test = RF.predict(x_test[[i]]) X_grid = np.arange(min(x[i]), max(x[i]), 0.001) # reshape for reshaping the data into a len(X_grid)*1 array, # i.e. to make a column out of the X_grid value X_grid = X_grid.reshape((len(X_grid), 1)) # Scatter plot for original data plt.scatter(x[i], y, color='blue', label='training data points') # plot predicted data plt.plot(X_grid, RF.predict(X_grid), color='green', label='regression function') plt.title('Random Forest Regression') plt.xlabel(i) plt.ylabel('Life expectancy') plt.legend(loc='best') plt.show()
pylab.ylabel("Ridge_error_Tst") pylab.figure(4) pylab.plot([0, 0.1, 1, 10, 100, 1000], lasso_error_Tst) pylab.xlabel("lambda") pylab.ylabel("lasso_error_Tst") pylab.show() # running Cross Validation for Ridge and lasso and extracting the best fitted lambda/alpha R_Trn = linear_model.RidgeCV(fit_intercept=False, cv=5) Ridge_Trn = R_Trn.fit(var_Trn, price_Trn) Ridge_Trn.alpha = Ridge_Trn.alpha_ print "The best alpha for Ridge_train is: \n", Ridge_Trn.alpha, "\n" l_Trn = linear_model.LassoCV(fit_intercept=False, cv=5) lasso_Trn = l_Trn.fit(var_Trn, price_Trn) lasso_Trn.alpha = lasso_Trn.alpha_ print "The best alpha for lasso_train is: \n", lasso_Trn.alpha, "\n" # extracting the w using the best fitted lambda for both Ridge and lasso R_Trn = linear_model.Ridge(alpha=Ridge_Trn.alpha, fit_intercept=False) Ridge_Trn = R_Trn.fit(var_Trn, price_Trn) print "The best fitted coefs for Ridge_train is: \n", Ridge_Trn.coef_, "\n" l_Trn = linear_model.Lasso(alpha=lasso_Trn.alpha, fit_intercept=False) lasso_Trn = l_Trn.fit(var_Trn, price_Trn) print "The best fitted coefs for lasso_train is: \n", lasso_Trn.coef_, "\n" # fitting obtained model to test dataset and mesuring errors R_Tst = linear_model.Ridge(alpha=Ridge_Trn.alpha, fit_intercept=False)
def regression(XTrain, betaTrain, XTest): model = linear_model.LassoCV( cv=10, alphas=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]) model.fit(XTrain, betaTrain) Beta = model.predict(XTest) return [i for i in Beta]
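# A hypothetical usage sketch for regression(); the synthetic data below only illustrates
# the call signature (training features, training target, test features).
import numpy as np

rng = np.random.RandomState(0)
XTrain = rng.randn(100, 4)
betaTrain = XTrain[:, 0] - 2 * XTrain[:, 2] + 0.05 * rng.randn(100)
XTest = rng.randn(10, 4)
print(regression(XTrain, betaTrain, XTest))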
def model_selection(): # This is to avoid division by zero while doing np.log10 EPSILON = 1e-5 # ############################################################################# # LassoLarsIC: least angle regression with BIC/AIC criterion model_bic = linear_model.LassoLarsIC(criterion='bic') model_bic.fit(data, label) # alpha_bic_ = model_bic.alpha_ model_aic = linear_model.LassoLarsIC(criterion='aic') model_aic.fit(data, label) # alpha_aic_ = model_aic.alpha_ plt.figure() plot_ic_criterion(model_aic, 'AIC', 'b', EPSILON) plot_ic_criterion(model_bic, 'BIC', 'r', EPSILON) plt.legend() plt.title('Information-criterion for model selection') plt.savefig('information_criterion_model_selection.png') # ############################################################################# # LassoCV: coordinate descent # Compute paths model = linear_model.LassoCV(cv=10).fit(data, label) # Display results m_log_alphas = -np.log10(model.alphas_ + EPSILON) plt.figure() ymin, ymax = 20, 300 plt.plot(m_log_alphas, model.mse_path_, ':') plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_ + EPSILON), linestyle='--', color='k', label='alpha: CV estimate') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: coordinate descent ') plt.axis('tight') plt.ylim(ymin, ymax) plt.savefig('lasso_model_selection.png') # ############################################################################# # LassoLarsCV: least angle regression # Compute paths model = linear_model.LassoLarsCV(cv=10).fit(data, label) # Display results m_log_alphas = -np.log10(model.cv_alphas_ + EPSILON) plt.figure() plt.plot(m_log_alphas, model.mse_path_, ':') plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: Lars') plt.axis('tight') plt.ylim(ymin, ymax) plt.savefig('lasso_Lars_model_selection.png')
), ]) X, y = make_xy_data('./data/merged_data.csv', ['surface_m2', 'piece']) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2) X_tr = features.fit_transform(X_train, None) X_te = features.transform(X_test) ############################################################################### t1 = time.time() model = lm.LassoCV(cv=20, verbose=2).fit(X_tr, y_train) t = time.time() - t1 # Display results m_log_alphas = -np.log10(model.alphas_) plt.figure() plt.plot(m_log_alphas, model.mse_path_, ':') plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha: CV estimate') plt.legend() plt.xlabel('-log(alpha)') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: coordinate descent' ' (train time: %.2fs)' % t)
# features is the cols - 1 (the 1 is the output label) numFeatures = dataframe.shape[1] - 1 print(numFeatures) X = dataframe[features].values Y = dataframe[output_label] # prepare configuration for cross validation test harness num_folds = 10 seed = 7 # prepare models models = [] models.append(('LR', LinearRegression())) models.append(('Ridge', Ridge())) #models.append(('ARDRegression', linear_model.ARDRegression())) models.append(('Lasso', linear_model.Lasso())) models.append(('LassoCV', linear_model.LassoCV())) models.append(('LassoLars', linear_model.LassoLars())) # Decision tree models.append(('Dec tree', tree.DecisionTreeRegressor())) # sanity check models.append(('Dummy', DummyRegressor("median"))) def keras_baseline_model(): # create model model = Sequential() model.add( Dense(128, input_dim=numFeatures, init='normal', activation='relu')) model.add(Dense(1, init='normal', activation="relu")) # Compile model
def _get_cv_model(self, alphas=None, kfold=None, l1_ratio=None, **kargs):
    # Note: l1_ratio is accepted for interface compatibility but is not used here;
    # LassoCV has no l1_ratio parameter (that belongs to ElasticNetCV).
    return linear_model.LassoCV(alphas=alphas, cv=kfold, **kargs)
# load data #path=#pwd#'path to the file/' df=pd.read_csv('ex1data2.txt',header=None) df.columns=['Size','Bedrooms','Price'] # rename columns ## Inputs (X) and labels (y) (Population and profit in restaurent business) y=np.array(df['Price']) X=np.array(df.drop(['Price'],1)) X=X.astype('float64') Sscaler=preprocessing.StandardScaler() Xs=Sscaler.fit_transform(X) # Robust scaler is very helpful in handling outliers #Rscaler=preprocessing.RobustScaler() #Xr=Rscaler.fit_transform(X) # linear regression model Lreg=linear_model.LassoCV(eps=0.08,max_iter=400,tol=1e-5) # Lreg.fit(Xs,y) # #print('------ Multivariate Linear Regression------------') print('Accuracy of Linear Regression Model is ',round(Lreg.score(Xs,y)*100,2)) # # predicting price of house with 1650 sq. feet in size with 3 bed rooms Predict1=Lreg.predict(Sscaler.transform(np.reshape([1650,3],(1,-1)))) print('Predicted price of a house with 1650 sq. feet and 3 bed room is $', round(Predict1[0],2))
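# A minimal alternative sketch: wrapping the scaler and LassoCV in a Pipeline avoids having
# to remember to call Sscaler.transform() before every predict. The same X/y data as above
# is assumed; the hyperparameters are copied from the model above.
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing, linear_model

pipe = make_pipeline(preprocessing.StandardScaler(),
                     linear_model.LassoCV(eps=0.08, max_iter=400, tol=1e-5))
pipe.fit(X, y)
print(pipe.predict(np.reshape([1650, 3], (1, -1))))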
def __fit(self): X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=0.33, random_state=42) self.lr = linear_model.LassoCV() self.lr.fit(X_train, y_train)
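# A small follow-on sketch for the same (unshown) class: keeping the held-out split and
# reporting its R^2, which the __fit method above does not do. The method name is illustrative.
def __fit_and_score(self):
    X_train, X_test, y_train, y_test = train_test_split(self.data, self.target,
                                                        test_size=0.33, random_state=42)
    self.lr = linear_model.LassoCV()
    self.lr.fit(X_train, y_train)
    return self.lr.score(X_test, y_test)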
fig1 = plt.figure(figsize=(12, 8))
ax1 = fig1.add_subplot(111)
ax1.set_xscale('log')
ax1.plot(penalisations, coeffs)
plt.xlabel("Penalisations")
plt.ylabel("thetas")

# Question 5
# Determination of the penalisation factor with Cross Validation
#lasscv = linear_model.LassoCV(cv=20)
lassCV = linear_model.LassoCV(alphas=penalisations, fit_intercept=False, normalize=False)
lassCV.fit(X, y)

# Question 5b
print "Lasso with CV : "
print "Penalisation found by CV : " + str(lassCV.alpha_)
# Determining the smallest penalisation factor so that all coefficients equal 0.

# Question 5c
x_test = np.array([6, 0.3, 0.2, 6, 0.053, 25, 149, 0.9934, 3.24, 0.35, 10])
print "Prediction : "
print lassCV.predict(x_test)
lr.score(XDF.values, y) from sklearn import linear_model as lm r = lm.Ridge().fit(XDF.values, y) get_ipython().run_line_magic('pinfo', 'r.score') r.score(XDF.values, y) r = lm.Ridge(alpha=0.5).fit(XDF.values, y) r.score(XDF.values, y) get_ipython().run_line_magic('pinfo', 'lm.RidgeCV') get_ipython().run_line_magic('pinfo', 'lm.RidgeCV') rcv = lm.RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000], cv=5) rcv.fit(XDF.values, y) rcv rcv.score(XDF.values, y) get_ipython().set_next_input('lasso = lm.LassoCV') get_ipython().run_line_magic('pinfo', 'lm.LassoCV') lasso = lm.LassoCV(n_jobs=-1, cv=5) lasso.fit(XDF.values, y) lasso.score(XDF.values, y) lasso.coef_ rcv.coef_ ecv = lm.ElasticNetCV() ecv.fit(XDF.values, y) ecv.score(XDF.values, y) from sklearn.feature_selection import RFECV RFECV.head() RFECV lr = lm.LinearRegression() rfecv = RFECV() rfecv = RFECV(lr, cv=5, n_jobs=-1) rfecv.fit(XDF.values, y) rfecv.grid_scores_
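# For reference, a minimal non-interactive sketch of the RFECV call explored in the session
# above; RFECV needs an estimator exposing coef_ or feature_importances_, so the plain
# LinearRegression works (XDF and y are the objects already in the session).
from sklearn.feature_selection import RFECV
from sklearn import linear_model as lm

lr = lm.LinearRegression()
rfecv = RFECV(lr, cv=5, n_jobs=-1)
rfecv.fit(XDF.values, y)
print(rfecv.support_)  # boolean mask of the selected columns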
df = pd.read_csv("input_data.csv") # pandasでcsvを読込 df = pd.DataFrame(df) # DataFrame形式にする df = df.loc[df["play_time"] >= df["play_time"].median()] # プレイタイム上位半分のデータを選択 data_y = df["probability_6man"] # DataFrameから,yとして使用する列を抽出 drop_idx = ["player_id", "frag_starting", "play_time", "frag_6man", "period", "position", "probability_6man"] # xとして使用しない列を選択 data_x = df.drop(drop_idx, axis=1) # DataFrameから,xとして使用する列を抽出 data_y = np.array(data_y, dtype=float) # numpyのarray形式にすることで演算可能になる data_x = np.array(data_x, dtype=float) data_x = (data_x - data_x.mean(axis=0)) / data_x.std(axis=0) # xを標準化 # ハイパーパラメータを交差確認法により推定 # sklearnのlinear_modelにあるLassoCV(Lassoの交差確認法のひとつ。LassoCVの他にもう一つやり方があったんだけど関数名が出てこない……)を、lasso_cvと定義 lasso_cv = linear_model.LassoCV() lasso_cv.fit(data_x, data_y) # data_xとdata_yをlasso_cvにかける # alphaとして交差確認法で算出した値を使用 # lasso = linear_model.Lasso(alpha=lasso_cv.alpha_) lasso = linear_model.Lasso(alpha=0) lasso.fit(data_x, data_y) c = np.array(lasso.coef_) # cは係数のベクトル # 結果の表示 print("交差確認法により推定された適切なハイパーパラメータ alpha :") print(lasso_cv.alpha_) # 交差確認法の結果を表示,ハイパーパラメータalphaを返す print() # 1行空ける print("LASSO により推定されたパラメータ:") print(c) print() # 1行空ける
# ------------------------------------------------- # <editor-fold desc="COMMON VARIABLES"> smoothing_time = 0.2 # The smoothing window for both the firing rates and the distance to poke time series smoothing_frames = int(smoothing_time / 0.00833) fr_final_smoothing_time = 1 # The final smoothing window of the firing rates fr_extra_smoothing_frames = int(fr_final_smoothing_time / smoothing_time) leave_percentage_out = 0.005 model = pipeline.make_pipeline(PolynomialFeatures(2), linear_model.LinearRegression()) model_npb_dtp = pipeline.make_pipeline( PolynomialFeatures(2), linear_model.LassoCV(cv=None, fit_intercept=True)) common_pb_npb_dtp_neuron_indices = np.intersect1d( correlated_neuron_indices['pb_dtp'], correlated_neuron_indices['npb_dtp']) correlated_neuron_indices_unique_pb_dtp = np.delete( correlated_neuron_indices['pb_dtp'], np.argwhere( np.isin(correlated_neuron_indices['pb_dtp'], common_pb_npb_dtp_neuron_indices))) correlated_neuron_indices_unique_npb_dtp = np.delete( correlated_neuron_indices['npb_dtp'], np.argwhere( np.isin(correlated_neuron_indices['npb_dtp'], common_pb_npb_dtp_neuron_indices)))
The original derivation is not repeated here, because LASSO regression differs only slightly
from ridge regression: the main difference is the regularization term. In terms of the cost
function:
Ridge: 1/(2*m) * [sum(i=1->m)(hθ(xi) - yi)^2] + λ * sum(j=0->n) θj^2
LASSO: 1/(2*m) * [sum(i=1->m)(hθ(xi) - yi)^2] + λ * sum(j=0->n) |θj|
LASSO regression is more interpretable: it drives the coefficients of features that are
linearly correlated with other features to zero. The detailed mathematics is omitted.
"""
import numpy as np
from sklearn import linear_model

# load the data
data = np.genfromtxt("longley.csv", delimiter=",")
x_data = data[1:, 2:]
y_data = data[1:, 1]

# train LASSO
model = linear_model.LassoCV()
model.fit(x_data, y_data)

# the CV-selected regularization strength
print("Lasso alpha: ", model.alpha_)
# regression coefficients
# coefficients printed as 0 here indicate multicollinearity
print("Coefficients: ", model.coef_)

# make a prediction
predict = model.predict(x_data[-2, np.newaxis])
print(predict)
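# The zero coefficients mentioned above can be cross-checked against the predictor
# correlations; a small sketch on the same x_data (Longley is a classically collinear dataset):
import numpy as np

corr = np.corrcoef(x_data, rowvar=False)
print(np.round(corr, 2))  # off-diagonal entries near 1 signal multicollinearity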
X_test = np.array(X_test) Y_test = [] for y in raw_test: Y_test.append(y['Value']) Y_test = np.array(Y_test, np.double) # train cvParams = [ 0.000003, 0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3 ] clf1 = linear_model.RidgeCV(alphas=cvParams, normalize=True, scoring='mean_squared_error') clf1.fit(X, Y) clf2 = linear_model.LassoCV(alphas=cvParams, normalize=True, max_iter=2000) clf2.fit(X, Y) clf3 = linear_model.ElasticNetCV(max_iter=2000, eps=0.0001) clf3.fit(X, Y) print 'Ridge:', clf1.coef_, clf1.intercept_, clf1.alpha_ print 'Lasso:', clf2.coef_, clf2.intercept_, clf2.alpha_, np.min( clf2.mse_path_) print 'ElasticNet:', clf3.coef_, clf3.intercept_, clf3.alpha_, np.min( clf3.mse_path_) # test print 'Ridge:', np.mean((clf1.predict(X_test) - Y_test)**2) print 'Lasso:', np.mean((clf2.predict(X_test) - Y_test)**2) print 'ElasticNet:', np.mean((clf3.predict(X_test) - Y_test)**2) # plot
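# A sketch of the Ridge fit above adapted to newer scikit-learn, where the scorer string is
# 'neg_mean_squared_error' and the normalize keyword has been removed (scale X beforehand,
# e.g. with StandardScaler); X, Y, and cvParams are the objects built above.
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler

clf1_new = linear_model.RidgeCV(alphas=cvParams, scoring='neg_mean_squared_error')
# clf1_new.fit(StandardScaler().fit_transform(X), Y)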