def __init__(self):
    # Algorithm name
    self._name = 'larscv'
    # Base path
    self._f_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
    # Suppress warning messages
    warnings.filterwarnings('ignore')
    # Load the original data
    data = pd.read_csv(self._f_path + "/regression/resource/regression_sample.csv", sep=",", encoding="utf-8")
    # Boolean masks for splitting into training and test data
    self._x = (data["year"] <= 2017)
    self._y = (data["year"] >= 2018)
    # Training data split
    self._x_train, self._y_train = self.preprocessing(data[self._x])
    # Test data split
    self._x_test, self._y_test = self.preprocessing(data[self._y])
    # Declare the model
    self._model = LarsCV(normalize=False)
    # Train the model
    self._model.fit(self._x_train, self._y_train)
def train_regression_model(X, y, model_type='elastic cv', cv=3, extra_params=None):
    """Wrapper function to train various regression models with X, y input;
    extra_params can be passed to override any default parameters."""
    extra_params = extra_params or {}  # avoid a mutable default argument
    model_type = model_type.lower()
    if model_type == 'linear':
        model = LinearRegression(fit_intercept=True)
    elif model_type == 'elastic cv':
        model = ElasticNetCV(cv=cv)
    elif model_type == 'omp cv':
        model = OrthogonalMatchingPursuitCV(cv=cv)
    elif model_type == 'lars cv':
        model = LarsCV(cv=cv)
    elif model_type == 'ridge cv':
        model = RidgeCV(cv=cv)
    elif model_type == 'full lightgbm':
        # Train_Light_GBM fits internally, so return it directly
        model = Train_Light_GBM(X, y, int_cv=cv, regression=True, **extra_params)
        return model
    else:
        raise ValueError(f'Unknown model_type: {model_type}')
    model.fit(X, y)
    return model
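# A minimal usage sketch (an assumption, not part of the original source): it
# presumes the wrapper above and its scikit-learn estimators are in scope, and
# uses synthetic data purely for illustration.
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=300, n_features=8, noise=2.0, random_state=0)
lars_model = train_regression_model(X_demo, y_demo, model_type='lars cv', cv=5)
print(lars_model.alpha_)  # regularization strength chosen by cross-validation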
def larscv():
    X, y = make_regression(n_samples=200, n_features=10, noise=4.0, random_state=0)
    reg = LarsCV(cv=2).fit(X, y)
    print(reg.score(X, y))
    print(X[:, 0].shape, y.shape)
    plt.plot(X[:, 0], y)
    plt.scatter(X[:, 0], y)
    plt.show()
def test_model_lars_cv(self):
    model, X = fit_regression_model(LarsCV())
    model_onnx = convert_sklearn(
        model, "lars",
        [("input", FloatTensorType([None, X.shape[1]]))])
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(X, model, model_onnx, basename="SklearnLarsCV-Dec4")
def get_model_by_name(model_name):
    return {
        'Linear Regression': LinearRegression(),
        'Lars CV': LarsCV(cv=10),
        'Lasso CV': LassoCV(cv=10),
        'Ridge CV': RidgeCV(cv=10),
        'Elastic Net CV': ElasticNetCV(cv=10),
        'Orthogonal Matching Pursuit CV': OrthogonalMatchingPursuitCV(cv=10),
        'Decision Tree Regressor': DecisionTreeRegressor(max_depth=3),
    }[model_name]
def Lars_regression(self, X_train, y_train, X_test, y_test):
    my_cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
    best_model = LarsCV(cv=my_cv, n_jobs=-1)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return best_model, mse, mae, r2
def ResultsLARS(DataSet, Y):
    X_train, X_test, y_train, y_test = train_test_split(DataSet, Y, train_size=0.75)
    LAR_cv = LarsCV(normalize=True)
    LAR_model = LAR_cv.fit(X_train, y_train)
    LAR_prediction = LAR_model.predict(X_test)
    LAR_mae = np.mean(np.abs(y_test - LAR_prediction))
    LAR_coefs = dict(
        zip(['Intercept'] + DataSet.columns.tolist(),
            np.round(
                np.concatenate((LAR_model.intercept_, LAR_model.coef_),
                               axis=None), 3)))
    print('Least Angle Regression MAE: {}'.format(np.round(LAR_mae, 3)))
    print('Least Angle Regression coefficients: {}'.format(LAR_coefs))
    del LAR_coefs['Intercept']
    DictionaryPlot(LAR_coefs, 'Least Angle Regression')
def _larscv(*, train, test, x_predict=None, metrics, fit_intercept=True,
            verbose=False, max_iter=500, normalize=True, precompute='auto',
            cv=None, max_n_alphas=1000, n_jobs=None,
            eps=2.220446049250313e-16, copy_X=True):
    """For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LarsCV.html#sklearn.linear_model.LarsCV
    """
    model = LarsCV(fit_intercept=fit_intercept, verbose=verbose, max_iter=max_iter,
                   normalize=normalize, precompute=precompute, cv=cv,
                   max_n_alphas=max_n_alphas, n_jobs=n_jobs, eps=eps, copy_X=copy_X)
    model.fit(train[0], train[1])
    model_name = 'LarsCV'
    y_hat = model.predict(test[0])
    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    elif metrics == 'mae':
        accuracy = _mae(test[1], y_hat)
    else:
        # previously, an unknown metrics value left `accuracy` undefined
        raise ValueError(f'Unknown metrics: {metrics}')
    if x_predict is None:
        return (model_name, accuracy, None)
    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
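# Side note (an assumption about scikit-learn versioning, not from the snippet
# above): the `normalize` argument forwarded here was deprecated in scikit-learn
# 1.0 and removed in 1.2. A minimal sketch of the usual replacement, scaling
# explicitly in a pipeline:
from sklearn.datasets import make_regression
from sklearn.linear_model import LarsCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_demo, y_demo = make_regression(n_samples=200, n_features=10, noise=4.0, random_state=0)
pipe = make_pipeline(StandardScaler(), LarsCV(cv=5))
pipe.fit(X_demo, y_demo)
print(pipe[-1].alpha_)  # alpha chosen by the cross-validated LARS step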
def train_regression_model(X, y, model_type='elastic cv', cv=3):
    # default changed from 'elastic', which matched no branch and left `model` undefined
    if model_type == 'linear':
        model = LinearRegression(fit_intercept=True)
    elif model_type == 'elastic cv':
        model = ElasticNetCV(cv=cv)
    elif model_type == 'omp cv':
        model = OrthogonalMatchingPursuitCV(cv=cv)
    elif model_type == 'lars cv':
        model = LarsCV(cv=cv)
    elif model_type == 'ridge cv':
        model = RidgeCV(cv=cv)
    elif model_type == 'simple xgboost':
        model = XGBRegressor()
    elif model_type == 'simple lightgbm':
        model = LGBMRegressor()
    elif model_type == 'full lightgbm':
        # train_light_gbm_regressor fits internally, so return it directly
        model = train_light_gbm_regressor(X, y, cv, n_params=10, test_size=.2)
        return model
    model.fit(X, y)
    return model
def fit_linear_model(basis_matrix, train_vals, solver_type, **kwargs):
    solvers = {
        'lasso_lars': LassoLarsCV(cv=kwargs['cv']).fit,
        'lasso': LassoCV(cv=kwargs['cv']).fit,
        'lars': LarsCV(cv=kwargs['cv']).fit,
        'omp': OrthogonalMatchingPursuitCV(cv=kwargs['cv'], verbose=5).fit
    }
    assert train_vals.ndim == 2
    if solver_type in solvers:
        fit = solvers[solver_type]
        res = fit(basis_matrix, train_vals[:, 0])
    else:
        msg = f'Solver type {solver_type} not supported\n'
        msg += 'Supported solvers are:\n'
        for key in solvers.keys():
            msg += f'\t{key}\n'
        raise Exception(msg)
    cv_score = res.score(basis_matrix, train_vals[:, 0])
    coef = res.coef_[:, np.newaxis]
    coef[0] = res.intercept_  # basis column 0 is assumed to be the constant term
    return coef, cv_score
def check_w(w=[12, 24, 36, 48, 60]):
    '''
    Robustness check for w_min; saves the prediction results (Avew window)
    and the out-of-sample R-squared.

    Parameters
    ----------
    w: possible w_min values (list)
    '''
    for w_min in w:
        # linear ML prediction
        pre1 = linear_prediction(RidgeCV(), w_min=w_min, window_type="Avew")
        pre2 = linear_prediction(LassoCV(cv=5), w_min=w_min, window_type="Avew")
        pre3 = linear_prediction(ElasticNetCV(cv=5), w_min=w_min, window_type="Avew")
        pre4 = linear_prediction(LarsCV(cv=5), w_min=w_min, window_type="Avew")
        pre5 = linear_prediction(OrthogonalMatchingPursuitCV(cv=5), w_min=w_min, window_type="Avew")
        pre6 = MR(w_min=w_min, window_type="Avew")
        all_pre = pd.DataFrame({
            'Kitchen Sink': pre6,
            "ridge": pre1,
            "lasso": pre2,
            "elasticnet": pre3,
            "lars": pre4,
            "OMP": pre5,
        })
        all_pre['FC'] = all_pre.iloc[:, 1:].mean(axis=1)
        # save the prediction results
        # (path segments kept verbatim; they mean "robustness check" / "prediction results")
        all_pre.to_csv(
            os.path.join(path, "稳健性检验", "w_min", "预测结果",
                         "w_min=" + str(w_min) + ".csv"))
        # R2 test
        R2_test(all_pre, name="w_min=" + str(w_min) + ".csv")
        # then you need to move the result on your own
def ridge_regression(self, **kwargs):
    if self._regression_type == 'lasso':
        self.ridgereg = LassoCV(max_iter=50000)
        # self.ridgereg = LassoCV(max_iter=1e5, cv=10)
        self.ridgereg.fit(self.data, self.Y)
    elif self._regression_type == 'ard':
        self.ridgereg = ARDRegression()
        self.ridgereg.fit(self.data, self.Y)
    elif self._regression_type == 'elastic':
        self.ridgereg = ElasticNetCV(cv=10)
        self.ridgereg.fit(self.data, self.Y)
    elif self._regression_type == 'lars':
        self.ridgereg = LarsCV(cv=10)
        self.ridgereg.fit(self.data, self.Y)
    elif self._regression_type == 'lassolars':
        self.ridgereg = LassoLarsCV(cv=5)
        self.ridgereg.fit(self.data, self.Y)
    elif self._regression_type == 'ordinary':
        self.ridgereg = LinearRegression()
        self.ridgereg.fit(self.data, self.Y)
    elif self._regression_type == 'ridge':
        self.ridgereg = RidgeCV()
        self.ridgereg.fit(self.data, self.Y)
           data['y'])  # data.iloc[:, 0:13]
print(model.coef_)       # coefficients of each feature
print(model.intercept_)

# In[4]:
from sklearn.linear_model import Lars  # least angle regression

model1 = Lars(n_nonzero_coefs=7)
model1.fit(data.iloc[:, 0:13], data['y'])
print(model1.coef_)      # coefficients of each feature

# In[5]:
# Find the most suitable alpha
from sklearn.linear_model import LarsCV  # cross-validated least angle regression model

model1 = LarsCV()
model1.fit(data.iloc[:, 0:13], data['y'])
print(model1.coef_)      # coefficients of each feature
print(model1.alpha_)

# In[6]:
from sklearn.linear_model import LassoCV  # cross-validated Lasso regression model

model1 = LassoCV()
model1.fit(data.iloc[:, 0:13], data['y'])
print(model1.coef_)      # coefficients of each feature
print(model1.alpha_)

# In[8]:
from sklearn.linear_model import Lasso  # AdaptiveLasso could not be found
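# A short self-contained sketch (synthetic data, an illustration rather than the
# notebook's dataset) comparing the alphas chosen by the two cross-validated
# models used above:
from sklearn.datasets import make_regression
from sklearn.linear_model import LarsCV, LassoCV

X_demo, y_demo = make_regression(n_samples=200, n_features=13, noise=4.0, random_state=0)
print(LarsCV(cv=5).fit(X_demo, y_demo).alpha_)
print(LassoCV(cv=5, random_state=0).fit(X_demo, y_demo).alpha_)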
        'PLSRegression 2D',
        make_pipeline(
            StandardScaler(), PCA(n_components=0.95),
            PolynomialFeatures(2, interaction_only=True, include_bias=True),
            PLSRegression())))
models.append(
    ModelClass('LinearRegressor',
               make_pipeline(StandardScaler(), LinearRegression())))
models.append(
    ModelClass('HuberRegressor',
               make_pipeline(StandardScaler(), HuberRegressor())))
models.append(
    ModelClass('Lars', make_pipeline(LarsCV(cv=cv_inner, normalize=True))))
models.append(
    ModelClass('LassoLarsCV', LassoLarsCV(cv=cv_inner, normalize=True)))
models.append(ModelClass('LassoLarsIC', make_pipeline(LassoLarsIC())))
models.append(
    ModelClass('BayesianRidge',
               make_pipeline(StandardScaler(), BayesianRidge())))
models.append(
    ModelClass(
        'ElasticNet kBest std',
        make_pipeline(
            StandardScaler(),
            SelectKBest(mutual_info_regression, k=6),
        np.round(
            np.concatenate(
                (elastic_net_model.intercept_, elastic_net_model.coef_),
                axis=None), 3)))
print('Elastic Net MSE: {}'.format(np.round(elastic_net_mae, 3)))
print('Elastic Net coefficients:', elastic_net_coefs)

##############################################################################
###################### LEAST ANGLE REGRESSION ################################
##############################################################################
print(
    "##############################################################################"
)
print("LEAST ANGLE REGRESSION")
LAR_cv = LarsCV(normalize=True)
LAR_model = LAR_cv.fit(X_train, y_train)
LAR_prediction = LAR_model.predict(X_test)
LAR_mse = mean_squared_error(y_test, LAR_prediction)  # renamed from LAR_mae: this is the MSE
LAR_coefs = dict(
    zip(['Intercept'] + data.columns.tolist()[:-1],
        np.round(
            np.concatenate((LAR_model.intercept_, LAR_model.coef_),
                           axis=None), 3)))
print('Least Angle Regression MSE: {}'.format(np.round(LAR_mse, 3)))
print('Least Angle Regression coefficients:', LAR_coefs)

##############################################################################
################## PRINCIPAL COMPONENTS REGRESSION ###########################
##############################################################################
classifiers = [
    SVC(kernel="rbf", probability=True),
    SVC(kernel='linear', probability=True),
    SVC(kernel='sigmoid', probability=True),
    SVC(kernel='poly', probability=True, degree=3),
    SVC(kernel='poly', probability=True, degree=4),
    SVC(kernel='poly', probability=True, degree=5),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    # note: several entries below (ElasticNetCV, LarsCV, LassoCV, LassoLarsCV,
    # the MultiTask variants, OrthogonalMatchingPursuitCV) are regressors,
    # not classifiers, despite the list name
    ElasticNetCV(max_iter=10000),
    LarsCV(),
    LassoCV(max_iter=10000),
    LassoLarsCV(),
    LogisticRegressionCV(scoring=multi_class_log_loss),
    MultiTaskElasticNetCV(),
    MultiTaskLassoCV(),
    OrthogonalMatchingPursuitCV(),
    RidgeClassifierCV()
]
algorithm = 17
if len(sys.argv) > 1:
    algorithm = int(sys.argv[1])
name = names[algorithm]
clf = classifiers[algorithm]
output_file_name = output_file_names[algorithm] + file_identifier
def default_model_create(self, x, y):
    self.model = LarsCV(cv=self.cv)
    return True
def __init__(self, y, x, saz=False, work_days=False, country=None, transf=None):
    index = x.index
    k = x.shape[1]

    if freq_df(x) == 'M':
        self.frequency = 12
    elif freq_df(x) == 'Q':
        self.frequency = 4

    if saz:
        # Include the monthly dummies for the regressions
        d_months = pd.get_dummies(index.month, prefix='D_M', prefix_sep='')
        d_months.index = index
        x = pd.concat([x, d_months], axis=1)

    if work_days == True:
        # Include the working-days variable
        wd = workdays_series(country, transf=transf)
        x = pd.concat([x, wd], axis=1, join='inner')

    fit_int = False
    norm_X = False
    parallel = 1
    cv_method = TimeSeriesSplit(self.frequency)
    el_net_l1_ratio = [.1, .5, .7, .9, .95, .99, 1]
    bag_n_estimators = [5, 10, 20, 50]
    adab_n_estimators = [10, 50, 100]
    adab_learn_rate = [0.1, 0.2, 0.5]
    arima_n_models = 50

    self.y = y
    self.x = x
    self.ar_elem = self.__check_ar_elem()

    models_par = {}
    models_npar = {}

    models_par['mlp_reg'] = GridSearchCV(
        make_pipeline(StandardScaler(), MLPRegressor()),
        param_grid={
            'mlpregressor__hidden_layer_sizes':
            [(round(.66 * k), round(.33 * k)),
             (round(.75 * k), round(.50 * k), round(.25 * k)),
             (round(.80 * k), round(.60 * k), round(.40 * k), round(.20 * k))]
        },
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['gp_reg'] = GridSearchCV(
        make_pipeline(StandardScaler(),
                      GaussianProcessRegressor(normalize_y=norm_X)),
        param_grid={
            'gaussianprocessregressor__kernel': [
                WhiteKernel(), ConstantKernel(), RBF(), Matern(),
                RationalQuadratic(), DotProduct()
            ]
        },
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['ridgecv'] = make_pipeline(
        StandardScaler(),
        RidgeCV(fit_intercept=fit_int, normalize=norm_X, cv=cv_method))
    models_par['bay_rid'] = make_pipeline(
        StandardScaler(),
        BayesianRidge(fit_intercept=fit_int, normalize=norm_X))
    models_par['lassocv'] = make_pipeline(
        StandardScaler(),
        LassoCV(fit_intercept=fit_int, normalize=norm_X,
                n_jobs=parallel, cv=cv_method))
    models_par['laslrscv'] = make_pipeline(
        StandardScaler(),
        LassoLarsCV(fit_intercept=fit_int, normalize=norm_X,
                    n_jobs=parallel, cv=cv_method))
    models_par['larscv'] = make_pipeline(
        StandardScaler(),
        LarsCV(fit_intercept=fit_int, normalize=norm_X,
               n_jobs=parallel, cv=cv_method))
    models_par['elasnet'] = make_pipeline(
        StandardScaler(),
        ElasticNetCV(l1_ratio=el_net_l1_ratio, fit_intercept=fit_int,
                     normalize=norm_X, n_jobs=parallel, cv=cv_method))
    models_par['hub_reg'] = GridSearchCV(
        make_pipeline(StandardScaler(),
                      HuberRegressor(fit_intercept=fit_int)),
        param_grid={
            'huberregressor__epsilon': [1.1, 1.2, 1.35],
            'huberregressor__alpha': [0.0001, 0.01, 0.1, 0.3]
        },
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['ort_purs'] = make_pipeline(
        StandardScaler(),
        OrthogonalMatchingPursuitCV(fit_intercept=fit_int, normalize=norm_X,
                                    n_jobs=parallel, cv=cv_method))
    models_par['ard_reg'] = make_pipeline(
        StandardScaler(),
        ARDRegression(fit_intercept=fit_int, normalize=norm_X))
    models_par['sgd_reg'] = GridSearchCV(
        make_pipeline(StandardScaler(),
                      SGDRegressor(fit_intercept=fit_int, shuffle=False)),
        param_grid={
            'sgdregressor__l1_ratio': el_net_l1_ratio,
            'sgdregressor__loss':
            ['squared_loss', 'huber', 'epsilon_insensitive']
        },
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['pas_agg'] = make_pipeline(
        StandardScaler(),
        PassiveAggressiveRegressor(fit_intercept=fit_int, shuffle=False))
    models_par['lin_all'] = make_pipeline(
        StandardScaler(),
        LinearRegression(fit_intercept=fit_int, normalize=norm_X,
                         n_jobs=parallel))
    models_par['ols1'] = make_pipeline(
        StandardScaler(),
        SelectFromModel(DecisionTreeRegressor(), prefit=False),
        LinearRegression())
    models_par['ols2'] = make_pipeline(
        StandardScaler(),
        SelectFromModel(ElasticNetCV(l1_ratio=el_net_l1_ratio,
                                     fit_intercept=fit_int,
                                     normalize=norm_X,
                                     n_jobs=parallel,
                                     cv=cv_method),
                        prefit=False),
        LinearRegression())
    models_par['ols3'] = make_pipeline(
        StandardScaler(),
        SelectFromModel(LarsCV(fit_intercept=fit_int,
                               normalize=norm_X,
                               n_jobs=parallel,
                               cv=cv_method),
                        prefit=False),
        LinearRegression())
    models_par['ols4'] = make_pipeline(
        StandardScaler(),
        SelectFromModel(BayesianRidge(fit_intercept=fit_int,
                                      normalize=norm_X),
                        prefit=False),
        LinearRegression())
    models_par['ols5'] = GridSearchCV(
        make_pipeline(StandardScaler(), PCA(), LinearRegression()),
        param_grid={'pca__n_components': [1, 2, 3, 4, 5]},
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['d_tree'] = make_pipeline(StandardScaler(),
                                         DecisionTreeRegressor())
    models_par['rand_for'] = GridSearchCV(
        make_pipeline(StandardScaler(), RandomForestRegressor()),
        param_grid={'randomforestregressor__n_estimators': [10, 50, 100]},
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['bag1'] = GridSearchCV(
        make_pipeline(StandardScaler(),
                      BaggingRegressor(max_samples=0.5, max_features=0.5)),
        param_grid={'baggingregressor__n_estimators': bag_n_estimators},
        cv=cv_method,
        refit=True)
    models_par['bag2'] = GridSearchCV(
        make_pipeline(
            StandardScaler(),
            BaggingRegressor(LinearRegression(fit_intercept=fit_int,
                                              normalize=norm_X,
                                              n_jobs=parallel),
                             max_samples=0.5,
                             max_features=0.5)),
        param_grid={'baggingregressor__n_estimators': bag_n_estimators},
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['bag3'] = GridSearchCV(
        make_pipeline(
            StandardScaler(),
            BaggingRegressor(PassiveAggressiveRegressor(fit_intercept=fit_int,
                                                        shuffle=False),
                             max_samples=0.5,
                             max_features=0.5)),
        param_grid={'baggingregressor__n_estimators': bag_n_estimators},
        cv=cv_method,
        refit=True)
    models_par['bag4'] = GridSearchCV(
        make_pipeline(
            StandardScaler(),
            BaggingRegressor(ARDRegression(fit_intercept=fit_int,
                                           normalize=norm_X),
                             max_samples=0.5,
                             max_features=0.5)),
        param_grid={'baggingregressor__n_estimators': bag_n_estimators},
        cv=cv_method,
        refit=True)
    models_par['bag5'] = GridSearchCV(
        make_pipeline(
            StandardScaler(),
            BaggingRegressor(OrthogonalMatchingPursuit(fit_intercept=fit_int,
                                                       normalize=norm_X),
                             max_samples=0.5,
                             max_features=0.5)),
        param_grid={'baggingregressor__n_estimators': bag_n_estimators},
        cv=cv_method,
        refit=True)
    models_par['ada1'] = GridSearchCV(
        make_pipeline(StandardScaler(), AdaBoostRegressor()),
        param_grid={
            'adaboostregressor__n_estimators': adab_n_estimators,
            'adaboostregressor__learning_rate': adab_learn_rate
        },
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['ada2'] = GridSearchCV(
        make_pipeline(
            StandardScaler(),
            AdaBoostRegressor(
                LinearRegression(fit_intercept=fit_int,
                                 normalize=norm_X,
                                 n_jobs=parallel))),
        param_grid={
            'adaboostregressor__n_estimators': adab_n_estimators,
            'adaboostregressor__learning_rate': adab_learn_rate
        },
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['ada3'] = GridSearchCV(
        make_pipeline(
            StandardScaler(),
            AdaBoostRegressor(
                PassiveAggressiveRegressor(fit_intercept=fit_int,
                                           shuffle=False))),
        param_grid={
            'adaboostregressor__n_estimators': adab_n_estimators,
            'adaboostregressor__learning_rate': adab_learn_rate
        },
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['ada4'] = GridSearchCV(
        make_pipeline(
            StandardScaler(),
            AdaBoostRegressor(
                ARDRegression(fit_intercept=fit_int, normalize=norm_X))),
        param_grid={
            'adaboostregressor__n_estimators': adab_n_estimators,
            'adaboostregressor__learning_rate': adab_learn_rate
        },
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['ada5'] = GridSearchCV(
        make_pipeline(
            StandardScaler(),
            AdaBoostRegressor(
                OrthogonalMatchingPursuit(fit_intercept=fit_int,
                                          normalize=norm_X))),
        param_grid={
            'adaboostregressor__n_estimators': adab_n_estimators,
            'adaboostregressor__learning_rate': adab_learn_rate
        },
        cv=cv_method,
        refit=True,
        n_jobs=parallel)
    models_par['g_boost'] = GridSearchCV(
        make_pipeline(StandardScaler(), GradientBoostingRegressor()),
        param_grid={
            'gradientboostingregressor__n_estimators': adab_n_estimators,
            'gradientboostingregressor__learning_rate': adab_learn_rate
        },
        cv=cv_method,
        refit=True,
        n_jobs=parallel)

    norm_lin = pipe_transf([('std', StandardScaler()),
                            ('regression', LinearRegression())])
    models_par['rfecv'] = RFECV(estimator=norm_lin,
                                step=1,
                                cv=cv_method,
                                scoring='r2')

    models_npar['arima1'] = make_pipeline(
        DropFeatures(self.ar_elem), StandardScaler(),
        SelectFromModel(DecisionTreeRegressor(), prefit=False),
        S_avg_arima(freq=self.frequency, n_models=arima_n_models))
    models_npar['arima2'] = make_pipeline(
        DropFeatures(self.ar_elem), StandardScaler(),
        SelectFromModel(ElasticNetCV(l1_ratio=el_net_l1_ratio,
                                     n_jobs=parallel,
                                     fit_intercept=fit_int,
                                     normalize=norm_X,
                                     cv=cv_method),
                        prefit=False),
        S_avg_arima(freq=self.frequency, n_models=arima_n_models))
    models_npar['arima3'] = make_pipeline(
        DropFeatures(self.ar_elem), StandardScaler(),
        SelectFromModel(LarsCV(fit_intercept=fit_int,
                               normalize=norm_X,
                               n_jobs=parallel,
                               cv=cv_method),
                        prefit=False),
        S_avg_arima(freq=self.frequency, n_models=arima_n_models))
    models_npar['arima4'] = make_pipeline(
        DropFeatures(self.ar_elem), StandardScaler(),
        SelectFromModel(BayesianRidge(fit_intercept=fit_int,
                                      normalize=norm_X),
                        prefit=False),
        S_avg_arima(freq=self.frequency, n_models=arima_n_models))
    models_npar['arima5'] = S_avg_arima(freq=self.frequency,
                                        use_X=False,
                                        n_models=arima_n_models)

    self.models_par = models_par
    self.models_npar = models_npar
}, {
    'name': 'LLCV',
    'mdl': LassoLarsCV(max_n_alphas=1000)
}, {
    'name': 'LLaic',
    'mdl': LassoLarsIC(criterion='aic')
}, {
    'name': 'ENCV',
    'mdl': ElasticNetCV(n_alphas=100)
}, {
    'name': 'LarsCV',
    'mdl': LarsCV(max_n_alphas=1000)
}, {
    'name': 'LR',
    'mdl': LinearRegression()
}, {
    'name': 'ARDR',
    'mdl': ARDRegression()
}, {
    'name': 'BYR',
    'mdl': BayesianRidge()
},
]
n_splits = (len(df_general) - 50)
models = {}
models[1] = make_pipeline(
    StandardScaler(),
    SelectFromModel(DecisionTreeRegressor(),
                    prefit=False)).fit(df_general, ibc)
models[2] = make_pipeline(
    StandardScaler(),
    SelectFromModel(ElasticNetCV(normalize=False,
                                 cv=TimeSeriesSplit(n_splits)),
                    prefit=False)).fit(df_general, ibc)
models[3] = make_pipeline(
    StandardScaler(),
    SelectFromModel(LarsCV(normalize=False, cv=TimeSeriesSplit(n_splits)),
                    prefit=False)).fit(df_general, ibc)
models[4] = make_pipeline(
    StandardScaler(),
    SelectFromModel(BayesianRidge(normalize=False),
                    prefit=False)).fit(df_general, ibc)
models[5] = make_pipeline(
    StandardScaler(),
    RFECV(LinearRegression(),
          cv=TimeSeriesSplit(n_splits))).fit(df_general, ibc)
models[6] = make_pipeline(StandardScaler(),
                          SelectKBest(mutual_info_regression,
                                      1)).fit(df_general, ibc)
models[7] = make_pipeline(StandardScaler(),
                          SelectKBest(mutual_info_regression,
                                      3)).fit(df_general, ibc)
# Fit
feat_selector = feat_selector.fit(X, y)

# Print support and ranking
print(feat_selector.support_)  # False means the feature can be eliminated
print(feat_selector.ranking_)  # ranking
print(X.columns)

################## Use LarsCV for hyperparameter optimization (wrapper)
# LARS starts with one variable and increases its coefficient until the residual
# is as correlated with some other variable as it is with the starting variable;
# that variable is then added to the active set, and the coefficients move in the
# joint least-squares direction of the active set, iterating. It acts as a
# feature selection method because some coefficients end up exactly zero.

# Instantiate
lars_mod = LarsCV(cv=5, normalize=False)

# Fit
feat_selector = lars_mod.fit(X, y)

# Print r-squared score and estimated alpha
print(lars_mod.score(X, y))
print(lars_mod.alpha_)

################# Using a RandomForestRegressor for feature selection (tree-based methods)
# How feature importance is calculated: build the trees, then take one feature
# variable, permute it randomly (shuffle), and rerun the observations through
# the trees. The percentage increase in the misclassification rate gives the
# feature importance.
# https://link.springer.com/article/10.1023/A:1010933404324

# Instantiate
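# A minimal self-contained sketch of the selection idea described above: after
# fitting LarsCV, features whose coefficients are exactly zero can be dropped
# (synthetic data; the variable names are illustrative, not from the original).
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LarsCV

X_demo, y_demo = make_regression(n_samples=200, n_features=20, n_informative=5,
                                 noise=1.0, random_state=0)
lars_sel = LarsCV(cv=5).fit(X_demo, y_demo)
kept = np.flatnonzero(lars_sel.coef_)
print(len(kept), "features kept:", kept)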
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
    store_csv(mpg, name)

if "Auto" in datasets:
    build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto")
    build_auto(ARDRegression(normalize = True), "BayesianARDAuto")
    build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto")
    build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = False)
    build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto")
    build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
    build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto")
    build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto")
    build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto")
    build_auto(HuberRegressor(), "HuberAuto")
    build_auto(LarsCV(), "LarsAuto")
    build_auto(LassoCV(random_state = 13), "LassoAuto")
    build_auto(LassoLarsCV(), "LassoLarsAuto")
    build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", num_iteration = 11)
    build_auto(LinearRegression(), "LinearRegressionAuto")
    build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto")
    build_auto(OrthogonalMatchingPursuitCV(), "OMPAuto")
    build_auto(RandomForestRegressor(random_state = 13, min_samples_leaf = 3), "RandomForestAuto", flat = True)
    build_auto(RidgeCV(), "RidgeAuto")
    build_auto(TheilSenRegressor(n_subsamples = 15, random_state = 13), "TheilSenAuto")
    build_auto(OptimalXGBRegressor(objective = "reg:linear", ntree_limit = 31), "XGBAuto", ntree_limit = 31)

if "Auto" in datasets:
    build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto")
    build_auto(TransformedTargetRegressor(LinearRegression(), func = numpy.log, inverse_func = numpy.exp), "TransformedLinearRegressionAuto")
    clf = BaseEstimator()
    res = explain_weights(clf, vec=vec)
    assert 'BaseEstimator' in res.error
    for expl in format_as_all(res, clf):
        assert 'Error' in expl
        assert 'BaseEstimator' in expl
    with pytest.raises(TypeError):
        explain_weights(clf, unknown_argument=True)


@pytest.mark.parametrize(['reg'], [
    [ElasticNet(random_state=42)],
    [ElasticNetCV(random_state=42)],
    [HuberRegressor()],
    [Lars()],
    [LarsCV(max_n_alphas=10)],
    [Lasso(random_state=42)],
    [LassoCV(random_state=42)],
    [LassoLars(alpha=0.01)],
    [LassoLarsCV(max_n_alphas=10)],
    [LassoLarsIC()],
    [OrthogonalMatchingPursuit(n_nonzero_coefs=10)],
    [OrthogonalMatchingPursuitCV()],
    [PassiveAggressiveRegressor(C=0.1, random_state=42)],
    [Ridge(random_state=42)],
    [RidgeCV()],
    [SGDRegressor(random_state=42)],
    [LinearRegression()],
    [LinearSVR(random_state=42)],
    [TheilSenRegressor(random_state=42)],
])
def main():
    # let's create a folder with a unique name to store results
    folderName = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + "-regression"
    if not os.path.exists(folderName):
        os.makedirs(folderName)

    # initialize logging
    common.initialize_logging(folderName)

    regressorsList = [
        # human-designed regressors
        [HumanRegressor("y = a_0 + a_1 * x + a_2 * x**2 + a_3 * x**3",
                        map_variables_to_features={"x": 0}), "HumanRegressor"],
        [PolynomialRegressor(2), "PolynomialRegressor2"],
        #[PolynomialRegressor(3), "PolynomialRegressor3"],

        # keras neural network
        #[ANNRegressor(epochs=500, batch_size=32, layers=[16,4]), "KerasRegressor8-4"],
        #[ANNRegressor(epochs=700, batch_size=32, layers=[16,8]), "KerasRegressor16-8"],

        # cross decomposition
        [PLSRegression(), "PLSRegression"],

        # ensemble
        [AdaBoostRegressor(), "AdaBoostRegressor"],
        [BaggingRegressor(), "BaggingRegressor"],
        [BaggingRegressor(n_estimators=100), "BaggingRegressor_100"],
        [BaggingRegressor(n_estimators=300), "BaggingRegressor_300"],
        [ExtraTreesRegressor(), "ExtraTreesRegressor"],
        [GradientBoostingRegressor(), "GradientBoostingRegressor"],
        [RandomForestRegressor(), "RandomForestRegressor"],
        [RandomForestRegressor(n_estimators=100), "RandomForestRegressor_100"],
        [RandomForestRegressor(n_estimators=300), "RandomForestRegressor_300"],

        # isotonic
        #[IsotonicRegression(), "IsotonicRegression"], # apparently wants "X" as a 1d array

        # kernel ridge
        [KernelRidge(), "KernelRidge"],

        # linear
        #[ARDRegression(), "ARDRegression"], # takes too much time to train
        [BayesianRidge(), "BayesianRidge"],
        [ElasticNetCV(), "ElasticNetCV"],
        [LarsCV(), "LarsCV"],
        [LassoCV(), "LassoCV"],
        [LinearRegression(), "LinearRegression"],
        [PassiveAggressiveRegressor(), "PassiveAggressiveRegressor"],

        # neighbors
        [KNeighborsRegressor(), "KNeighborsRegressor"],
        [RadiusNeighborsRegressor(), "RadiusNeighborsRegressor"],

        # neural networks
        #[BernoulliRBM(), "BernoulliRBM"], # has a different interface, no "predict"

        # svm
        [SVR(), "SVR"],
        [LinearSVR(), "LinearSVR"],
        [NuSVR(), "NuSVR"],

        # tree
        [DecisionTreeRegressor(), "DecisionTreeRegressor (max depth 10)"],
        [ExtraTreeRegressor(), "ExtraTreeRegressor"],

        # generalized additive models
        [LinearGAM(n_splines=20), "LinearGAM(n_splines=20)"],

        # gaussian processes
        [GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel()),
         "GaussianProcessRegressor"],
    ]

    X = y = X_train = X_test = y_train = y_test = variablesX = variablesY = None
    numberOfSplits = 10  # TODO change number of splits from command line

    if True:
        # this is just a dumb benchmark
        X, y, variablesX, variablesY = common.loadEasyBenchmark()

    if False:
        X, y, variablesX, variablesY = common.loadChristianQuestionnaireRegression()

    if False:
        X, y, variablesX, variablesY = common.loadYongShiDataCalibration2("TIMBER")

    if False:
        X, y, variablesX, variablesY = common.loadLaurentBouvierNewData()

    if False:
        X, y, variablesX, variablesY = common.loadYongShiDataCalibration()

    if False:
        from sklearn.datasets import load_linnerud
        X, y = load_linnerud(return_X_y=True)

    if False:
        X, y, variablesX, variablesY = common.loadYingYingData()

    if False:
        X, y, variablesX, variablesY = common.loadCleaningDataGermanSpecific()
        #X, y, variablesX, variablesY = common.loadCleaningDataGerman()

    if False:
        X, y, variablesX, variablesY = common.loadInsects()

    if False:
        X, y, variablesX, variablesY = common.loadMilkProcessPipesDimensionalAnalysis()
        #X, y, variablesX, variablesY = common.loadMilkProcessPipes()

    if False:  # ecosystem services
        X, y, variablesX, variablesY = common.loadEcosystemServices()

    if False:
        X, y, variablesX, variablesY = common.loadMarcoSoil()

    if False:
        # load dataset
        X, y = common.loadEureqaRegression()
        # randomly split between training and test
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    if False:
        # load dataset
        X_train, X_test, y_train, y_test = common.loadBiscuitExample()
        logging.info("X_train: " + str(X_train.shape))
        logging.info("X_test: " + str(X_test.shape))
        logging.info("y_train: " + str(y_train.shape))
        logging.info("y_test: " + str(y_test.shape))
        # in this particular case, I create the "global" X and y by putting together the two arrays
        X = np.append(X_train, X_test, axis=0)
        y = np.append(y_train, y_test, axis=0)

    if False:
        # load dataset
        X_train, X_test, y_train, y_test = common.loadAromoptiExample()
        logging.info("X_train: " + str(X_train.shape))
        logging.info("X_test: " + str(X_test.shape))
        logging.info("y_train: " + str(y_train.shape))
        logging.info("y_test: " + str(y_test.shape))
        # in this particular case, I create the "global" X and y by putting together the two arrays
        X = np.append(X_train, X_test, axis=0)
        y = np.append(y_train, y_test, axis=0)

    logging.info("Regressing %d output variables, in function of %d input variables..."
                 % (y.shape[1], X.shape[1]))

    # if the names of the variables are not specified, let's specify them!
    if variablesY is None:
        variablesY = ["y" + str(i) for i in range(0, len(y[0]))]
    if variablesX is None:
        variablesX = ["X" + str(i) for i in range(0, len(X[0]))]

    performances = dict()

    for variableIndex, variableY in enumerate(variablesY):
        logging.info("** Now evaluating models for variable \"%s\"... **" % variableY)

        # obtain data
        y_ = y[:, variableIndex].ravel()

        # assume here that you will have train/test indexes instead;
        # it's also easier for the plots, as we do not face the issue
        # of duplicate values (e.g. same value with two indexes)
        rs = ShuffleSplit(n_splits=numberOfSplits, random_state=42)
        #rs = LeaveOneOut()

        # initialize performance dictionary of arrays
        performances[variableY] = dict()
        for regressor, regressorName in regressorsList:
            performances[variableY][regressorName] = dict()
            performances[variableY][regressorName]["r^2"] = []
            performances[variableY][regressorName]["e.v"] = []
            performances[variableY][regressorName]["mse"] = []
            performances[variableY][regressorName]["mae"] = []
            performances[variableY][regressorName]["predicted"] = []

        # this is used to store all values of each fold, in order; maybe there's a smarter way to do it
        foldPointsInOrder = []

        # and now, for every regressor
        for foldIndex, indexes in enumerate(rs.split(X)):
            train_index, test_index = indexes
            X_train = X[train_index]
            y_train = y_[train_index]
            X_test = X[test_index]
            y_test = y_[test_index]

            # normalize
            logging.info("Normalizing data...")
            scalerX = StandardScaler()
            scalerY = StandardScaler()
            X_train = scalerX.fit_transform(X_train)
            X_test = scalerX.transform(X_test)
            # this "reshape/ravel" here is just to avoid warnings, it has no true effect on data
            y_train = scalerY.fit_transform(y_train.reshape(-1, 1)).ravel()
            y_test = scalerY.transform(y_test.reshape(-1, 1)).ravel()

            # now, we store points of the folder in order of how they appear
            foldPointsInOrder.extend(list(scalerY.inverse_transform(y_test)))

            for regressorIndex, regressorData in enumerate(regressorsList):
                regressor = regressorData[0]
                regressorName = regressorData[1]

                logging.info("Fold #%d/%d: training regressor #%d/%d \"%s\""
                             % (foldIndex + 1, numberOfSplits, regressorIndex + 1,
                                len(regressorsList), regressorName))

                try:
                    regressor.fit(X_train, y_train)
                    y_test_predicted = regressor.predict(X_test)
                    r2Test = r2_score(y_test, y_test_predicted)
                    mseTest = mean_squared_error(y_test, y_test_predicted)
                    maeTest = mean_absolute_error(y_test, y_test_predicted)
                    varianceTest = explained_variance_score(y_test, y_test_predicted)

                    logging.info("R^2 score (test): %.4f" % r2Test)
                    logging.info("EV score (test): %.4f" % varianceTest)
                    logging.info("MSE score (test): %.4f" % mseTest)
                    logging.info("MAE score (test): %.4f" % maeTest)

                    # add performance to the list of performances
                    performances[variableY][regressorName]["r^2"].append(r2Test)
                    performances[variableY][regressorName]["e.v"].append(varianceTest)
                    performances[variableY][regressorName]["mse"].append(mseTest)
                    performances[variableY][regressorName]["mae"].append(maeTest)

                    # also record the predictions, to be used later in a global figure
                    performances[variableY][regressorName]["predicted"].extend(
                        list(scalerY.inverse_transform(y_test_predicted)))

                    try:
                        import matplotlib.pyplot as plt

                        # plotting first figure, with points 'x' and 'o'
                        y_predicted = regressor.predict(scalerX.transform(X))  # 'X' was never wholly rescaled before
                        y_train_predicted = regressor.predict(X_train)

                        plt.figure()
                        plt.scatter(train_index, y_train, c="gray", label="training data")
                        plt.scatter(test_index, y_test, c="green", label="test data")
                        plt.plot(np.arange(len(y_predicted)), y_predicted, 'x', c="red", label="regression")
                        plt.xlabel("order of data samples")
                        plt.ylabel("target")
                        plt.title(regressorName + ", R^2=%.4f (test)" % r2Test)
                        plt.legend()
                        logging.info("Saving figure...")
                        plt.savefig(os.path.join(folderName, regressorName + "-" + variableY + "-fold-" + str(foldIndex + 1) + ".pdf"))
                        plt.close()

                        # plotting second figure, with everything close to a middle line
                        plt.figure()
                        plt.plot(y_train, y_train_predicted, 'r.', label="training set")  # points
                        plt.plot(y_test, y_test_predicted, 'go', label="test set")  # points
                        plt.plot([min(y_train.min(), y_test.min()),
                                  max(y_train.max(), y_test.max())],
                                 [min(y_train_predicted.min(), y_test_predicted.min()),
                                  max(y_train_predicted.max(), y_test_predicted.max())],
                                 'k--')  # line
                        plt.xlabel("measured")
                        plt.ylabel("predicted")
                        plt.title(regressorName + " measured vs predicted, " + variableY)
                        plt.legend(loc='best')
                        plt.savefig(os.path.join(folderName, regressorName + "-" + variableY + "-fold-" + str(foldIndex + 1) + "-b.pdf"))
                        plt.close()

                        # also, save ordered list of features
                        featuresByImportance = relativeFeatureImportance(regressor)

                        # if list exists, write feature importance to disk
                        # TODO horrible hack here, to avoid issues with GAM
                        if len(featuresByImportance) > 0 and "GAM" not in regressorName:
                            featureImportanceFileName = regressorName + "-" + variableY + "-featureImportance-fold" + str(foldIndex) + ".csv"
                            with open(os.path.join(folderName, featureImportanceFileName), "w") as fp:
                                fp.write("feature,importance\n")
                                for featureImportance, featureIndex in featuresByImportance:
                                    fp.write(variablesX[int(featureIndex)] + "," + str(featureImportance) + "\n")

                    except ImportError:
                        logging.info("Cannot import matplotlib. Skipping plots...")

                except Exception as e:
                    logging.info("Regressor \"" + regressorName + "\" failed on variable \"" + variableY + "\":" + str(e))

    logging.info("Final summary:")
    with open(os.path.join(folderName, "00_summary.txt"), "w") as fp:
        for variableY in variablesY:
            logging.info("For variable \"" + variableY + "\"")
            fp.write("For variable: " + variableY + " = f(" + variablesX[0])
            for i in range(1, len(variablesX)):
                fp.write("," + variablesX[i])
            fp.write(")\n")

            # create a list from the dictionary and sort it
            sortedPerformances = sorted(
                [(performances[variableY][regressorName], regressorName)
                 for regressorName in performances[variableY]],
                key=lambda x: np.mean(x[0]["r^2"]),
                reverse=True)

            for regressorData in sortedPerformances:
                regressorName = regressorData[1]
                regressorScore = regressorData[0]

                r2Mean = np.mean(regressorScore["r^2"])
                r2std = np.std(regressorScore["r^2"])
                varianceMean = np.mean(regressorScore["e.v"])
                varianceStd = np.std(regressorScore["e.v"])
                mseMean = np.mean(regressorScore["mse"])
                mseStd = np.std(regressorScore["mse"])
                maeMean = np.mean(regressorScore["mae"])
                maeStd = np.std(regressorScore["mae"])

                logging.info("\t- %s, R^2=%.4f (std=%.4f), Explained Variance=%.4f (std=%.4f), MSE=%.4f (std=%.4f), MAE=%.4f (std=%.4f)"
                             % (regressorName, r2Mean, r2std, varianceMean, varianceStd, mseMean, mseStd, maeMean, maeStd))
                fp.write("\t- %s, R^2=%.4f (std=%.4f), Explained Variance=%.4f (std=%.4f), MSE=%.4f (std=%.4f), MAE=%.4f (std=%.4f)\n"
                         % (regressorName, r2Mean, r2std, varianceMean, varianceStd, mseMean, mseStd, maeMean, maeStd))
                fp.write("\t\t- R^2:" + str(["%.4f" % x for x in regressorScore["r^2"]]) + "\n")
                fp.write("\t\t- E.V.:" + str(["%.4f" % x for x in regressorScore["e.v"]]) + "\n")
                fp.write("\t\t- MSE:" + str(["%.4f" % x for x in regressorScore["mse"]]) + "\n")
                fp.write("\t\t- MAE:" + str(["%.4f" % x for x in regressorScore["mae"]]) + "\n")

                # also, plot a "global" graph
                # issue here, if a regressor fails, you have incongruent matrixes: a check is in order
                # TODO also, the plot looks really bad if some values are negative; turn everything to absolute values?
                if len(foldPointsInOrder) == len(regressorScore["predicted"]):
                    fig = plt.figure()
                    ax = fig.add_subplot(111)
                    #bottom_left_corner = [min(foldPointsInOrder), max(foldPointsInOrder)]
                    #top_right_corner = [min(regressorScore["predicted"]), max(regressorScore["predicted"])]
                    x_bottom_top = [0, max(foldPointsInOrder)]
                    y_bottom_top = [0, max(foldPointsInOrder)]
                    ax.plot(foldPointsInOrder, regressorScore["predicted"], 'g.')  # points
                    ax.plot(x_bottom_top, y_bottom_top, 'k--', label="1:1")  # line
                    ax.plot(x_bottom_top, [y_bottom_top[0] * 1.20, y_bottom_top[1] * 1.20], 'r--', label="20% error")
                    ax.plot(x_bottom_top, [y_bottom_top[0] * 0.80, y_bottom_top[1] * 0.80], 'r--')
                    ax.set_title(regressorName + " measured vs predicted, " + variableY + " (all test)")
                    ax.set_xlabel("measured")
                    ax.set_ylabel("predicted")
                    ax.legend(loc='best')
                    plt.savefig(os.path.join(folderName, regressorName + "-" + variableY + "-global-b.png"))
                    plt.close(fig)
# - Hold out half of the data and train a LARS model

# Variable definition
# --- number of training samples
train_n = 100

# Create an instance and fit
# --- limit the number of nonzero coefficients to 12
lars_12 = Lars(n_nonzero_coefs=12)
lars_12.fit(reg_data[:train_n], reg_target[:train_n])

# Create an instance and fit
# --- limit the number of nonzero coefficients to 500 (the default)
lars_500 = Lars(n_nonzero_coefs=500)
lars_500.fit(reg_data[:train_n], reg_target[:train_n])

# Mean squared error
np.mean(
    np.power(reg_target[train_n:] - lars_500.predict(reg_data[train_n:]), 2))

# 3 LARS as feature selection ---------------------------------------------------------------------

# Create an instance
lcv = LarsCV()

# Fit
lcv.fit(reg_data, reg_target)

# Nonzero coefficients
np.sum(lcv.coef_ != 0)
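# A self-contained variant of the comparison above, with synthetic data standing
# in for the notebook's reg_data/reg_target (the shapes are assumptions):
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LarsCV

reg_data_demo, reg_target_demo = make_regression(
    n_samples=200, n_features=500, n_informative=10, noise=2.0, random_state=0)
lcv_demo = LarsCV(cv=5).fit(reg_data_demo, reg_target_demo)
print(np.sum(lcv_demo.coef_ != 0))  # number of nonzero coefficients chosen by CV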
X_learning = df_all_data[:train_index]
X_test = df_all_data[train_index:]

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LarsCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.linear_model import LassoLars, LassoLarsCV, Ridge, RidgeCV
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
import xgboost as xgb

models = []
models.append(("LrE", LinearRegression()))
models.append(("RidCV", RidgeCV()))
models.append(("LarCV", LarsCV()))
models.append(("LasCV", LassoCV()))
models.append(("ElNCV", ElasticNetCV()))
models.append(("LaLaCV", LassoLarsCV()))
models.append(("XGB", xgb.XGBRegressor()))

kfold = KFold(n_splits=10)


def getCVResult(models, X_learning, Y_learning):
    for name, model in models:
        cv_results = cross_val_score(model, X_learning, Y_learning,
                                     scoring='neg_mean_squared_error',
def GetAllModelsForComparison(X_train, Y_train):
    # Note: the original dict literal repeated several keys (e.g. 'BaseEstimator',
    # 'ClassifierMixin', 'SGDClassifier', 'Parallel'); duplicates silently
    # overwrite earlier entries, so each key is listed once here.
    models = {
        'ARDRegression': ARDRegression(),
        'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(),
        'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),
        #'Huber': Huber(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(),
        'LarsCV': LarsCV(),
        'Lasso': Lasso(),
        'LassoCV': LassoCV(),
        'LassoLars': LassoLars(),
        'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(),
        'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(),
        'RANSACRegressor': RANSACRegressor(),
        #'RandomizedLasso': RandomizedLasso(),
        #'RandomizedLogisticRegression': RandomizedLogisticRegression(),
        'Ridge': Ridge(),
        'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(),
        'TheilSenRegressor': TheilSenRegressor(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'KernelRidge': KernelRidge(),
        'RegressorMixin': RegressorMixin(),
        'LinearSVC': LinearSVC(),
        'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(),
        'NuSVR': NuSVR(),
        'OneClassSVM': OneClassSVM(),
        'SVC': SVC(),
        'SVR': SVR(),
        #'BallTree': BallTree(),
        #'DistanceMetric': DistanceMetric(),
        #'KDTree': KDTree(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),
        #'LSHForest': LSHForest(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        #'GaussianProcess': GaussianProcess(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        'CCA': CCA(),
        'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(),
        'PLSSVD': PLSSVD(),
        #'ABCMeta': ABCMeta(),
        #'BaseDiscreteNB': BaseDiscreteNB(),
        #'BaseNB': BaseNB(),
        'BernoulliNB': BernoulliNB(),
        'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(),
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        #'BaseEnsemble': BaseEnsemble(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        #'VotingClassifier': VotingClassifier(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        #'OneVsOneClassifier': OneVsOneClassifier(),
        #'OneVsRestClassifier': OneVsRestClassifier(),
        #'OutputCodeClassifier': OutputCodeClassifier(),
        'Parallel': Parallel(),
        #'ClassifierChain': ClassifierChain(),
        #'MultiOutputClassifier': MultiOutputClassifier(),
        #'MultiOutputEstimator': MultiOutputEstimator(),
        #'MultiOutputRegressor': MultiOutputRegressor(),
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        'IsotonicRegression': IsotonicRegression(),
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(),
        'MLPRegressor': MLPRegressor()
    }
    return models
uu = uu[:, :100]
X = np.transpose((t_grid.flatten(), x_grid.flatten()))
y = uu.reshape((uu.size, 1))

noise_level = 0.0
y_noisy = y + noise_level * np.std(y) * np.random.randn(y[:, 0].size, 1)

number_of_samples = 20000
idx = np.random.permutation(y.shape[0])
X_train = torch.tensor(X[idx, :][:number_of_samples],
                       dtype=torch.float32,
                       requires_grad=True)
y_train = torch.tensor(y_noisy[idx, :][:number_of_samples],
                       dtype=torch.float32)

estimator = LarsCV(fit_intercept=False)
config = {
    'n_in': 2,
    'hidden_dims': [20, 20, 20, 20, 20, 20, 20],
    'n_out': 1,
    'library_function': library_1D_in,
    'library_args': {
        'poly_order': 1,
        'diff_order': 4
    },
    'sparsity_estimator': estimator
}

model = DeepModDynamic(**config)

optimizer = torch.optim.Adam(model.network_parameters(),
                             betas=(0.99, 0.99),
if "Auto" in datasets: build_auto(AdaBoostRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), random_state = 13, n_estimators = 17), "AdaBoostAuto") build_auto(ARDRegression(normalize = True), "BayesianARDAuto") build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto") build_auto(DecisionTreeRegressor(min_samples_leaf = 2, random_state = 13), "DecisionTreeAuto", compact = False) build_auto(BaggingRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAuto") build_auto(DummyRegressor(strategy = "median"), "DummyAuto") build_auto(ElasticNetCV(cv = 3, random_state = 13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(n_estimators = 10, min_samples_leaf = 5, random_state = 13), "ExtraTreesAuto") build_auto(GBDTLMRegressor(RandomForestRegressor(n_estimators = 7, max_depth = 6, random_state = 13), LinearRegression()), "GBDTLMAuto") build_auto(GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 6, random_state = 13), ElasticNet(random_state = 13)), "XGBRFLMAuto") build_auto(GradientBoostingRegressor(init = None, random_state = 13), "GradientBoostingAuto") build_auto(HistGradientBoostingRegressor(max_iter = 31, random_state = 13), "HistGradientBoostingAuto") build_auto(HuberRegressor(), "HuberAuto") build_auto(LarsCV(cv = 3), "LarsAuto") build_auto(LassoCV(cv = 3, random_state = 13), "LassoAuto") build_auto(LassoLarsCV(cv = 3), "LassoLarsAuto") build_auto(LinearRegression(), "LinearRegressionAuto") build_auto(BaggingRegressor(LinearRegression(), max_features = 0.75, random_state = 13), "LinearRegressionEnsembleAuto") build_auto(OrthogonalMatchingPursuitCV(cv = 3), "OMPAuto") build_auto(RandomForestRegressor(n_estimators = 10, min_samples_leaf = 3, random_state = 13), "RandomForestAuto", flat = True) build_auto(RidgeCV(), "RidgeAuto") build_auto(StackingRegressor([("ridge", Ridge(random_state = 13)), ("lasso", Lasso(random_state = 13))], final_estimator = GradientBoostingRegressor(n_estimators = 7, random_state = 13)), "StackingEnsembleAuto") build_auto(TheilSenRegressor(n_subsamples = 31, random_state = 13), "TheilSenAuto") build_auto(VotingRegressor([("dt", DecisionTreeRegressor(random_state = 13)), ("knn", KNeighborsRegressor()), ("lr", LinearRegression())], weights = [3, 1, 2]), "VotingEnsembleAuto") build_auto(XGBRFRegressor(n_estimators = 31, max_depth = 6, random_state = 13), "XGBRFAuto") if "Auto" in datasets: build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto") build_auto(TransformedTargetRegressor(LinearRegression(), func = numpy.log, inverse_func = numpy.exp), "TransformedLinearRegressionAuto")
print(regr.alpha_)
print(regr.intercept_)

plt.scatter(X[:, 0], y, color='black')
plt.scatter(X[:, 0], pred, color='red')
plt.show()

#%% Least Angle Regression (LARS):
# Lars: fit_intercept, verbose, normalize
# LarsCV: fit_intercept, verbose, normalize, cv
from sklearn.linear_model import LarsCV, Lars
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt

X, y = make_regression(n_samples=200, noise=4.0, random_state=0)
reg = LarsCV(cv=5).fit(X, y)
reg.score(X, y)
reg.alpha_

pred = reg.predict(X[:, ])
plt.scatter(X[:, 0], y, color='black')
plt.scatter(X[:, 0], pred, color='red')
plt.show()

reg2 = Lars().fit(X, y)
reg2.score(X, y)
reg2.alphas_  # was reg2.alpha_: plain Lars exposes the alphas_ path, not a CV-selected alpha_
pred = reg2.predict(X[:, ])

#%% LassoLars: alpha, fit_intercept, normalize
# LassoLarsCV: alpha, fit_intercept, normalize, cv