def load_h5(file_name="pyfeat_aus_to_landmarks.h5"):
    """Load the h5 PLS model for plotting.

    Args:
        file_name (str, optional): Specify model to load. Defaults to
            "pyfeat_aus_to_landmarks.h5".

    Returns:
        model: PLS model, or None if the file could not be loaded.
    """
    # Defined up front so the final `return model` can never raise
    # NameError when the try body fails before assignment.
    model = None
    try:
        # Context manager guarantees the h5 handle is closed even if an
        # exception fires mid-read (the original leaked it on error).
        with h5py.File(os.path.join(get_resource_path(), file_name), "r") as hf:
            d1 = hf.get("coef")
            d2 = hf.get("x_mean")
            d3 = hf.get("y_mean")
            d4 = hf.get("x_std")
            model = PLSRegression(len(d1))
            model.coef_ = np.array(d1)
            # scikit-learn 0.24 renamed the PLS mean/std attributes to
            # private names.  Compare (major, minor) tuples so sklearn 1.x
            # is handled correctly (comparing only the minor number would
            # send 1.0/1.2 down the legacy branch).
            major, minor = (int(v) for v in __version__.split(".")[:2])
            if (major, minor) < (0, 24):
                model.x_mean_ = np.array(d2)
                model.y_mean_ = np.array(d3)
                model.x_std_ = np.array(d4)
            else:
                model._x_mean = np.array(d2)
                model._y_mean = np.array(d3)
                model._x_std = np.array(d4)
    except Exception as e:
        print("Unable to load data ", file_name, ":", e)
    return model
def _load_model(cls, fh):
    """Rebuild a PLSRegression from an open binary stream.

    The stream layout is: a literal dict of estimator params, a literal
    coefficient shape, then raw float64 buffers for x_mean, y_mean and
    the coefficient matrix.

    Args:
        fh: Open binary file-like object positioned at the model header.

    Returns:
        PLSRegression: Estimator with means/stds/coefficients restored.
    """
    params = _parse_literal(fh)
    coef_shape = _parse_literal(fh)
    pls = PLSRegression().set_params(**params)
    # np.fromstring is deprecated for binary input; np.frombuffer is the
    # supported replacement.  .copy() keeps the arrays writable, matching
    # the old fromstring behaviour (a frombuffer view over bytes is
    # read-only).  Default dtype float64 (8 bytes) matches the * 8 sizing.
    pls.x_mean_ = np.frombuffer(fh.read(coef_shape[0] * 8)).copy()
    pls.y_mean_ = np.frombuffer(fh.read(coef_shape[1] * 8)).copy()
    # Stds were not serialized; unit scaling reproduces the stored model.
    pls.x_std_ = np.ones(coef_shape[0])
    pls.y_std_ = np.ones(coef_shape[1])
    n = coef_shape[0] * coef_shape[1] * 8
    pls.coef_ = np.frombuffer(fh.read(n)).copy().reshape(coef_shape)
    return pls
def test_same_as_matlab():
    """ test that the sMC score is equal to those provided from matlab """
    # NOTE(review): load_boston was removed in scikit-learn 1.2 -- this
    # test only runs against older pinned sklearn versions; confirm.
    data = sklearn.datasets.load_boston()
    X = data['data']
    y = data['target']
    pls = PLSRegression()
    pls.fit(X,y)
    # Reference values exported from the MATLAB implementation; assumes
    # the ./validering fixture files exist relative to the CWD.
    smc_mat = loadmat('./validering/values_smc_1_centered.mat')['values']
    coef = loadmat('./validering/beta_1_centered')['BETA']
    pls.coef_ = coef[1:] # leave the interception out
    smc = sMC()
    smc.fit(pls,X)
    # Agreement is checked to 10 decimal places, one entry per feature.
    corrects = np.sum(np.round(smc.importances,10) == np.round(smc_mat,10))
    assert (corrects==np.shape(X)[1])
def ajustar_pls_letalidad(municipios_df, caracteristicas, min_casos=20, min_defunciones=0):
    """Fit a one-component PLS regression of COVID lethality rate.

    Args:
        municipios_df: DataFrame with per-municipality features plus
            'conteo', 'defunciones' and 'tasa_covid_letal' columns.
        caracteristicas: list of feature column names used as predictors.
        min_casos (int, optional): minimum case count for inclusion.
        min_defunciones (int, optional): minimum death count for inclusion.

    Returns:
        PLSRegression: fitted model; ``coef_`` flattened to 1-D.
    """
    data_train = municipios_df.loc[
        municipios_df[caracteristicas].notna().all(axis=1)]
    # Run the row filter once instead of evaluating the same query
    # separately for X and Y (the original duplicated the call).
    filtrado = data_train.query(
        f'(conteo > {min_casos}) & (defunciones > {min_defunciones})')
    X = filtrado[caracteristicas]
    Y = filtrado['tasa_covid_letal']
    pls2 = PLSRegression(n_components=1)
    pls2.fit(X, Y)
    # Flatten (n_features, 1) -> (n_features,) for downstream plotting.
    pls2.coef_ = pls2.coef_.flatten()
    return pls2
def load_h5(file_name='blue.h5'):
    """Load the h5 PLS model for plotting.

    Args:
        file_name (str, optional): Specify model to load. Defaults to 'blue.h5'.

    Returns:
        model: PLS model, or None if the file could not be loaded.
    """
    # Defined up front so the final `return model` can never raise
    # NameError when loading fails before assignment.
    model = None
    try:
        # Context manager closes the h5 handle even if a read fails
        # (the original leaked the handle on exception).
        with h5py.File(os.path.join(get_resource_path(), file_name), 'r') as hf:
            d1 = hf.get('coef')
            d2 = hf.get('x_mean')
            d3 = hf.get('y_mean')
            d4 = hf.get('x_std')
            model = PLSRegression(len(d1))
            model.coef_ = np.array(d1)
            model.x_mean_ = np.array(d2)
            model.y_mean_ = np.array(d3)
            model.x_std_ = np.array(d4)
    except Exception as e:
        print('Unable to load data ', file_name, ':', e)
    return model
def ajustar_pls_columna(municipios_df, caracteristicas, columna, min_casos=20, min_defunciones=0):
    """Fit a one-component PLS regression of an arbitrary target column.

    Args:
        municipios_df: DataFrame with per-municipality features plus
            'conteo' and 'defunciones' columns.
        caracteristicas: list of feature column names used as predictors.
        columna: name of the target column to regress on.
        min_casos (int, optional): minimum case count for inclusion.
        min_defunciones (int, optional): minimum death count for inclusion.

    Returns:
        PLSRegression: fitted model with ``coef_`` flattened to 1-D, or
        None when ``columna`` does not exist.
    """
    data_train = municipios_df.loc[
        municipios_df[caracteristicas].notna().all(axis=1)]
    # Evaluate the row filter once for both X and Y (the original ran
    # the identical query twice).
    filtrado = data_train.query(
        f'(conteo > {min_casos}) & (defunciones > {min_defunciones})')
    X = filtrado[caracteristicas]
    try:
        Y = filtrado[columna]
    except KeyError:
        # The original printed and fell through with Y undefined, which
        # raised NameError at fit(); bail out explicitly instead.
        print(f"No existe la columna {columna}")
        return None
    pls2 = PLSRegression(n_components=1)
    pls2.fit(X, Y)
    # Flatten (n_features, 1) -> (n_features,) for downstream plotting.
    pls2.coef_ = pls2.coef_.flatten()
    return pls2
def run_regression_simple_data(data_tr, data_ts, regression_model, NORM_X=True, NORM_Y=True):
    """
    Run regression model(s) on single replica of data

    :param data_tr: df, input training dataset, [x,y] labels last column
    :param data_ts: df, input test dataset, [x,y] labels last column
    :param regression_model: string, regression method ('ridgereg',
        'lasso', 'pls', 'rf', 'rbfgpr', 'rbfgprard'). Options are hard
        coded here, but can be extracted in a dict in the future
    :param NORM_X: bool, whether to normalize input data
    :param NORM_Y: bool, whether to normalize output data
    :returns: dict with weights and train/test R2 and RMSE scores, or []
        when the requested model is not recognised
    """
    tr_ = data_tr.copy()
    ts_ = data_ts.copy()

    # --- optional standardisation of features and labels -----------------
    if NORM_X:
        scalerX = sk.preprocessing.StandardScaler().fit(tr_.iloc[:, :-1])
        trn = pd.DataFrame(scalerX.transform(tr_.iloc[:, :-1]),
                           columns=tr_.iloc[:, :-1].columns, index=tr_.index)
        tst = pd.DataFrame(scalerX.transform(ts_.iloc[:, :-1]),
                           columns=ts_.iloc[:, :-1].columns, index=ts_.index)
    else:
        trn = tr_.iloc[:, :-1]
        tst = ts_.iloc[:, :-1]
    if NORM_Y:
        scalerY = sk.preprocessing.StandardScaler().fit(
            tr_.iloc[:, -1].values.reshape(-1, 1))
        y_trn = scalerY.transform(tr_.iloc[:, -1].values.reshape(-1, 1))
        y_tst = scalerY.transform(ts_.iloc[:, -1].values.reshape(-1, 1))
    else:
        y_trn = tr_.iloc[:, -1]
        y_tst = ts_.iloc[:, -1]
    trn = trn.assign(labels=y_trn)
    tst = tst.assign(labels=y_tst)

    # --- model fitting ---------------------------------------------------
    # Initialised here so the return dict never hits NameError: the
    # original never assigned weights in the 'rbfgpr' branch.
    weights = None
    if regression_model.lower() == 'ridgereg':
        # normalize= was removed in sklearn 1.2; False was the old default,
        # so dropping the argument is behaviour-identical.
        regModel = sk.linear_model.Ridge(
            alpha=0.1, fit_intercept=not NORM_Y
        ).fit(trn.iloc[:, :-1], trn.iloc[:, -1])
        weights = regModel.coef_
    elif regression_model.lower() == 'lasso':
        regModel = sk.linear_model.Lasso(
            alpha=0.1, fit_intercept=not NORM_Y
        ).fit(trn.iloc[:, :-1], trn.iloc[:, -1])
        weights = regModel.coef_
    elif regression_model.lower() == 'pls':
        n = 3
        regModel = PLSRegression(n_components=n, scale=False).fit(
            trn.iloc[:, :-1], trn.iloc[:, -1])
        # Squeeze (n_features, 1) coefficients down to a 1-D weight vector.
        regModel.coef_ = np.squeeze(np.transpose(regModel.coef_))
        weights = regModel.coef_
    elif regression_model.lower() == 'rf':
        import sklearn.ensemble
        # NOTE(review): criterion='mse' was renamed 'squared_error' in
        # sklearn 1.0 and removed in 1.2 -- confirm the pinned version.
        regModel = sklearn.ensemble.RandomForestRegressor(
            n_estimators=100, criterion='mse', max_features=0.5,
            max_depth=20, min_samples_split=2, min_samples_leaf=1
        ).fit(trn.iloc[:, :-1], trn.iloc[:, -1])
        weights = regModel.feature_importances_
    elif regression_model.lower() == 'rbfgpr':
        # Composite kernel: RBF + white noise + constant + dot-product.
        # No per-feature weights are produced for this model.
        kernel = 1.0 * kern.RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + \
            1.0 * kern.WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-1, 1e+4)) + \
            1.0 * kern.ConstantKernel(constant_value=1.0, constant_value_bounds=(1e-05, 100000.0)) + \
            1.0 * kern.DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-05, 100000.0))
        regModel = GaussianProcessRegressor(
            kernel=kernel, optimizer='fmin_l_bfgs_b', alpha=0,
            n_restarts_optimizer=5
        ).fit(trn.iloc[:, :-1], trn.iloc[:, -1])
    elif regression_model.lower() == 'rbfgprard':
        inds_trn = trn.index
        x = trn.iloc[:, :-1].values
        y = trn.iloc[:, -1].values.reshape(-1, 1)
        k = (GPy.kern.RBF(x.shape[1], ARD=True) +
             GPy.kern.White(x.shape[1], 0.01) +
             GPy.kern.Linear(x.shape[1], variances=0.01, ARD=False))
        regModel = GPy.models.GPRegression(x, y, kernel=k)
        regModel.optimize('bfgs', max_iters=200)
        # Inverse ARD lengthscales as a relevance proxy.
        weights = 50 / regModel.sum.rbf.lengthscale
    else:
        print('method not implemented yet. Or check the spelling')
        return []

    # --- evaluation on train and test sets -------------------------------
    if regression_model.lower() == 'rbfgprard':
        # GPy predict returns (mean, variance); keep the mean only.
        inds_tst = tst.index
        x_ = tst.iloc[:, :-1].values
        y_ts_h = regModel.predict(x_)[0].reshape(-1,)
        y_ts_h = pd.Series(y_ts_h, index=inds_tst)
        y_tr_h = regModel.predict(x)[0].reshape(-1,)
        y_tr_h = pd.Series(y_tr_h, index=inds_trn)
    else:
        y_ts_h = regModel.predict(tst.iloc[:, :-1])
        y_tr_h = regModel.predict(trn.iloc[:, :-1])
    if NORM_Y:
        # Report scores in the original (unscaled) label units.
        y_tr_h = scalerY.inverse_transform(y_tr_h)
        y_ts_h = scalerY.inverse_transform(y_ts_h)
        y_tr_gt = scalerY.inverse_transform(trn.iloc[:, -1])
        y_ts_gt = scalerY.inverse_transform(tst.iloc[:, -1])
    else:
        y_tr_gt = trn.iloc[:, -1]
        y_ts_gt = tst.iloc[:, -1]
    tr_r2 = r2_score(y_tr_gt, y_tr_h)
    tr_mse = np.sqrt(mean_squared_error(y_tr_gt, y_tr_h))
    ts_r2 = r2_score(y_ts_gt, y_ts_h)
    ts_mse = np.sqrt(mean_squared_error(y_ts_gt, y_ts_h))
    # (The original carried a dead `if 0:` debug block that referenced
    # undefined names; removed.)
    return {'weights': weights, 'tr_r2': tr_r2, 'ts_r2': ts_r2,
            'tr_mse': tr_mse, 'ts_mse': ts_mse}