Пример #1
0
def load_h5(file_name="pyfeat_aus_to_landmarks.h5"):
    """Load an h5-serialized PLS model for plotting.

    Args:
        file_name (str, optional): Model file to load from the resource
            directory. Defaults to "pyfeat_aus_to_landmarks.h5".

    Returns:
        model: the reconstructed PLSRegression model, or None if loading
            failed (previously this raised NameError on failure).
    """
    model = None  # guarantee a defined return value even if loading fails
    try:
        # Context manager guarantees the file handle is closed on any error;
        # datasets must be materialized with np.array() before the file closes.
        with h5py.File(os.path.join(get_resource_path(), file_name), "r") as hf:
            coef = hf.get("coef")
            x_mean = hf.get("x_mean")
            y_mean = hf.get("y_mean")
            x_std = hf.get("x_std")
            model = PLSRegression(len(coef))
            model.coef_ = np.array(coef)
            # sklearn 0.24 renamed x_mean_/y_mean_/x_std_ to private _x_mean/
            # _y_mean/_x_std. Compare (major, minor) so sklearn 1.x (minor < 24
            # but major >= 1) correctly takes the modern branch.
            major, minor = (int(v) for v in __version__.split(".")[:2])
            if (major, minor) < (0, 24):
                model.x_mean_ = np.array(x_mean)
                model.y_mean_ = np.array(y_mean)
                model.x_std_ = np.array(x_std)
            else:
                model._x_mean = np.array(x_mean)
                model._y_mean = np.array(y_mean)
                model._x_std = np.array(x_std)
    except Exception as e:
        print("Unable to load data ", file_name, ":", e)
    return model
Пример #2
0
 def _load_model(cls, fh):
     """Deserialize a PLSRegression model from an open binary file handle.

     Record layout: a literal params dict, a literal coef shape
     (n_features, n_targets), then float64 x-means, y-means and
     coefficients (8 bytes per value).

     :param fh: binary file-like object positioned at the model record
     :returns: reconstructed PLSRegression instance
     """
     params = _parse_literal(fh)
     coef_shape = _parse_literal(fh)
     pls = PLSRegression().set_params(**params)
     # np.fromstring is deprecated for binary input; np.frombuffer is the
     # supported replacement. .copy() keeps the arrays writable, matching
     # fromstring's copying behavior (frombuffer alone returns a read-only
     # view of the bytes object).
     pls.x_mean_ = np.frombuffer(fh.read(coef_shape[0] * 8)).copy()
     pls.y_mean_ = np.frombuffer(fh.read(coef_shape[1] * 8)).copy()
     pls.x_std_ = np.ones(coef_shape[0])
     pls.y_std_ = np.ones(coef_shape[1])
     n_bytes = coef_shape[0] * coef_shape[1] * 8
     pls.coef_ = np.frombuffer(fh.read(n_bytes)).copy().reshape(coef_shape)
     return pls
Пример #3
0
 def test_same_as_matlab():
     """
     test that the sMC score is equal to those provided from matlab
     """
     # NOTE(review): sklearn.datasets.load_boston was deprecated in sklearn
     # 1.0 and removed in 1.2 -- this test only runs on older versions.
     data = sklearn.datasets.load_boston()
     X = data['data']
     y = data['target']
     pls = PLSRegression()
     pls.fit(X,y)
     # Reference values exported from the MATLAB implementation; the .mat
     # fixtures must exist under ./validering relative to the working dir.
     smc_mat = loadmat('./validering/values_smc_1_centered.mat')['values']
     coef = loadmat('./validering/beta_1_centered')['BETA']
     pls.coef_ = coef[1:] # leave the intercept out
     smc = sMC()
     smc.fit(pls,X)
     # Compare rounded to 10 decimals to tolerate floating-point noise;
     # every feature importance must match its MATLAB counterpart.
     corrects = np.sum(np.round(smc.importances,10) == np.round(smc_mat,10))
     assert (corrects==np.shape(X)[1])
def ajustar_pls_letalidad(municipios_df,
                          caracteristicas,
                          min_casos=20,
                          min_defunciones=0):
    """Fit a one-component PLS regression of the COVID lethality rate.

    Args:
        municipios_df: DataFrame with per-municipality data; must contain the
            feature columns plus 'conteo', 'defunciones' and
            'tasa_covid_letal'.
        caracteristicas: list of feature column names used as predictors.
        min_casos: keep only rows with conteo > min_casos. Defaults to 20.
        min_defunciones: keep only rows with defunciones > min_defunciones.
            Defaults to 0.

    Returns:
        Fitted PLSRegression with coef_ flattened to a 1-D vector.
    """
    # Drop rows with a missing value in any feature column.
    data_train = municipios_df.loc[municipios_df[caracteristicas].notna().all(
        axis=1)]

    # Apply the case/death thresholds once instead of running the same
    # query twice for X and Y.
    filtrado = data_train.query(
        f'(conteo > {min_casos}) & (defunciones > {min_defunciones})'
    )
    X = filtrado[caracteristicas]
    Y = filtrado['tasa_covid_letal']

    pls2 = PLSRegression(n_components=1)
    pls2.fit(X, Y)
    # Flatten the (n_features, 1) coefficient matrix for callers that
    # expect a 1-D weight vector.
    pls2.coef_ = pls2.coef_.flatten()

    return pls2
Пример #5
0
def load_h5(file_name='blue.h5'):
    """Load the h5 PLS model for plotting.

    Args:
        file_name (str, optional): Specify model to load. Defaults to 'blue.h5'.

    Returns:
        model: the reconstructed PLSRegression model, or None if loading
            failed (previously this raised NameError on failure).
    """
    model = None  # guarantee a defined return value even if loading fails
    try:
        # Context manager guarantees the file handle is closed on any error;
        # datasets must be materialized with np.array() before the file closes.
        with h5py.File(os.path.join(get_resource_path(), file_name), 'r') as hf:
            coef = hf.get('coef')
            model = PLSRegression(len(coef))
            model.coef_ = np.array(coef)
            # Public attribute names match the sklearn < 0.24 API.
            model.x_mean_ = np.array(hf.get('x_mean'))
            model.y_mean_ = np.array(hf.get('y_mean'))
            model.x_std_ = np.array(hf.get('x_std'))
    except Exception as e:
        print('Unable to load data ', file_name, ':', e)
    return model
def ajustar_pls_columna(municipios_df,
                        caracteristicas,
                        columna,
                        min_casos=20,
                        min_defunciones=0):
    """Fit a one-component PLS regression of an arbitrary target column.

    Args:
        municipios_df: DataFrame with per-municipality data; must contain the
            feature columns plus 'conteo', 'defunciones' and `columna`.
        caracteristicas: list of feature column names used as predictors.
        columna: name of the target column to regress on.
        min_casos: keep only rows with conteo > min_casos. Defaults to 20.
        min_defunciones: keep only rows with defunciones > min_defunciones.
            Defaults to 0.

    Raises:
        KeyError: if `columna` is not a column of `municipios_df`.

    Returns:
        Fitted PLSRegression with coef_ flattened to a 1-D vector.
    """
    # Drop rows with a missing value in any feature column.
    data_train = municipios_df.loc[municipios_df[caracteristicas].notna().all(
        axis=1)]

    # Apply the case/death thresholds once instead of running the same
    # query twice for X and Y.
    filtrado = data_train.query(
        f'(conteo > {min_casos}) & (defunciones > {min_defunciones})'
    )
    X = filtrado[caracteristicas]
    try:
        Y = filtrado[columna]
    except KeyError:
        print(f"No existe la columna {columna}")
        # Re-raise: the original fell through and crashed with an unrelated
        # NameError ('Y' unbound) at pls2.fit; the KeyError is the real cause.
        raise

    pls2 = PLSRegression(n_components=1)
    pls2.fit(X, Y)
    # Flatten the (n_features, 1) coefficient matrix for callers that
    # expect a 1-D weight vector.
    pls2.coef_ = pls2.coef_.flatten()

    return pls2
def run_regression_simple_data(data_tr, data_ts, regression_model, NORM_X=True, NORM_Y=True):
    """
        Run regression model(s) on single replica of data

        :param data_tr: df, input training dataset, [x,y] labels last column
        :param data_ts: df, input test dataset, [x,y] labels last column
        :param regression_model: string, regression method name ('ridgereg',
            'lasso', 'pls', 'rf', 'rbfgpr', 'rbfgprard'); case-insensitive
        :param NORM_X: bool, whether to standardize input features (scaler fit
            on training data, applied to both sets)
        :param NORM_Y: bool, whether to standardize the target column
        :returns: dict with 'weights' (model coefficients / importances, or
            None for 'rbfgpr'), plus 'tr_r2', 'ts_r2', 'tr_mse', 'ts_mse'
            (the *mse entries are RMSE values). Returns [] for an unknown
            method name.

    """

    # Work on copies so the callers' DataFrames are never mutated.
    tr_ = data_tr.copy()
    ts_ = data_ts.copy()

    if NORM_X:
        # Fit the scaler on training features only; apply to both sets.
        scalerX = sk.preprocessing.StandardScaler().fit(tr_.iloc[:,:-1])
        trn = pd.DataFrame(scalerX.transform(tr_.iloc[:,:-1]), columns=tr_.iloc[:,:-1].columns,
                            index=tr_.index)
        tst = pd.DataFrame(scalerX.transform(ts_.iloc[:,:-1]), columns=ts_.iloc[:,:-1].columns, index=ts_.index)
    else:
        trn = tr_.iloc[:,:-1]
        tst = ts_.iloc[:,:-1]

    if NORM_Y:
        # Target is reshaped to a column vector for the scaler.
        scalerY = sk.preprocessing.StandardScaler().fit(tr_.iloc[:,-1].values.reshape(-1, 1))
        y_trn = scalerY.transform(tr_.iloc[:,-1].values.reshape(-1, 1))
        y_tst = scalerY.transform(ts_.iloc[:,-1].values.reshape(-1, 1))
    else:
        y_trn = tr_.iloc[:,-1]
        y_tst = ts_.iloc[:,-1]

    # Re-attach the (possibly scaled) target as the last column, 'labels'.
    trn = trn.assign(labels=y_trn)
    tst = tst.assign(labels=y_tst)

    # 'rbfgpr' fits a plain GPR and produces no per-feature weights; start
    # from None so the return statement cannot hit an unbound name.
    weights = None

    if regression_model.lower() == 'ridgereg':
        # NOTE(review): the `normalize` estimator parameter was removed in
        # sklearn 1.2 -- this call requires an older sklearn.
        regModel = sk.linear_model.Ridge(alpha=0.1, fit_intercept=not NORM_Y,
                   normalize=False).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
        weights = regModel.coef_

    elif regression_model.lower() == 'lasso':
        regModel = sk.linear_model.Lasso(alpha=0.1, fit_intercept=not NORM_Y,
                    normalize=False).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
        weights = regModel.coef_

    elif regression_model.lower() == 'pls':
        n = 3
        regModel = PLSRegression(n_components=n, scale=False).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
        # Squeeze the (n_features, 1) coefficient matrix into a 1-D vector.
        regModel.coef_ = np.squeeze(np.transpose(regModel.coef_))
        weights = regModel.coef_

    elif regression_model.lower() == 'rf':
        import sklearn.ensemble
        regModel = sklearn.ensemble.RandomForestRegressor(n_estimators=100, criterion='mse',
                max_features = 0.5, max_depth=20, min_samples_split=2,
                min_samples_leaf=1).fit(trn.iloc[:,:-1], trn.iloc[:,-1])
        # Random forests expose importances rather than coefficients.
        weights = regModel.feature_importances_

    elif regression_model.lower() == 'rbfgpr':
        # Composite kernel: RBF + white noise + constant + dot product.
        kernel = 1.0 * kern.RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + \
        1.0 * kern.WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-1, 1e+4)) + \
        1.0 * kern.ConstantKernel(constant_value=1.0, constant_value_bounds=(1e-05, 100000.0)) + \
        1.0 * kern.DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-05, 100000.0))

        regModel = GaussianProcessRegressor(kernel=kernel, optimizer='fmin_l_bfgs_b',
                        alpha=0, n_restarts_optimizer=5).fit(trn.iloc[:,:-1], trn.iloc[:,-1])

    elif regression_model.lower() == 'rbfgprard':

        inds_trn = trn.index
        x = trn.iloc[:,:-1].values
        y = trn.iloc[:,-1].values.reshape(-1,1)

        # ARD RBF learns one lengthscale per feature; shorter lengthscale
        # means the feature matters more.
        k = (GPy.kern.RBF(x.shape[1], ARD=True)
             + GPy.kern.White(x.shape[1], 0.01)
             + GPy.kern.Linear(x.shape[1], variances=0.01, ARD=False))

        regModel = GPy.models.GPRegression(x,y,kernel=k)
        regModel.optimize('bfgs', max_iters=200)
        # 'sum' is presumably the name GPy assigns the composite kernel
        # parameter -- TODO confirm (regModel.kern.rbf may be the safer path).
        weights = 50/regModel.sum.rbf.lengthscale

    else:
        print('method not implemented yet. Or check the spelling')
        return []

    # TEST STEPS
    if regression_model.lower() == 'rbfgprard':
        # GPy predict returns (mean, variance); keep the mean only.
        inds_tst = tst.index
        x_ = tst.iloc[:,:-1].values
        y_ts_h = regModel.predict(x_)[0].reshape(-1,)
        y_ts_h = pd.Series(y_ts_h,index=inds_tst)
        y_tr_h = regModel.predict(x)[0].reshape(-1,)
        y_tr_h = pd.Series(y_tr_h,index=inds_trn)
    else:
        y_ts_h = regModel.predict(tst.iloc[:,:-1])
        y_tr_h = regModel.predict(trn.iloc[:,:-1])

    if NORM_Y:
        # Undo target scaling so scores are reported in original units.
        # NOTE(review): newer sklearn scalers expect 2-D input to
        # inverse_transform; these 1-D Series calls assume an older version.
        y_tr_h = scalerY.inverse_transform(y_tr_h)
        y_ts_h = scalerY.inverse_transform(y_ts_h)
        y_tr_gt = scalerY.inverse_transform(trn.iloc[:,-1])
        y_ts_gt = scalerY.inverse_transform(tst.iloc[:,-1])
    else:
        y_tr_gt = trn.iloc[:,-1]
        y_ts_gt = tst.iloc[:,-1]

    tr_r2 = r2_score(y_tr_gt, y_tr_h)
    tr_mse = np.sqrt(mean_squared_error(y_tr_gt, y_tr_h))  # RMSE
    ts_r2 = r2_score(y_ts_gt, y_ts_h)
    ts_mse = np.sqrt(mean_squared_error(y_ts_gt, y_ts_h))  # RMSE

    return {'weights': weights, 'tr_r2': tr_r2,
                'ts_r2': ts_r2, 'tr_mse': tr_mse, 'ts_mse': ts_mse}