Example #1
import numpy as np
import patsy
from statsmodels.api import OLS, add_constant


def test_linearity(x, y, n_knots=5, verbose=True):
    """Test linearity between two variables.

    Run a linear regression of y on x, and take the residuals.
    Fit the residuals with a natural spline with `n_knots` knots.
    Conduct a joint F-test for all columns in the natural spline basis matrix.

    Example:
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> x = np.linspace(0., 1., 101)
    >>> y = 5 * x + 3 + rng.random(size=101) / 5
    >>> test_linearity(x, y, n_knots=5, verbose=False)
    0.194032
    """
    residuals = OLS(y, add_constant(x)).fit().resid
    basis_matrix = patsy.dmatrix(
        f"cr(x, df={n_knots - 1}, constraints='center') - 1", {'x': x},
        return_type='dataframe')
    results = OLS(residuals, basis_matrix).fit()
    nobs = results.nobs
    f_value = results.fvalue
    p_value = np.round(results.f_pvalue, 6)
    if verbose:
        print('Test for Linearity: '
              f'N = {nobs:.0f}; df={nobs - n_knots - 1:.0f}; '
              f'F = {f_value:.3f}; p = {p_value:.6f}.')
    return p_value
Example #2
    def _fit_backward(self):

        y_train = pd.Series(self._model.model.endog.copy(),
                            name=self.dependent_variable,
                            index=self._observations_idx)
        X_train = pd.DataFrame(self._model.model.exog,
                               columns=self._model.model.exog_names,
                               index=self._observations_idx)

        model = OLS(y_train, X_train, missing='drop')

        results = model.fit()

        max_pvalue = results.pvalues.drop('Intercept').max()

        while max_pvalue > self.sig_level_removal:
            x_to_drop = results.pvalues.drop('Intercept').idxmax()
            X_train = X_train.drop(x_to_drop, axis=1)
            model = OLS(y_train, X_train, missing='drop')
            results = model.fit()
            max_pvalue = results.pvalues.drop('Intercept').max()

        self._model = results

        return
Example #3
def get_cointLst(corrList, df_is):
    # called in main
    # Cointegration test: it has to be performed on both sides of the spread
    cointLst = []
    for pair in corrList:
        X1, X2 = df_is[pair[0]].values, df_is[pair[1]].values

        x1 = add_constant(X1)
        x2 = add_constant(X2)
        r1 = OLS(X2, x1).fit()
        r2 = OLS(X1, x2).fit()

        adf1 = adfuller(r1.resid)[1]
        if adf1 < 0.01:
            adf2 = adfuller(r2.resid)[1]
            if adf2 < 0.01 and adf1 < adf2:  # both directions pass; keep the one with the smaller ADF p-value
                cointLst.append(["{0}_{1}".format(pair[0], pair[1])] + pair +
                                [adf1] + list(r1.params))
            elif adf2 < 0.01:
                cointLst.append(["{0}_{1}".format(pair[1], pair[0])] +
                                [pair[1], pair[0], pair[2], pair[3], adf2] +
                                list(r2.params))

    #print "There are {0} pairs strongly cointegrated.".format(len(cointLst))
    return cointLst
Example #4
def backwardElimination(x, y, SL):
    # y is passed in explicitly rather than read from a global.
    # Backward elimination with adjusted-R^2 rollback: drop the least
    # significant feature, but restore it if adjusted R^2 deteriorates.
    numVars = len(x[0])
    temp = np.zeros(x.shape)  # buffer for removed columns (was a hardcoded (50, 6) int array)
    for i in range(0, numVars):
        regressor_OLS = OLS(y, x).fit()
        print(regressor_OLS.summary())
        maxVar = max(regressor_OLS.pvalues).astype(float)
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        if maxVar > SL:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    temp[:, j] = x[:, j]
                    x = np.delete(x, j, 1)
                    tmp_regressor = OLS(y, x).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    if (adjR_before >= adjR_after):
                        x_rollback = np.hstack((x, temp[:, [0, j]]))
                        x_rollback = np.delete(x_rollback, j, 1)
                        print(regressor_OLS.summary())
                        return x_rollback
                    else:
                        continue
        else:
            break
    return x
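# Usage sketch (not from the original source): exercising backwardElimination
# on synthetic data, with y passed explicitly. np/OLS/add_constant as above.
import numpy as np
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(0)
X_demo = add_constant(rng.random((50, 5)))                  # intercept + 5 candidate features
y_demo = 3 * X_demo[:, 1] + 0.1 * rng.standard_normal(50)   # only feature 1 matters

X_selected = backwardElimination(X_demo, y_demo, 0.05)
print(X_selected.shape)  # most noise columns should have been dropped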
Example #5
import pandas as pd
import statsmodels.api as sm
from statsmodels.api import OLS


def stepwise_selection(data, target, SL_in=0.05, SL_out=0.05):
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features) > 0):
        remaining_features = list(set(initial_features) - set(best_features))
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = OLS(target,
                        sm.add_constant(data[best_features +
                                             [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if (min_p_value < SL_in):
            best_features.append(new_pval.idxmin())
            while (len(best_features) > 0):
                best_features_with_constant = sm.add_constant(
                    data[best_features])
                p_values = OLS(target,
                               best_features_with_constant).fit().pvalues[1:]
                max_p_value = p_values.max()
                if (max_p_value >= SL_out):
                    excluded_feature = p_values.idxmax()
                    best_features.remove(excluded_feature)
                else:
                    break
        else:
            break
    return best_features
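# Usage sketch (not from the original source): stepwise selection on toy data
# where only 'a' and 'c' drive the target.
import numpy as np

rng = np.random.default_rng(1)
demo = pd.DataFrame(rng.standard_normal((200, 4)), columns=list('abcd'))
y_demo = 2 * demo['a'] - 1.5 * demo['c'] + 0.1 * rng.standard_normal(200)

print(stepwise_selection(demo, y_demo))  # expected: ['a', 'c']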
Example #6
def find_apex(decel):
    res = []
    for t in decel.index[10::10]:
        # split the trace at t and fit a straight line to each side
        left = decel[:t]['accelY']
        right = decel[t:]['accelY']
        left_mod = OLS(left, add_constant(np.arange(len(left)))).fit()
        right_mod = OLS(right, add_constant(np.arange(len(right)))).fit()
        res.append([t, left_mod.ssr, right_mod.ssr])
    apex = min(res, key=lambda x: x[1] + x[2])[0]  # split with the lowest total SSR
    return apex
Example #7
def calc_gwi(obs,obs_years,reg_type='mon',base_low=1850.,base_high=1900, name=''):
    
    #Express the observations relative to the base period 
    obs = obs - np.mean(obs[np.logical_and(obs_years>=base_low,obs_years<(base_high+1))])

    #Load the best estimate forcings from Piers
    forc_file = './Data/Annualforcings_Mar2014_GHGrevised.txt'
    data = np.genfromtxt(forc_file,skip_header=4)
    years = data[:,0]
    tot_forc = data[:,13]
    ant_forc = data[:,14]
    
    #Integrate anthropogenic and natural forcing with standard FAIR parameters
    C, t_nat = fair_scm(other_rf=tot_forc-ant_forc)
    C, t_anthro = fair_scm(other_rf=ant_forc)
    #Express relative to the centre of the base period
    t_nat = t_nat - np.mean(t_nat[np.logical_and(years>=base_low,years<base_high+1)])
    t_anthro = t_anthro - np.mean(t_anthro[np.logical_and(years>=base_low,years<base_high+1)])
    # -----------------------------------------------
    
    
    # Prepare the temperatures run through FaIR, so they lie on same year-grid as observations, so they can be compared
    # -----------------------------------------------
    #Interpolate the annual forced responses to the grid of the observed data
    if reg_type !='mon':
        t_nat = np.interp(obs_years+0.5, years+0.5, t_nat)
        t_anthro = np.interp(obs_years+0.5, years+0.5, t_anthro)
    else:
        t_nat = np.interp(obs_years, years+0.5, t_nat)
        t_anthro = np.interp(obs_years, years+0.5, t_anthro)

    #Linearly project the final half year beyond the end of the forcing data
    for series in (t_anthro, t_nat):
        known = obs_years <= (years[-1] + 0.5)
        #per-year slope from the last two monthly points
        slope = 12 * (series[known][-1] - series[known][-2])
        series[~known] = slope * (obs_years[~known] - obs_years[known][-1]) + series[known][-1]
    # -----------------------------------------------
    
    #Use the statsmodels OLS regression function to regress the observed temperatures on natural and anthropogenic warming with a constant
    y = np.copy(obs)
    x = DataFrame({'x1': (t_anthro), 'x2': (t_nat)})
    # add constant vector on to dataframe we will fit to temp observations
    x = statsmodels.tools.tools.add_constant(x)
    # complete OLS regression of anthropogenic and natural temperatures (found from FaIR integrated best estimate forcing) onto given observed temperature dataset.
    model = OLS(y, x)
    result = model.fit()
    # collect output scaling factors for anthro and natural temperature timeseries
    sf = result.params

    #Form scaled anthropogenic warming index
    awi = t_anthro * sf['x1']
    #Scaled natural warming index
    nwi = t_nat * sf['x2']
    #Scaled total externally forced warming index
    gwi = awi + nwi
    
    print(name, ' AWI scale factor: ', sf['x1'], '\n', name, ' NWI scale factor: ', sf['x2'])

    
    return awi, nwi, gwi
Example #8
def factor_alpha_beta(
        factor_data: pd.DataFrame,
        returns: pd.DataFrame = None,
        demeaned: bool = True,
        group_adjust: bool = False,
        equal_weight: bool = False,
):
    """
    计算因子的 alpha (超额收益), alpha 的 t-统计量 以及 beta 值

    参数
    ---
    :param factor_data: 索引为 ['日期' '股票'] 的 MultiIndex, values 包括因子值,远期收益,因子分位,因子分组 [可选]
    :param returns: 因子远期收益,默认为 None, 如果为 None 的时候,会通过调用 `factor_returns` 来计算相应的收益
    :param demeaned: 是否基于一个多空组合
    :param group_adjust: 是否进行行业中性处理
    :param equal_weight:

    返回
    ---
    """
    if returns is None:
        returns = factor_returns(
            factor_data,
            demeaned,
            group_adjust,
            equal_weight
        )

    universe_ret = (
        factor_data.groupby(level="datetime")[get_forward_returns_columns(
            factor_data.columns
        )].mean().loc[returns.index]
    )

    if isinstance(returns, pd.Series):
        returns.name = universe_ret.columns.values[0]
        returns = pd.DataFrame(returns)

    alpha_beta = pd.DataFrame()
    for period in returns.columns.values:
        x = universe_ret[period].values
        y = returns[period].values
        x = add_constant(x)

        reg_fit = OLS(y, x).fit()
        try:
            alpha, beta = reg_fit.params
        except ValueError:
            alpha_beta.loc["Ann. alpha", period] = np.nan
            alpha_beta.loc["beta", period] = np.nan
        else:
            freq_adjust = pd.Timedelta(days=DAYS_PER_YEAR) / pd.Timedelta(
                utils.get_period(period.replace("period_",
                                                ""))
            )
            alpha_beta.loc["Ann. alpha",
                           period] = (1 + alpha)**freq_adjust - 1.0
            alpha_beta.loc["beta", period] = beta
    return alpha_beta
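# Arithmetic check of the annualization step above (assuming freq_adjust
# works out to 252 periods per year for a 1-day holding period):
alpha_daily = 0.0002                      # 2 bps per day
ann_alpha = (1 + alpha_daily) ** 252 - 1  # same compounding as the loop above
print(f"{ann_alpha:.2%}")                 # ≈ 5.17%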
Example #9
    def remove_outliers(train, targetField, dropVal, studentResid, verbose=True):
        """
        Remove outliers from training data based on statsmodels OLS Fit studentized residuals and specified drop values across features

        :param pandas.DataFrame train: data for training
        :param str targetField: target from train/ test :py:class:`pandas.DataFrame`
        :param obj dropVal: value; rows containing it anywhere are dropped
        :param float studentResid: threshold on the absolute studentized residuals
        :param bool verbose: flag to print out OLS summary information and number of outliers removed
        """

        train = train.dropna()
        if dropVal is not None:
            train = train.loc[(train.T != dropVal).all()]

        design = train[[i for i in train if i != targetField]]
        target = train[targetField]

        design = StandardScaler().fit_transform(design)
        results = OLS(target, design).fit()
        mask = np.ones((train.shape[0])).astype(bool)
        if studentResid is not None:
            # threshold on the absolute studentized residuals
            mask = (results.outlier_test()['student_resid'].abs() < studentResid)

        if verbose:
            print(results.summary())
            print('Removed:' + str(train.shape[0] - sum(mask)))

        return train.loc[mask]
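# Hypothetical call (not from the original source), assuming the method is
# reachable as a plain function and sklearn's StandardScaler is imported:
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
demo = pd.DataFrame({'x': rng.standard_normal(100)})
demo['y'] = 2 * demo['x'] + 0.1 * rng.standard_normal(100)
demo.loc[0, 'y'] = 25.0  # plant one gross outlier

clean = remove_outliers(demo, targetField='y', dropVal=None,
                        studentResid=3.0, verbose=False)
print(len(demo), '->', len(clean))  # expect one row removed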
Example #10
 def run_acc_compare(self, print_summary=False, data_df=None):
     #if regressiondict is None:
     #    regressiondict=self.modeldict['regressiondict']
     if data_df is None:
         self.set_flat_c_stats_df()
         data_df = self.flat_c_stats_df
     data_df.dropna(inplace=True, axis=0)
     y_df = data_df.loc[:, 'accuracy']
     X_df = data_df.drop(labels='accuracy', axis=1, inplace=False)
     #print('y_df',y_df)
     #print('X_df',X_df)
     X_dtypes_ = dict(X_df.dtypes)
     obj_vars = [
         var for var, dtype in X_dtypes_.items() if dtype == 'object'
     ]
     #float_idx=[i for i in range(X_df.shape[1]) if i not in obj_idx]
     #self.model=regressiondict['pipeline'](cat_idx=obj_idx,float_idx=float_idx)
     X_float_df = self.floatify_df(X_df, obj_vars)
     #X_float_df=add_constant(X_float_df)
     self.X_float_df = X_float_df
     self.y_df = y_df
     self.model = OLS(y_df, X_float_df)
     self.model_result = self.model.fit()
     if print_summary:
         print('OLS results for modeldict:')
         print(self.modeldict)
         print(self.model_result.summary())
Example #11
def prosperity_score_regression(cards,
                                metadata,
                                score_columns=score_column_names):
    """
    Perform a linear regression to determine the degree to which the
    Prosperity add-on treasure and victory cards contribute to a good
    score.
    """
    prosperity = set(cards['currency'].columns.get_level_values(1))
    # victory_cards = set(cards['victory'].columns.get_level_values(1))
    # cards = currency_cards.union(victory_cards)
    scores = np.mean(metadata.loc[:, tuple(score_columns)], axis=1)

    # Ignore missing cells
    refine_idx = np.isfinite(scores)
    scores = scores[refine_idx]

    set_counts = pd.concat([
        pd.DataFrame(cards.loc[refine_idx, pd.IndexSlice[:, :, c]].values,
                     columns=[c]) for c in prosperity
    ] + [
        pd.DataFrame(np.ones((scores.size, 1)), columns=['Average game score'])
    ],
                           axis=1).fillna(0)

    results = OLS(scores, set_counts).fit()
    print(results.summary())
Example #12
def linear_regression(data):
    """
    goal of this function :
        - to apply a linear regression ; ie. to calculate the coefficient and
        the intercept value of the regression line
    input parameter :
        - json file's content (data)
    output :
        - dict containing the coefficient value and intercept for each word
    cmd packages :
        - numpy (ones, arange)
        - statsmodels.api (ols)
    """

    #initialisation
    dict_linreg = {}

    #for each entry in the json file (data)
    #intercept value and coefficient calculation
    for k, v in data.items():
        mat_x = np.ones((len(v), 2))
        mat_x[:, 1] = np.arange(0, len(v))

        reg = OLS(v, mat_x)
        results = reg.fit()

        dict_linreg[k] = [results.params[1], results.params[0]]

    return dict_linreg
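# Example input/output (np and OLS imported as the example assumes): one
# series of counts per word; the result maps word -> [slope, intercept].
demo = {'alpha': [1, 2, 2, 4, 5], 'beta': [10, 9, 7, 6, 4]}
print(linear_regression(demo))
# {'alpha': [1.0, 0.8], 'beta': [-1.5, 10.2]} (up to float formatting)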
Example #13
    def residual_k(self):
        """
        Residual series of the K factors

        Returns:
            np.array, shape=(k Factors, T periods)

        """

        from statsmodels.api import OLS
        res = []
        idx_lst = list(range(self.K))
        for i in range(self.K):
            x_idx, y_idx = [*idx_lst[0:i], *idx_lst[i + 1:]], idx_lst[i:i + 1]
            x_data = self.mtx_factors[:, x_idx]
            y_data = self.mtx_factors[:, y_idx]
            # from sklearn.linear_model import LinearRegression
            # model = LinearRegression(fit_intercept=True).fit(x_data, y_data)

            # note: statsmodels OLS does not add an intercept by default
            model = OLS(y_data, x_data).fit()
            res.append(model.resid)

        return np.array(res)
Example #14
def nuevo_regress():
    # note: no constant is added, so the fit is forced through the origin
    modelo = OLS(DATASET.puntaje_global, DATASET.puntaje_matematicas).fit()
    summary = modelo.summary()
    vals_residuales = modelo.resid
    print(summary)
    print(anderson(vals_residuales))
    grafica_qq(vals_residuales)
Example #15
def optimal_spreads_regression(cov_matrix, mid, market_rel_spread):
    regressors = 3*pd.DataFrame([np.diag(cov_matrix)], ['Variance'], mid.index).T
    regressors['Inverse decay'] = 1
    fit = OLS(market_rel_spread*mid, regressors).fit()
    risk_aversion = fit.params['Variance']
    intensity_decay = 2/fit.params['Inverse decay']
    return risk_aversion, intensity_decay, fit.rsquared
Example #16
def _compute_vif(exog, exog_idx, weights=None, model_config=None):
    """
    Compute variance inflation factor, VIF, for one exogenous variable
    for OLS and WLS that allows weights.
    Parameters
    ----------
    exog: X features [X_1, X_2, ..., X_n]
    exog_idx: ith index for features
    weights: weights
    model_config: {"hasconst": True,
    "cov_type": "HC3"} by default
    
    Returns: vif
    -------
    """
    if model_config is None:
        model_config = {"hasconst": True,
                        "cov_type": "HC3"}
    k_vars = exog.shape[1]
    x_i = exog[:, exog_idx]
    mask = np.arange(k_vars) != exog_idx
    x_noti = exog[:, mask]
    if weights is None:
        r_squared_i = OLS(x_i,
                          x_noti,
                          hasconst=model_config["hasconst"]).fit().rsquared
    else:
        r_squared_i = WLS(x_i,
                          x_noti,
                          hasconst=model_config["hasconst"],
                          weights=weights).fit(
            cov_type=model_config["cov_type"]).rsquared
    vif = 1. / (1. - r_squared_i)
    return vif
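# Cross-check (not from the original source): in the unweighted case the helper
# should match statsmodels' own variance_inflation_factor on the same design.
import numpy as np
from statsmodels.api import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(3)
x1 = rng.standard_normal(500)
x2 = x1 + 0.1 * rng.standard_normal(500)   # nearly collinear with x1
exog_demo = add_constant(np.column_stack([x1, x2]))

print(_compute_vif(exog_demo, 1))                  # helper above
print(variance_inflation_factor(exog_demo, 1))     # statsmodels reference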
Example #17
    def alpha_beta(self):
        rr = (self.X - 1).mean(1)

        m = OLS(self.r - 1, np.vstack([np.ones(len(self.r)), rr]).T)
        reg = m.fit()
        alpha, beta = reg.params.const * 252, reg.params.x1  # annualize (assumes daily data)
        return alpha, beta
Example #18
def capm(y: pd.Series, bases: pd.DataFrame, rf=0.0, fee=0.0):
    freq = _freq(y.index)
    rf = rf / freq
    fee = fee / freq
    R = y.pct_change() - rf
    R.name = y.name
    R_base = bases.pct_change().sub(rf, axis=0)

    # CAPM:
    # R = alpha + rf + beta * (Rm - rf)
    model = OLS(R, R_base.assign(Intercept=1), missing="drop").fit()

    alpha = model.params["Intercept"] * freq
    betas = model.params[bases.columns]

    # reconstruct artificial portfolio
    proxy = R_base @ betas + (1 - betas.sum()) * (rf + fee)
    cumproxy = (1 + proxy).cumprod()

    # residual portfolio
    r = y.pct_change() - cumproxy.pct_change()
    residual = (1 + r).cumprod()

    return {
        "alpha": alpha,
        "betas": betas,
        "cumproxy": cumproxy,
        "model": model,
        "residual": residual,
    }
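# Usage sketch (not from the original source). Assumes the module's _freq
# helper (not shown) returns periods per year for the index, e.g. 252.
import numpy as np
import pandas as pd

idx = pd.bdate_range('2020-01-01', periods=500)
rng = np.random.default_rng(4)
mkt = pd.Series(100 * np.exp(np.cumsum(0.0003 + 0.01 * rng.standard_normal(500))),
                index=idx, name='MKT')
fund = 100 * (mkt / mkt.iloc[0]) ** 1.2  # rough 1.2x-levered proxy of the market
fund.name = 'FUND'

out = capm(fund, mkt.to_frame(), rf=0.02)
print(out['alpha'], out['betas'])  # beta should come out near 1.2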
Example #19
    def _capm_mu(self, asset, markets, mu, sigma, X):
        """Calculate mean estimated by CAPM."""
        freq = tools.freq(X.index)
        X = X[[asset] + markets].dropna()
        res = OLS(X[asset] - 1 - self.rfr / freq, add_constant(X[markets] - 1 - self.rfr / freq)).fit()

        beta = res.params.drop(['const'])

        prev_mu = mu[asset]
        new_mu = self.rfr + (mu[markets] - self.rfr).dot(beta)

        alpha = res.params.const * freq
        alpha_std = freq * np.sqrt(res.cov_params().loc['const', 'const'])

        if self.verbose:
            print(f'Beta of {[x for x in beta.round(2)]} changed {asset} mean return '
                  f'from {prev_mu:.1%} to {new_mu:.1%} with alpha {alpha:.2%} ({alpha_std:.2%})')

        # be benevolent and add alpha if it is positive
        # k = 0.2 was fine tuned on DPST in order to get it out of the portfolio
        k = 0.2
        if alpha - k * alpha_std > 0 and asset in ('KRE', 'DPST'):
            if self.verbose:
                print(f'   Adding alpha of {alpha - k * alpha_std:.2%} for {asset}')
            new_mu += alpha - k * alpha_std
        return new_mu
Example #20
def RL_LR_correlation(sessions, fig_no=1):
    '''Correlate the effect of stimulation on the transition predictor
    with RL model parameters across subjects.'''
    # Fit RL model to all trials.
    RL_agent = rl.MFmoMF_MB_dec(['bs','rb','ec','mc'])
    RL_fit = mf.fit_population(sessions, RL_agent)
    # Fit regression model separately to stim and non-stim trials.
    LR_model = lr.config_log_reg()
    LR_model.trial_select['trial_mask'] = 'stim_trials'
    LR_model.trial_select['invert_mask'] = False
    LR_fit_stim = mf.fit_population(sessions, LR_model)
    LR_model.trial_select['invert_mask'] = True
    LR_fit_nons = mf.fit_population(sessions, LR_model)
    # Make data frame with parameter fits for each subject.
    ses_LR_params_stim =  np.vstack([sf['params_T'] for sf in LR_fit_stim['session_fits']])
    ses_LR_params_nons =  np.vstack([sf['params_T'] for sf in LR_fit_nons['session_fits']])
    ses_RL_params = np.vstack([sf['params_T'] for sf in RL_fit['session_fits']])
    ses_df = pd.DataFrame({pn: ses_RL_params[:,i] for i,pn in enumerate(RL_agent.param_names)})
    ses_df['d_trans'] = (ses_LR_params_stim[:,LR_model.param_names.index('trans_CR')] -
                         ses_LR_params_nons[:,LR_model.param_names.index('trans_CR')])
    ses_df['subject'] = np.array([s.subject_ID for s in sessions])
    sub_df = ses_df.groupby('subject').mean()
    # Plot correlation of G_mb with stim effect on transition predictor.
    plt.figure(fig_no, clear=True, figsize=[3.3,3])
    regplot('G_mb', 'd_trans', sub_df)
    plt.xlabel('Model-based weight')
    plt.ylabel('Stim change in\ntransition predictor')
    plt.tight_layout()
    res = linregress(sub_df['G_mb'], sub_df['d_trans'])
    print('Slope: {:.3f} r: {:.3f} P value: {:.4f}'.format(
        res.slope, res.rvalue, res.pvalue))
    # Regress stim effect with multiple RL model parameters.
    X = sub_df[['G_mb','G_td','G_tdm','mc']].copy()  # copy to avoid a SettingWithCopy warning
    X.insert(0, 'const', 1)
    print(OLS(sub_df['d_trans'], X).fit().summary())
Example #21
 def prs_betaci(q, prs, df):
     (q0,q1)=q
     we_print=(q0==2)
     q0=df[prs].quantile((100-q0)/100.0)  # pandas has 99 as the highest; we have 1 as the highest
     q1=df[prs].quantile((100-q1)/100.0)
     q40=df[prs].quantile(0.4)
     q60=df[prs].quantile(0.6)   
     iids=df.index[((q0 <= df[prs]) & (df[prs] <= q1)) | ((q40 <= df[prs]) & (df[prs] <= q60))]
     if is_bin:
         data=np.vstack((expit(models['PRS']['COVAR']['train'].predict(df.loc[iids,covariates])), 
                         (q0 <= df.loc[iids,prs]) & (df.loc[iids,prs] <= q1))).T
         try:
             m=Logit(df.loc[iids,phe_code], data).fit(disp=0)
         except PerfectSeparationError:
             return None,(None,None),None 
         b=np.exp(m.params[1])
         ci=np.abs(np.exp(m.conf_int().iloc[1,:].values)-b)
     else:
         data=np.vstack((models['PRS']['COVAR']['train'].predict(df.loc[iids,covariates]), 
                         (q0 <= df.loc[iids,prs]) & (df.loc[iids,prs] <= q1))).T
         m=OLS(df.loc[iids,phe_code], data).fit()  # disp is a Logit option, not an OLS one
         b=m.params[1]
         ci=np.abs(m.conf_int().iloc[1,:].values-b)
     if we_print:
         print(b, [b-ci[0],b+ci[1]])
     return b,ci,df.loc[(q0 <= df[prs]) & (df[prs] <= q1),phe_code].mean()
Example #22
def FamaMacbeth_statsmodels(ff3, returns, plot_return=False):
    # First stage: N time-series regressions, one per asset or portfolio,
    # of its excess returns on the ff3 factors to estimate the factor loadings
    betas = []
    for equity in returns:
        beta = OLS(endog=returns.loc[returns.index, equity],
                   exog=add_constant(ff3),
                   missing='drop').fit()
        betas.append(beta.params.drop('const'))
    betas = pd.DataFrame(betas, columns=ff3.columns, index=returns.columns)
    # Second stage: T cross-sectional regressions, one per period, to estimate the risk premia
    lambdas = list()
    for period in returns.index:
        lmda = OLS(endog=returns.loc[period, betas.index],
                   exog=betas,
                   missing='drop').fit()
        lambdas.append(lmda.params)
    return betas, lambdas
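# Typical follow-up (not from the original source): turn the per-period
# lambdas into risk-premium estimates with Fama-MacBeth t-statistics.
import numpy as np
import pandas as pd

betas, lambdas = FamaMacbeth_statsmodels(ff3, returns)  # ff3/returns from the caller
lambdas = pd.DataFrame(lambdas, index=returns.index)
premia = lambdas.mean()
t_stats = premia / (lambdas.std(ddof=1) / np.sqrt(len(lambdas)))
print(pd.DataFrame({'premium': premia, 't': t_stats}))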
Example #23
 def testPow(n):
     raw_X = trainData.OverallQual.values.reshape(-1, 1)
     OLS_y = trainData.SalePrice
     X = raw_X**n
     features = sm.add_constant(X)
     ols_sm = OLS(OLS_y.values, features)
     model = ols_sm.fit()
     return model.rsquared
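# Quick scan (not from the original source; trainData/sm come from the
# surrounding script): which power of OverallQual explains SalePrice best?
for n in (0.5, 1, 2, 3):
    print(n, round(testPow(n), 4))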
Example #24
def run():
    varsY = [
        x for x in Y.columns.tolist()
        if Y.columns.tolist().index(x) in listboxY.curselection()
    ]
    varsX = [
        x for x in X.columns.tolist()
        if X.columns.tolist().index(x) in listboxX.curselection()
    ]
    global trainY
    global trainX
    trainY = Y[~data.isnull().T.any().T]
    trainX = X[~data.isnull().T.any().T]
    trainX = add_constant(trainX[varsX])
    testX = X[data.isnull().T.any().T]
    testX = add_constant(testX[varsX])
    result0 = DataFrame(columns=varsY)
    if (len(varsY) == 0):
        messagebox.showinfo('Notice', 'Select at least one outcome variable!')
        return
    if (len(varsX) == 0):
        messagebox.showinfo('Notice', 'Select at least one predictor variable!')
        return
    with ExcelWriter(saveFile, engine="openpyxl") as writer:
        for id, varY in enumerate(varsY):
            fit = OLS(trainY.iloc[:, id], trainX).fit()
            print(fit.summary2().tables)
            result0[varY] = fit.predict(testX)
            result0.to_excel(writer,
                             sheet_name="SUMMARY",
                             header=True,
                             index=True)
            global result1
            result1 = fit.get_prediction(testX).summary_frame()
            result1.to_excel(writer, sheet_name=varY, header=True, index=True)
            global result2
            result2 = fit.summary2().tables
            result2[0].iloc[:, [0, 1]].to_excel(writer,
                                                sheet_name=varY,
                                                header=False,
                                                index=False,
                                                startrow=result1.shape[0] + 2,
                                                startcol=0)
            result2[0].iloc[:, [2, 3]].to_excel(writer,
                                                sheet_name=varY,
                                                header=False,
                                                index=False,
                                                startrow=result1.shape[0] + 2,
                                                startcol=5)
            result2[1].to_excel(writer,
                                sheet_name=varY,
                                header=True,
                                index=True,
                                startrow=result1.shape[0] +
                                result2[0].shape[0] + 3)
    # the "with" context manager already saved and closed the workbook
    messagebox.showinfo('Notice', 'Finished!')
Example #25
 def fit(self, x, y):
     x = array(x).reshape(-1, 1)
     poly = PolynomialFeatures(2)
     model = OLS(y, poly.fit_transform(x)).fit()
     grid = poly.transform(AGES.reshape(-1, 1))  # evaluate on the AGES grid
     self.m = model.predict(grid)
     self.s = wls_prediction_std(model, grid)[0]
     return self
Example #26
def get_half_life(Z):
    # regress one-step changes on the lagged level: dz_t = a + b * z_{t-1} + e
    z_lag = np.roll(Z, 1)
    z_ret = Z - z_lag

    # add an intercept term to X for the regression; drop the first
    # observation, which np.roll wrapped around
    z_lag2 = add_constant(z_lag)
    model = OLS(z_ret[1:], z_lag2[1:]).fit()

    # half-life of mean reversion implied by the AR coefficient b
    return int(-np.log(2) / model.params[1])
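# Sanity check (not from the original source; np/OLS/add_constant as the
# example assumes) on a simulated AR(1) series: with phi = 0.9 the implied
# half-life -log(2)/(phi - 1) is about 6.9.
import numpy as np

rng = np.random.default_rng(5)
z = np.zeros(5000)
for t in range(1, 5000):
    z[t] = 0.9 * z[t - 1] + rng.standard_normal()

print(get_half_life(z))  # expect roughly 7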
Example #27
 def _capm(self):
     rfr = self.rf_rate / self.freq()
     rr = self.ucrp_r - rfr
     if 'CASH' in self.B.columns:
         cash = self.B.CASH
     else:
         cash = 0
     m = OLS(self.r - 1 - (1 - cash) * rfr,
             np.vstack([np.ones(len(self.r)), rr - 1]).T)
     return m.fit()
Example #28
def intermediate():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data),
                     columns=featurizer.columns,
                     index=data.index)

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    if data.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }

    else:
        # Compute linear-regression
        X.insert(loc=0, column='intercept', value=1.)
        lm = OLS(y, X)
        flm = lm.fit()
        logging.info(flm.summary())
        output = format_output(flm)

        result = {
            'summary': output,
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.values.dot(X.values),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
Example #29
def backwardElimination(x, y, sl):
    # y is passed in explicitly rather than read from a global;
    # simpler variant: repeatedly drop the least significant feature
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        print(regressor_OLS.summary())
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    return x
Example #30
 def stats_models(self, X_train, y_train, show_summary=False):
     '''
     Perform OLS from statsmodels;
     return the fitted model results.
     '''
     X = sm.add_constant(X_train)
     model_stats = OLS(y_train, X)
     results_stats = model_stats.fit()
     if show_summary:
         print(results_stats.summary())
     return results_stats