Example #1
    def _get_drop_vari(self, attributes):
        '''regress endog on exog without one of the variables

        This uses a k_vars loop; only attributes of the OLS results instance are stored.

        Parameters
        ----------
        attributes : list of strings
           These are the names of the attributes of the auxiliary OLS results
           instance that are stored and returned.

        not yet used
        '''
        from collections import defaultdict

        from statsmodels.sandbox.tools.cross_val import LeaveOneOut

        endog = self.results.model.endog
        exog = self.exog

        cv_iter = LeaveOneOut(self.k_vars)
        res_loo = defaultdict(list)
        for inidx, outidx in cv_iter:
            # fit once per left-out variable, then store the requested
            # attributes of the auxiliary results instance
            res_i = self.model_class(endog, exog[:, inidx]).fit()
            for att in attributes:
                res_loo[att].append(getattr(res_i, att))

        return res_loo
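
For reference, `LeaveOneOut(n)` from `statsmodels.sandbox.tools.cross_val` yields one pair of boolean masks per iteration (train mask, test mask), which is why plain boolean indexing such as `exog[:, inidx]` works throughout these examples. A minimal standalone sketch, assuming only that sandbox iterator:

    import numpy as np
    from statsmodels.sandbox.tools.cross_val import LeaveOneOut

    X = np.arange(12).reshape(4, 3)
    for inidx, outidx in LeaveOneOut(4):
        # each mask has length n; exactly one True in outidx per iteration
        print(outidx.nonzero()[0][0], X[inidx].shape)  # left-out index, (3, 3)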
Example #2
    def _res_looo(self):
        '''collect required results from the LOOO loop

        All results will be attached;
        currently only 'params', 'mse_resid', 'det_cov_params' are stored.

        Regresses endog on exog, dropping one observation at a time.

        This uses a nobs loop; only attributes of the OLS instance are stored.
        '''
        from statsmodels.sandbox.tools.cross_val import LeaveOneOut
        get_det_cov_params = lambda res: np.linalg.det(res.cov_params())

        endog = self.endog
        exog = self.exog

        params = np.zeros(exog.shape, dtype=float)
        mse_resid = np.zeros(endog.shape, dtype=float)
        det_cov_params = np.zeros(endog.shape, dtype=float)

        cv_iter = LeaveOneOut(self.nobs)
        for inidx, outidx in cv_iter:
            res_i = self.model_class(endog[inidx], exog[inidx]).fit()
            params[outidx] = res_i.params
            mse_resid[outidx] = res_i.mse_resid
            det_cov_params[outidx] = get_det_cov_params(res_i)

        return dict(params=params, mse_resid=mse_resid,
                       det_cov_params=det_cov_params)
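
These per-observation arrays are the raw ingredients for classical deletion diagnostics; for instance, the change in coefficients from dropping each observation (dfbeta) is just the full-sample estimate minus the corresponding leave-one-out row. A hedged consumption sketch, assuming an influence-helper instance `infl` whose `results` attribute holds the full-sample OLS fit as in Example #1:

    looo = infl._res_looo()
    # dfbeta[i]: full-sample params minus params estimated without obs i
    dfbeta = infl.results.params - looo['params']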
Example #3
    def loo(self, **kwargs):
        """ Leave one out cross-validation.

        Calculates summary statistics for each iteration of
        leave-one-out cross-validation, specifically `model_mse` on the
        entire model and `pred_mse` to measure prediction error.

        Parameters
        ----------
        **kwargs : dict
           Keyword arguments used to tune the parameter estimation.

        Returns
        -------
        pd.DataFrame
           model_mse : np.array, float
               Mean sum of squares error for each iteration of
               the cross validation.
           pred_mse : np.array, float
               Prediction mean sum of squares error for each iteration of
               the cross validation.

        See Also
        --------
        fit
        statsmodels.regression.linear_model.
        """
        # number of observations (i.e. samples)
        nobs = self.response_matrix.shape[0]
        cv_iter = LeaveOneOut(nobs)

        results = pd.DataFrame(index=self.response_matrix.index,
                               columns=['model_mse', 'pred_mse'],
                               dtype=np.float64)

        for i, (train, test) in enumerate(cv_iter):
            sample_id = self.response_matrix.index[i]

            res_i = OLSModel(self.response_matrix.iloc[train],
                             self.design_matrix.iloc[train])
            res_i.fit(**kwargs)

            # model error
            predicted = res_i.predict(X=self.design_matrix.iloc[train])
            r = self.response_matrix.iloc[train].values
            p = predicted.values
            model_resid = ((r - p)**2)
            model_mse = np.mean(model_resid.sum(axis=0))
            results.loc[sample_id, 'model_mse'] = model_mse

            # prediction error
            predicted = res_i.predict(X=self.design_matrix.iloc[test])
            r = self.response_matrix.iloc[test].values
            p = predicted.values
            pred_resid = ((r - p)**2)
            pred_mse = np.mean(pred_resid.sum(axis=0))
            results.loc[sample_id, 'pred_mse'] = pred_mse

        return results
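
A hypothetical usage sketch (the names `response` and `design` stand for pandas DataFrames aligned on a shared sample index). Comparing the two columns is a quick overfitting check: a `pred_mse` far above `model_mse` flags samples the model fits in-sample but cannot predict out-of-sample:

    model = OLSModel(response, design)
    model.fit()
    cv = model.loo()
    print(cv['pred_mse'].mean() / cv['model_mse'].mean())  # >> 1 suggests overfitting
    print(cv.nlargest(3, 'pred_mse'))  # the three hardest samples to predict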
Example #4
File: _ols.py  Project: ebolyen/gneiss
    def loo(self, **kwargs):
        """ Leave one out cross-validation.

        Calculates summary statistics for each iteration of
        leave-one-out cross-validation, specifically `mse` on the entire
        model and `pred_err` to measure prediction error.

        Parameters
        ----------
        **kwargs : dict
           Keyword arguments used to tune the parameter estimation.

        Returns
        -------
        pd.DataFrame
           mse : np.array, float
               Mean sum of squares error for each iteration of
               the cross validation.
           pred_err : np.array, float
               Prediction mean sum of squares error for each iteration of
               the cross validation.

        See Also
        --------
        fit
        statsmodels.regression.linear_model.
        """

        nobs = self.balances.shape[0]  # number of observations (i.e. samples)
        cv_iter = LeaveOneOut(nobs)
        endog = self.balances
        exog_names = self.results[0].model.exog_names
        exog = pd.DataFrame(self.results[0].model.exog,
                            index=self.balances.index,
                            columns=exog_names)
        results = pd.DataFrame(index=self.balances.index,
                               columns=['mse', 'pred_err'])

        for i, (inidx, outidx) in enumerate(cv_iter):
            sample_id = self.balances.index[i]
            model_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs)
            res_i = [r.fit(**kwargs) for r in model_i]

            # mean sum of squares error
            sse = sum([r.ssr for r in res_i])
            # degrees of freedom for residuals
            dfe = res_i[0].df_resid
            results.loc[sample_id, 'mse'] = sse / dfe

            # prediction error on loo point
            predicted = np.hstack([r.predict(exog.loc[outidx]) for r in res_i])

            pred_sse = np.sum((predicted - self.balances.loc[outidx])**2)
            results.loc[sample_id, 'pred_err'] = pred_sse.sum()
        return results
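
An end-to-end sketch using the project's formula interface. This is an assumption about the surrounding gneiss API: the `ols(formula, table, metadata)` entry point and the `table`/`metadata` frames are illustrative, and the signature has varied across gneiss versions:

    from gneiss.regression import ols

    # table: samples x balances DataFrame; metadata: samples x covariates
    model = ols(formula='age + bmi', table=table, metadata=metadata)
    model.fit()
    cv = model.loo()
    print(cv.sort_values('pred_err', ascending=False).head())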
Example #5
File: _ols.py  Project: ebolyen/gneiss
    def lovo(self, **kwargs):
        """ Leave one variable out cross-validation.

        Calculates summary statistics for each iteration of
        leave-one-variable-out cross-validation, specifically `r2` and
        `mse` on the entire model.
        This technique is particularly useful for feature selection.

        Parameters
        ----------
        **kwargs : dict
           Keyword arguments used to tune the parameter estimation.

        Returns
        -------
        pd.DataFrame
           Rsquared : np.array, float
               Coefficient of determination for each variable left out.
           mse : np.array, float
               Mean sum of squares error for each iteration of
               the cross validation.
        """
        endog = self.balances
        exog_names = self.results[0].model.exog_names
        exog = pd.DataFrame(self.results[0].model.exog,
                            index=self.balances.index,
                            columns=exog_names)
        cv_iter = LeaveOneOut(len(exog_names))
        results = pd.DataFrame(index=exog_names, columns=['mse', 'Rsquared'])
        for i, (inidx, outidx) in enumerate(cv_iter):
            feature_id = exog_names[i]
            res_i = _fit_ols(endog, exog.loc[:, inidx], **kwargs)
            res_i = [r.fit(**kwargs) for r in res_i]
            # See `statsmodels.regression.linear_model.RegressionResults`
            # for more explanation on `ess` and `ssr`.
            # sum of squares regression.
            ssr = sum([r.ess for r in res_i])
            # sum of squares error.
            sse = sum([r.ssr for r in res_i])
            # calculate the overall coefficient of determination (i.e. R2)
            sst = sse + ssr
            results.loc[feature_id, 'Rsquared'] = 1 - sse / sst
            # degrees of freedom for residuals
            dfe = res_i[0].df_resid
            results.loc[feature_id, 'mse'] = sse / dfe
        return results
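
Note that the boolean masks from `LeaveOneOut(len(exog_names))` here run over columns rather than rows, so `exog.loc[:, inidx]` keeps every sample but drops the i-th covariate. A minimal illustration with a toy frame:

    import numpy as np
    import pandas as pd

    exog = pd.DataFrame(np.eye(3), columns=['const', 'age', 'bmi'])
    inidx = np.array([True, False, True])   # the mask produced when i == 1
    print(exog.loc[:, inidx].columns.tolist())  # ['const', 'bmi']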
Example #6
    def lovo(self, **kwargs):
        """ Leave one variable out cross-validation.

        Calculates summary statistics for each iteration of
        leave-one-variable-out cross-validation, specifically `r2` and
        `mse` on the entire model.
        This technique is particularly useful for feature selection.

        Parameters
        ----------
        **kwargs : dict
           Keyword arguments used to tune the parameter estimation.

        Returns
        -------
        pd.DataFrame
           mse : np.array, float
               Mean sum of squares error for each iteration of
               the cross validation.
           Rsquared : np.array, float
               Coefficient of determination for each variable left out.
           R2diff : np.array, float
               Decrease in Rsquared for each variable left out.
        """
        cv_iter = LeaveOneOut(len(self.design_matrix.columns))
        results = pd.DataFrame(index=self.design_matrix.columns,
                               columns=['mse', 'Rsquared', 'R2diff'],
                               dtype=np.float64)
        for i, (inidx, outidx) in enumerate(cv_iter):

            feature_id = self.design_matrix.columns[i]

            res_i = OLSModel(Y=self.response_matrix,
                             Xs=self.design_matrix.iloc[:, inidx])
            res_i.fit(**kwargs)
            predicted = res_i.predict()
            r = self.response_matrix.values
            p = predicted.values

            model_resid = ((r - p)**2)
            model_mse = np.mean(model_resid.sum(axis=0))
            results.loc[feature_id, 'mse'] = model_mse
            results.loc[feature_id, 'Rsquared'] = res_i.r2
            results.loc[feature_id, 'R2diff'] = self.r2 - res_i.r2
        return results
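
Since `R2diff` records how much explained variance is lost when a covariate is dropped, sorting on it gives a quick variable-importance ranking. A hypothetical follow-up, assuming a fitted `model` exposing this `lovo`:

    importance = model.lovo().sort_values('R2diff', ascending=False)
    print(importance[['R2diff', 'mse']])  # largest drop first = most informative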
Example #7
    def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
        '''estimate the model and selection criteria for up to maxfact factors

        The selection criteria calculated are AIC, BIC, and R2_adj, and
        additionally the cross-validation prediction error sum of squares
        if `skip_crossval` is false. Cross-validation is not used by
        default because it can be time consuming to calculate.

        By default the cross-validation method is leave-one-out on the
        full dataset. A different cross-validation sample can be specified
        as an argument to `cv_iter`.

        Results are attached in `results_find_nfact`.
        '''
        #print 'OLS on Factors'
        if not hasattr(self, 'factors'):
            self.calc_factors()

        hasconst = self.hasconst
        if maxfact is None:
            maxfact = self.factors.shape[1] - hasconst

        if (maxfact + hasconst) < 1:
            raise ValueError(
                'nothing to do, number of factors (incl. constant) should ' +
                'be at least 1')

        #temporary safety
        maxfact = min(maxfact, 10)

        y0 = self.endog
        results = []
        #xred, fact, eva, eve  = pca(x0, keepdim=0, normalize=1)
        for k in range(1, maxfact + hasconst):  # k now includes the constant
            #xred, fact, eva, eve  = pca(x0, keepdim=k, normalize=1)
            # this is faster and same result
            fact = self.factors[:, :k]
            res = sm.OLS(y0, fact).fit()
            ##    print 'k =', k
            ##    print res.params
            ##    print 'aic:  ', res.aic
            ##    print 'bic:  ', res.bic
            ##    print 'llf:  ', res.llf
            ##    print 'R2    ', res.rsquared
            ##    print 'R2 adj', res.rsquared_adj

            if not skip_crossval:
                if cv_iter is None:
                    cv_iter = LeaveOneOut(len(y0))
                prederr2 = 0.
                for inidx, outidx in cv_iter:
                    res_l1o = sm.OLS(y0[inidx], fact[inidx, :]).fit()
                    #print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
                    prederr2 += (y0[outidx] - res_l1o.model.predict(
                        res_l1o.params, fact[outidx, :]))**2.
            else:
                prederr2 = np.nan

            results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

        self.results_find_nfact = results = np.array(results)
        self.best_nfact = np.r_[(np.argmin(results[:, 1:3],
                                           0), np.argmax(results[:, 3], 0),
                                 np.argmin(results[:, -1], 0))]
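
The attached `results_find_nfact` array has one row per candidate k with columns [k, aic, bic, rsquared_adj, prederr2], and `best_nfact` stacks the row indices that optimize each criterion. A hedged reading, with `mod` standing in for an instance of the surrounding class:

    mod.fit_find_nfact(maxfact=5, skip_crossval=False)
    res = mod.results_find_nfact
    # columns: k, AIC, BIC, adjusted R2, LOO prediction error sum of squares
    best_k_by_aic = int(res[np.argmin(res[:, 1]), 0])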
Example #8
results = []
xred, fact, eva, eve = pca(x0, keepdim=0, normalize=1)
for k in range(0, x0.shape[1] + 1):
    #xred, fact, eva, eve  = pca(x0, keepdim=k, normalize=1)
    # this is faster and same result
    fact_wconst = sm.add_constant(fact[:, :k], prepend=False)
    res = sm.OLS(y0, fact_wconst).fit()
    ##    print 'k =', k
    ##    print res.params
    ##    print 'aic:  ', res.aic
    ##    print 'bic:  ', res.bic
    ##    print 'llf:  ', res.llf
    ##    print 'R2    ', res.rsquared
    ##    print 'R2 adj', res.rsquared_adj
    prederr2 = 0.
    for inidx, outidx in LeaveOneOut(len(y0)):
        resl1o = sm.OLS(y0[inidx], fact_wconst[inidx, :]).fit()
        # model.predict takes (params, exog); cf. the same call in Example #7
        prederr2 += (y0[outidx] -
                     resl1o.model.predict(resl1o.params,
                                          fact_wconst[outidx, :]))**2.
    results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

results = np.array(results)
print(results)
print('best result for k, by AIC, BIC, R2_adj, L1O')
print(np.r_[(np.argmin(results[:, 1:3], 0),
             np.argmax(results[:, 3], 0),
             np.argmin(results[:, -1], 0))])

from statsmodels.iolib.table import (SimpleTable, default_txt_fmt,
                                     default_latex_fmt, default_html_fmt)
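
The trailing import suggests the script continues by tabulating `results`; that continuation is not shown here, but a minimal sketch of the kind of SimpleTable output it enables (illustrative only) would be:

    headers = ['k', 'AIC', 'BIC', 'R2_adj', 'L1O']
    tbl = SimpleTable(results, headers=headers, txt_fmt=default_txt_fmt)
    print(tbl.as_text())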
Example #9
    def _res_looo(self):
        """collect required results from the LOOO loop

        all results will be attached.
        currently only 'params', 'mse_resid', 'det_cov_params' are stored

        Reestimates the model with endog and exog dropping one observation
        at a time

        This uses a nobs loop, only attributes of the results instance are
        stored.

        Warning: This will need refactoring and API changes to be able to
        add options.
        """
        from statsmodels.sandbox.tools.cross_val import LeaveOneOut
        get_det_cov_params = lambda res: np.linalg.det(res.cov_params())

        endog = self.results.model.endog
        exog = self.results.model.exog

        init_kwds = self.results.model._get_init_kwds()
        # We need to drop obs also from extra arrays
        freq_weights = init_kwds.pop('freq_weights')
        var_weights = init_kwds.pop('var_weights')
        offset = offset_ = init_kwds.pop('offset')
        exposure = exposure_ = init_kwds.pop('exposure')
        n_trials = init_kwds.pop('n_trials', None)
        # family Binomial creates `n` i.e. `n_trials`
        # we need to reset it
        # TODO: figure out how to do this properly
        if hasattr(init_kwds['family'], 'initialize'):
            # assume we have Binomial
            is_binomial = True
        else:
            is_binomial = False

        params = np.zeros(exog.shape, dtype=float)
        scale = np.zeros(endog.shape, dtype=float)
        det_cov_params = np.zeros(endog.shape, dtype=float)

        cv_iter = LeaveOneOut(self.nobs)
        for inidx, outidx in cv_iter:
            if offset is not None:
                offset_ = offset[inidx]
            if exposure is not None:
                exposure_ = exposure[inidx]
            if n_trials is not None:
                init_kwds['n_trials'] = n_trials[inidx]

            mod_i = self.model_class(endog[inidx],
                                     exog[inidx],
                                     offset=offset_,
                                     exposure=exposure_,
                                     freq_weights=freq_weights[inidx],
                                     var_weights=var_weights[inidx],
                                     **init_kwds)
            if is_binomial:
                mod_i.family.n = init_kwds['n_trials']
            res_i = mod_i.fit(start_params=self.results.params,
                              method='newton')
            params[outidx] = res_i.params.copy()
            scale[outidx] = res_i.scale
            det_cov_params[outidx] = get_det_cov_params(res_i)

        return dict(params=params,
                    scale=scale,
                    mse_resid=scale,  # alias for now
                    det_cov_params=det_cov_params)
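
The determinant ratio is what drives the COVRATIO influence measure, which compares the leave-one-out parameter covariance against the full-sample one. A hedged consumption sketch, again with a hypothetical `infl` exposing the full-sample `results`:

    looo = infl._res_looo()
    det_full = np.linalg.det(infl.results.cov_params())
    # cov_ratio[i] > 1: deleting obs i inflates the parameter covariance
    cov_ratio = looo['det_cov_params'] / det_full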