Example #1
    def _plot_index(self, y, ylabel, threshold=None, title=None, ax=None,
                    **kwds):
        from statsmodels.graphics import utils
        fig, ax = utils.create_mpl_ax(ax)
        if title is None:
            title = "Index Plot"
        nobs = len(self.endog)
        index = np.arange(nobs)
        ax.scatter(index, y, **kwds)

        if threshold == 'all':
            large_points = np.ones(nobs, np.bool_)
        else:
            large_points = np.abs(y) > threshold
        psize = 3 * np.ones(nobs)
        # add point labels
        labels = self.results.model.data.row_labels
        if labels is None:
            labels = np.arange(nobs)
        ax = utils.annotate_axes(np.where(large_points)[0], labels,
                                 lzip(index, y),
                                 lzip(-psize, psize), "large",
                                 ax)

        font = {"fontsize" : 16, "color" : "black"}
        ax.set_ylabel(ylabel, **font)
        ax.set_xlabel("Observation", **font)
        ax.set_title(title, **font)
        return fig
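Every snippet on this page leans on statsmodels' small compatibility helper `lzip`. A minimal stand-in, assuming it simply wraps `zip` in a `list` (which matches how the call sites here use it, including for unzipping):

def lzip(*args, **kwargs):
    # eager list(zip(...)); lzip(*pairs) also "unzips" a list of tuples
    return list(zip(*args, **kwargs))

pairs = lzip([1, 2, 3], ['a', 'b', 'c'])
print(pairs)           # [(1, 'a'), (2, 'b'), (3, 'c')]
print(lzip(*pairs))    # [(1, 2, 3), ('a', 'b', 'c')]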
Example #2
def summary_params_2d(result, extras=None, endog_names=None, exog_names=None,
                      title=None):
    '''create summary table of regression parameters with several equations

    This allows interleaving of parameters with bse and/or tvalues

    Parameters
    ----------
    result : result instance
        the result instance with params and attributes in extras
    extras : list of strings
        additional attributes to add below a parameter row, e.g. bse or tvalues
    endog_names : None or list of strings
        names for rows of the parameter array (multivariate endog)
    exog_names : None or list of strings
        names for columns of the parameter array (exog)
    title : None or string
        title for the table

    Returns
    -------
    table : SimpleTable
        the merged table with any extras interleaved below each row of the
        parameter array

    '''
    if endog_names is None:
        # TODO: note the [1:] is specific to current MNLogit
        endog_names = ['endog_%d' % i for i in
                       np.unique(result.model.endog)[1:]]
    if exog_names is None:
        exog_names = ['var%d' % i for i in range(len(result.params))]

    # TODO: check formatting options with different values
    res_params = [[forg(item, prec=4) for item in row] for row in result.params]
    if extras:
        extras_list = [[['%10s' % ('(' + forg(v, prec=3).strip() + ')')
                         for v in col]
                        for col in getattr(result, what)]
                       for what in extras
                       ]
        data = lzip(res_params, *extras_list)
        data = [i for j in data for i in j]  #flatten
        stubs = lzip(endog_names, *[['']*len(endog_names)]*len(extras))
        stubs = [i for j in stubs for i in j] #flatten
    else:
        data = res_params
        stubs = endog_names

    txt_fmt = copy.deepcopy(fmt_params)
    txt_fmt["data_fmts"] = ["%s"]*result.params.shape[1]

    return SimpleTable(data, headers=exog_names,
                             stubs=stubs,
                             title=title,
                             txt_fmt=txt_fmt)
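A short, self-contained sketch of building and printing such a SimpleTable directly; the values are made up for illustration:

import numpy as np
from statsmodels.iolib.table import SimpleTable

rng = np.random.default_rng(0)
data = [['%10.4f' % v for v in row] for row in rng.normal(size=(2, 3))]
table = SimpleTable(data,
                    headers=['var0', 'var1', 'var2'],  # exog names
                    stubs=['endog_1', 'endog_2'],      # endog names
                    title='Parameters')
print(table)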
Example #3
def _influence_plot(results, influence, external=True, alpha=.05,
                    criterion="cooks", size=48, plot_alpha=.75, ax=None,
                    **kwargs):
    infl = influence
    fig, ax = utils.create_mpl_ax(ax)

    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    #TODO: what is the correct scaling and the assumption here?
    #we want plots to be comparable across different plots
    #so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2

    psize = (psize - psize.min()) * new_range/old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized

    from scipy import stats

    cutoff = stats.t.ppf(1.-alpha/2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids),
                             lzip(-(psize/2)**.5, (psize/2)**.5), "x-large",
                             ax)

    #TODO: make configurable or let people do it ex-post?
    font = {"fontsize" : 16, "color" : "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
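This private helper appears to back the public `influence_plot` function in statsmodels.graphics. A usage sketch on simulated data:

import numpy as np
import statsmodels.api as sm

x = np.random.randn(50)
y = 1.0 + 2.0 * x + np.random.randn(50)
results = sm.OLS(y, sm.add_constant(x)).fit()

fig = sm.graphics.influence_plot(results, criterion="cooks")
fig.savefig("influence.png")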
Example #4
def test_mcnemar_vectorized():
    ttk = np.random.randint(5,15, size=(2,2,3))
    mcnemar(ttk)
    res = mcnemar(ttk, exact=False)
    res1 = lzip(*[mcnemar(ttk[:,:,i], exact=False) for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)

    res = mcnemar(ttk, exact=False, correction=False)
    res1 = lzip(*[mcnemar(ttk[:,:,i], exact=False, correction=False)
                                                          for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)

    res = mcnemar(ttk, exact=True)
    res1 = lzip(*[mcnemar(ttk[:,:,i], exact=True) for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)
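The `mcnemar` exercised here is an older statsmodels API that returned a (statistic, pvalue) tuple. In recent releases the maintained version lives in statsmodels.stats.contingency_tables and returns a results object; a sketch with made-up counts:

import numpy as np
from statsmodels.stats.contingency_tables import mcnemar

table = np.array([[13, 6],
                  [4, 10]])
res = mcnemar(table, exact=True)   # exact binomial version of the test
print(res.statistic, res.pvalue)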
Example #5
def _plot_leverage_resid2(results, influence, alpha=.05, ax=None,
                         **kwargs):

    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = influence
    leverage = infl.hat_matrix_diag
    resid = zscore(infl.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    #norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)]*int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
Example #6
    def _coef_table(self):
        model = self.model
        k = model.neqs

        Xnames = self.model.exog_names

        data = lzip(model.params.T.ravel(),
                   model.stderr.T.ravel(),
                   model.tvalues.T.ravel(),
                   model.pvalues.T.ravel())

        header = ('coefficient','std. error','t-stat','prob')

        buf = StringIO()
        dim = k * model.k_ar + model.k_trend
        for i in range(k):
            section = "Results for equation %s" % model.names[i]
            buf.write(section + '\n')

            table = SimpleTable(data[dim * i : dim * (i + 1)], header,
                                Xnames, title=None, txt_fmt = self.default_fmt)
            buf.write(str(table) + '\n')

            if i < k - 1:
                buf.write('\n')

        return buf.getvalue()
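`_coef_table` renders the per-equation coefficient blocks of a VAR summary. A hedged end-to-end sketch that reaches those tables through the public API, on simulated data:

import numpy as np
import pandas as pd
from statsmodels.tsa.api import VAR

data = pd.DataFrame(np.random.randn(200, 2), columns=['y1', 'y2'])
res = VAR(data).fit(2)
print(res.summary())   # includes a "Results for equation ..." block per equation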
Example #7
    def __init__(self, group, name=''):
        super(self.__class__, self).__init__(group, name=name)

        idx = (np.nonzero(np.diff(group))[0]+1).tolist()
        self.groupidx = groupidx = lzip([0]+idx, idx+[len(group)])

        ngroups = len(groupidx)
Example #8
    def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
        '''Add the contents of a Dict to summary table

        Parameters
        ----------
        d : dict
            Keys and values are automatically coerced to strings with str().
            Users are encouraged to format them before using add_dict.
        ncols: int
            Number of columns of the output table
        align : string
            Data alignment (l/c/r)
        '''

        keys = [_formatter(x, float_format) for x in iterkeys(d)]
        vals = [_formatter(x, float_format) for x in itervalues(d)]
        data = np.array(lzip(keys, vals))

        if data.shape[0] % ncols != 0:
            pad = ncols - (data.shape[0] % ncols)
            data = np.vstack([data, np.array(pad * [['', '']])])

        data = np.split(data, ncols)
        data = reduce(lambda x, y: np.hstack([x, y]), data)
        self.add_array(data, align=align)
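A usage sketch for `add_dict` through the public `Summary` class in statsmodels.iolib.summary2; the keys and values are illustrative:

from statsmodels.iolib.summary2 import Summary

smry = Summary()
smry.add_dict({'Model': 'OLS', 'No. Obs': 100,
               'R-squared': 0.42, 'AIC': 123.4}, ncols=2)
print(smry)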
Example #9
def _create_default_properties(data):
    """"Create the default properties of the mosaic given the data
    first it will varies the color hue (first category) then the color
    saturation (second category) and then the color value
    (third category).  If a fourth category is found, it will put
    decoration on the rectangle.  Doesn't manage more than four
    level of categories
    """
    categories_levels = _categories_level(list(iterkeys(data)))
    Nlevels = len(categories_levels)
    # first level, the hue
    L = len(categories_levels[0])
    # hue = np.linspace(1.0, 0.0, L+1)[:-1]
    hue = np.linspace(0.0, 1.0, L + 2)[:-2]
    # second level, the saturation
    L = len(categories_levels[1]) if Nlevels > 1 else 1
    saturation = np.linspace(0.5, 1.0, L + 1)[:-1]
    # third level, the value
    L = len(categories_levels[2]) if Nlevels > 2 else 1
    value = np.linspace(0.5, 1.0, L + 1)[:-1]
    # fourth level, the hatch
    L = len(categories_levels[3]) if Nlevels > 3 else 1
    hatch = ['', '/', '-', '|', '+'][:L + 1]
    # convert in list and merge with the levels
    hue = lzip(list(hue), categories_levels[0])
    saturation = lzip(list(saturation),
                      categories_levels[1] if Nlevels > 1 else [''])
    value = lzip(list(value),
                 categories_levels[2] if Nlevels > 2 else [''])
    hatch = lzip(list(hatch),
                 categories_levels[3] if Nlevels > 3 else [''])
    # create the properties dictionary
    properties = {}
    for h, s, v, t in product(hue, saturation, value, hatch):
        hv, hn = h
        sv, sn = s
        vv, vn = v
        tv, tn = t
        level = (hn,) + ((sn,) if sn else tuple())
        level = level + ((vn,) if vn else tuple())
        level = level + ((tn,) if tn else tuple())
        hsv = array([hv, sv, vv])
        prop = {'color': _single_hsv_to_rgb(hsv), 'hatch': tv, 'lw': 0}
        properties[level] = prop
    return properties
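These defaults kick in when no `properties` argument is passed to the public `mosaic` function. A minimal sketch with made-up counts:

from statsmodels.graphics.mosaicplot import mosaic

data = {('a', 'x'): 10, ('a', 'y'): 5,
        ('b', 'x'): 7, ('b', 'y'): 13}
fig, rects = mosaic(data, gap=0.02)
fig.savefig("mosaic.png")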
Example #10
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared.:

        - '45' - 45-degree line
        - 's'  - standardized line, the expected order statistics are scaled by
                 the standard deviation of the given sample and have the mean
                 added to them
        - 'r'  - A regression line is fit
        - 'q'  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    """
    if line == '45':
        end_pts = lzip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return # does this have any side effects?
    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    elif line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, fmt)
    elif line == 's':
        m, b = y.std(), y.mean()
        ref_line = x*m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        _check_for_ppf(dist)
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m*theoretical_quartiles[0]
        ax.plot(x, m*x + b, fmt)
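`qqline` is usually invoked indirectly through the `line` argument of `qqplot`; a brief sketch on simulated data:

import numpy as np
import statsmodels.api as sm

x = np.random.standard_t(5, size=200)
fig = sm.qqplot(x, line='q')   # draws the reference line through the quartiles
fig.savefig("qqplot.png")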
Example #11
    def pval_table(self):
        '''create a (n_levels, n_levels) array with corrected p_values

        this needs to improve, similar to R pairwise output
        '''
        k = self.n_levels
        pvals_mat = np.zeros((k, k))
        # if we don't assume we have all pairs
        pvals_mat[lzip(*self.all_pairs)] = self.pval_corrected()
        return pvals_mat
Example #12
def test_mcnemar_vectorized(reset_randomstate):
    ttk = np.random.randint(5,15, size=(2,2,3))
    with pytest.deprecated_call():
        res = sbmcnemar(ttk, exact=False)
    with pytest.deprecated_call():
        res1 = lzip(*[sbmcnemar(ttk[:, :, i], exact=False) for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)

    with pytest.deprecated_call():
        res = sbmcnemar(ttk, exact=False, correction=False)
    with pytest.deprecated_call():
        res1 = lzip(*[sbmcnemar(ttk[:, :, i], exact=False, correction=False)
                      for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)

    with pytest.deprecated_call():
        res = sbmcnemar(ttk, exact=True)
    with pytest.deprecated_call():
        res1 = lzip(*[sbmcnemar(ttk[:, :, i], exact=True) for i in range(3)])
    assert_allclose(res, res1, rtol=1e-13)
Example #13
    def summary_table(self, float_fmt="%6.3f"):
        '''create a summary table with all influence and outlier measures

        This currently does not distinguish between statistics that can be
        calculated from the original regression results and those for which a
        leave-one-observation-out loop is needed.

        Returns
        -------
        res : SimpleTable instance
           SimpleTable instance with the results, can be printed

        Notes
        -----
        This also attaches table_data to the instance.
        '''
        table_raw = [ ('obs', np.arange(self.nobs)),
                      ('endog', self.endog),
                      ('fitted\nvalue', self.results.fittedvalues),
                      ("Cook's\nd", self.cooks_distance[0]),
                      ("student.\nresidual", self.resid_studentized_internal),
                      ('hat diag', self.hat_matrix_diag),
                      ('dffits \ninternal', self.dffits_internal[0]),
                      ("ext.stud.\nresidual", self.resid_studentized_external),
                      ('dffits', self.dffits[0])
                      ]
        colnames, data = lzip(*table_raw) #unzip
        data = np.column_stack(data)
        self.table_data = data
        from statsmodels.iolib.table import SimpleTable, default_html_fmt
        from statsmodels.iolib.tableformatting import fmt_base
        from copy import deepcopy
        fmt = deepcopy(fmt_base)
        fmt_html = deepcopy(default_html_fmt)
        fmt['data_fmts'] = ["%4d"] + [float_fmt] * (data.shape[1] - 1)
        #fmt_html['data_fmts'] = fmt['data_fmts']
        return SimpleTable(data, headers=colnames, txt_fmt=fmt,
                           html_fmt=fmt_html)
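A usage sketch: the method hangs off the influence object returned by `get_influence` on OLS results (data simulated):

import numpy as np
import statsmodels.api as sm

x = np.random.randn(30)
y = x + np.random.randn(30)
results = sm.OLS(y, sm.add_constant(x)).fit()

infl = results.get_influence()
print(infl.summary_table(float_fmt="%6.3f"))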
Example #14
    def in_domain(self, xs, ys, x):
        """
        Returns the filtered (xs, ys) based on the Kernel domain centred on x
        """
        # Disable black-list functions: filter used for speed instead of
        # list-comprehension
        # pylint: disable-msg=W0141
        def isInDomain(xy):
            """Used for filter to check if point is in the domain"""
            u = (xy[0]-x)/self.h
            return self.domain[0] <= u <= self.domain[1]

        if self.domain is None:
            return (xs, ys)
        else:
            filtered = lfilter(isInDomain, lzip(xs, ys))
            if len(filtered) > 0:
                xs, ys = lzip(*filtered)
                return (xs, ys)
            else:
                return ([], [])
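A standalone sketch of the same filtering logic, assuming `domain` holds (lower, upper) bounds in units of the bandwidth `h`, as the method implies:

def in_domain(xs, ys, x, h=1.0, domain=(-1.0, 1.0)):
    # keep the (x, y) pairs whose scaled distance from x lies in the domain
    pairs = [(xi, yi) for xi, yi in zip(xs, ys)
             if domain[0] <= (xi - x) / h <= domain[1]]
    return tuple(zip(*pairs)) if pairs else ([], [])

print(in_domain([0.0, 0.5, 1.2, 3.0], [1, 2, 3, 4], x=1.0))
# ((0.0, 0.5, 1.2), (1, 2, 3))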
Example #15
def getquotes(symbol, start, end):
    quotes = fin.quotes_historical_yahoo(symbol, start, end)
    dates, open, close, high, low, volume = lzip(*quotes)

    data = {
        'open' : open,
        'close' : close,
        'high' : high,
        'low' : low,
        'volume' : volume
    }

    dates = pa.Index([dt.datetime.fromordinal(int(d)) for d in dates])
    return pa.DataFrame(data, index=dates)
Example #16
def plot_leverage_resid2(results, alpha=.05, label_kwargs={}, ax=None,
                         **kwargs):
    """
    Plots leverage statistics vs. normalized residuals squared

    Parameters
    ----------
    results : results instance
        A regression results instance
    alpha : float
        Specifies the cut-off for large-standardized residuals. Residuals
        are assumed to be distributed N(0, 1) with alpha=alpha.
    label_kwargs : dict
        The keywords to pass to annotate for the labels.
    ax : Axes instance
        Matplotlib Axes instance

    Returns
    -------
    fig : matplotlib Figure
        A matplotlib figure instance.
    """
    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()
    leverage = infl.hat_matrix_diag
    resid = zscore(results.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    #norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)]*int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
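A short sketch using this public wrapper on simulated data:

import numpy as np
import statsmodels.api as sm

x = np.random.randn(100)
y = x + np.random.randn(100)
results = sm.OLS(y, sm.add_constant(x)).fit()

fig = sm.graphics.plot_leverage_resid2(results)
fig.savefig("leverage_resid2.png")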
Example #17
    def conf_int(self, alpha=.05):
        """
        Returns the confidence intervals of the marginal effects

        Parameters
        ----------
        alpha : float
            Number between 0 and 1. The confidence intervals have the
            probability 1-alpha.

        Returns
        -------
        conf_int : ndarray
            An array with lower, upper confidence intervals for the marginal
            effects.
        """
        _check_at_is_all(self.margeff_options)
        me_se = self.margeff_se
        q = norm.ppf(1 - alpha / 2)
        lower = self.margeff - q * me_se
        upper = self.margeff + q * me_se
        return np.asarray(lzip(lower, upper))
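A hedged sketch of how this `conf_int` is reached in practice, via `get_margeff` on a discrete-choice results object (data simulated):

import numpy as np
import statsmodels.api as sm

x = np.random.randn(500)
y = (x + np.random.randn(500) > 0).astype(float)
res = sm.Logit(y, sm.add_constant(x)).fit(disp=0)

margeff = res.get_margeff()
print(margeff.conf_int(alpha=0.05))   # one (lower, upper) row per effect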
Example #18
def proportions_chisquare_allpairs(count, nobs, multitest_method='hs'):
    '''chisquare test of proportions for all pairs of k samples

    Performs a chisquare test for proportions for all pairwise comparisons.
    The alternative is two-sided

    Parameters
    ----------
    count : integer or array_like
        the number of successes in nobs trials.
    nobs : integer
        the number of trials or observations.
    multitest_method : string
        This chooses the method for the multiple testing p-value correction,
        that is used as default in the results.
        It can be any method that is available in ``multipletests``.
        The default is Holm-Sidak 'hs'.

    Returns
    -------
    result : AllPairsResults instance
        The returned results instance has several statistics, such as p-values,
        attached, and additional methods for using a non-default
        ``multitest_method``.

    Notes
    -----
    Yates continuity correction is not available.
    '''
    #all_pairs = lmap(list, lzip(*np.triu_indices(4, 1)))
    all_pairs = lzip(*np.triu_indices(len(count), 1))
    pvals = [proportions_chisquare(count[list(pair)], nobs[list(pair)])[1]
               for pair in all_pairs]
    return AllPairsResults(pvals, all_pairs, multitest_method=multitest_method)
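Usage sketch; both `pval_corrected()` and the `pval_table()` method from Example #11 live on the returned AllPairsResults (counts are made up):

import numpy as np
from statsmodels.stats.proportion import proportions_chisquare_allpairs

count = np.array([30, 45, 52])    # successes per sample
nobs = np.array([100, 100, 100])  # trials per sample
res = proportions_chisquare_allpairs(count, nobs, multitest_method='hs')
print(res.pval_corrected())
print(res.pval_table())           # (k, k) matrix of corrected p-values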
Example #19
def pacf(x, nlags=40, method='ywunbiased', alpha=None):
    '''Partial autocorrelation estimated

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        largest lag for which pacf is returned
    method : 'ywunbiased' (default) or 'ywmle' or 'ols'
        specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf
        - ywm or ywmle : yule walker without bias correction
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction

    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        1/sqrt(len(x))

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, nlags elements, including lag zero
    confint : array, optional
        Confidence intervals for the PACF. Returned if alpha is not None.

    Notes
    -----
    This solves yule_walker equations or ols for each desired lag
    and contains currently duplicate calculations.
    '''

    if method == 'ols':
        ret = pacf_ols(x, nlags=nlags)
    elif method in ['yw', 'ywu', 'ywunbiased', 'yw_unbiased']:
        ret = pacf_yw(x, nlags=nlags, method='unbiased')
    elif method in ['ywm', 'ywmle', 'yw_mle']:
        ret = pacf_yw(x, nlags=nlags, method='mle')
    elif method in ['ld', 'ldu', 'ldunbiased', 'ld_unbiased']:
        acv = acovf(x, unbiased=True)
        ld_ = levinson_durbin(acv, nlags=nlags, isacov=True)
        ret = ld_[2]
    # inconsistent naming with ywmle
    elif method in ['ldb', 'ldbiased', 'ld_biased']:
        acv = acovf(x, unbiased=False)
        ld_ = levinson_durbin(acv, nlags=nlags, isacov=True)
        ret = ld_[2]
    else:
        raise ValueError('method not available')
    if alpha is not None:
        varacf = 1. / len(x) # for all lags >=1
        interval = stats.norm.ppf(1. - alpha / 2.) * np.sqrt(varacf)
        confint = np.array(lzip(ret - interval, ret + interval))
        confint[0] = ret[0]  # fix confidence interval for lag 0 to varpacf=0
        return ret, confint
    else:
        return ret
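A brief sketch of the `alpha` branch. The method aliases shown above were renamed in later statsmodels releases (e.g. the 'unbiased' variants became 'adjusted'), so the default method is left untouched here:

import numpy as np
from statsmodels.tsa.stattools import pacf

x = np.random.randn(200)
vals, confint = pacf(x, nlags=20, alpha=0.05)
print(vals[:3])
print(confint[:3])   # per-lag bounds; per the code above, lag 0 is pinned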
Example #20
def acf(x, unbiased=False, nlags=40, qstat=False, fft=False, alpha=None,
        missing='none'):
    """
    Autocorrelation function for 1d arrays.

    Parameters
    ----------
    x : array
       Time series data
    unbiased : bool
       If True, then denominators for autocovariance are n-k, otherwise n
    nlags: int, optional
        Number of lags to return autocorrelation for.
    qstat : bool, optional
        If True, returns the Ljung-Box q statistic for each autocorrelation
        coefficient.  See q_stat for more information.
    fft : bool, optional
        If True, computes the ACF via FFT.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett\'s formula.
    missing : str, optional
        A string in ['none', 'raise', 'conservative', 'drop'] specifying how the NaNs
        are to be treated.

    Returns
    -------
    acf : array
        autocorrelation function
    confint : array, optional
        Confidence intervals for the ACF. Returned if alpha is not None.
    qstat : array, optional
        The Ljung-Box Q-Statistic.  Returned if qstat is True.
    pvalues : array, optional
        The p-values associated with the Q-statistics.  Returned if qstat is
        True.

    Notes
    -----
    The acf at lag 0 (i.e., 1) is returned.

    This is based on np.correlate, which does full convolution. For very long
    time series it is recommended to use fft convolution instead.

    If unbiased is true, the denominator for the autocovariance is adjusted
    but the autocorrelation is not an unbiased estimator.

    References
    ----------
    .. [1] Parzen, E., 1963. On spectral analysis with missing observations
       and amplitude modulation. Sankhya: The Indian Journal of
       Statistics, Series A, pp.383-392.

    """
    nobs = len(x)  # should this shrink for missing='drop' and NaNs in x?
    avf = acovf(x, unbiased=unbiased, demean=True, fft=fft, missing=missing)
    acf = avf[:nlags + 1] / avf[0]
    if not (qstat or alpha):
        return acf
    if alpha is not None:
        varacf = np.ones(nlags + 1) / nobs
        varacf[0] = 0
        varacf[1] = 1. / nobs
        varacf[2:] *= 1 + 2 * np.cumsum(acf[1:-1]**2)
        interval = stats.norm.ppf(1 - alpha / 2.) * np.sqrt(varacf)
        confint = np.array(lzip(acf - interval, acf + interval))
        if not qstat:
            return acf, confint
    if qstat:
        qstat, pvalue = q_stat(acf[1:], nobs=nobs)  # drop lag 0
        if alpha is not None:
            return acf, confint, qstat, pvalue
        else:
            return acf, qstat, pvalue
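A sketch exercising both optional returns at once, on simulated white noise:

import numpy as np
from statsmodels.tsa.stattools import acf

x = np.random.randn(300)
vals, confint, qstat, pvalues = acf(x, nlags=20, alpha=0.05, qstat=True)
print(vals[:3])
print(pvalues[:3])   # Ljung-Box p-values, one per lag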
Example #21

_quarter_to_day = {
        "1" : (3, 31),
        "2" : (6, 30),
        "3" : (9, 30),
        "4" : (12, 31),
        "I" : (3, 31),
        "II" : (6, 30),
        "III" : (9, 30),
        "IV" : (12, 31)
        }


_mdays = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
_months_with_days = lzip(lrange(1,13), _mdays)
_month_to_day = dict(zip(map(str,lrange(1,13)), _months_with_days))
_month_to_day.update(dict(zip(["I", "II", "III", "IV", "V", "VI",
                               "VII", "VIII", "IX", "X", "XI", "XII"],
                               _months_with_days)))
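A tiny demonstration of the lookup tables above, resolving a quarter label to its end-of-quarter date:

import datetime as dt

year, quarter = 1999, "III"
month, day = _quarter_to_day[quarter]
print(dt.datetime(year, month, day))   # 1999-09-30 00:00:00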

# regex patterns
_y_pattern = r'^\d?\d?\d?\d$'

_q_pattern = r'''
^               # beginning of string
\d?\d?\d?\d     # match any number 1-9999, includes leading zeros

(:?q)           # use q or a : as a separator

([1-4]|(I{1,3}V?)) # match 1-4 or I-IV roman numerals

$               # end of string
'''
Example #22
def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds):
    """create new results instance with robust covariance as default

    Parameters
    ----------
    cov_type : string
        the type of robust sandwich estimator to use. see Notes below
    use_t : bool
        If true, then the t distribution is used for inference.
        If false, then the normal distribution is used.
    kwds : depends on cov_type
        Required or optional arguments for robust covariance calculation.
        see Notes below

    Returns
    -------
    results : results instance
        This method creates a new results instance with the requested
        robust covariance as the default covariance of the parameters.
        Inferential statistics like p-values and hypothesis tests will be
        based on this covariance matrix.

    Notes
    -----
    Warning: Some of the options and defaults in cov_kwds may be changed in a
    future version.

    The covariance keywords provide an option 'scaling_factor' to adjust the
    scaling of the covariance matrix, that is the covariance is multiplied by
    this factor if it is given and is not `None`. This allows the user to
    adjust the scaling of the covariance matrix to match other statistical
    packages.
    For example, `scaling_factor=(nobs - 1.) / (nobs - k_params)` provides a
    correction so that the robust covariance matrices match those of Stata in
    some models like GLM and discrete Models.

    The following covariance types and required or optional arguments are
    currently available:

    - 'HC0', 'HC1', 'HC2', 'HC3' and no keyword arguments:
        heteroscedasticity robust covariance
    - 'HAC' and keywords

        - `maxlag` integer (required) : number of lags to use
        - `kernel` string (optional) : kernel, default is Bartlett
        - `use_correction` bool (optional) : If true, use small sample
              correction

    - 'cluster' and required keyword `groups`, integer group indicator

        - `groups` array_like, integer (required) :
              index of clusters or groups
        - `use_correction` bool (optional) :
              If True the sandwich covariance is calculated with a small
              sample correction.
              If False the sandwich covariance is calculated without a
              small sample correction.
        - `df_correction` bool (optional)
              If True (default), then the degrees of freedom for the
              inferential statistics and hypothesis tests, such as
              pvalues, f_pvalue, conf_int, and t_test and f_test, are
              based on the number of groups minus one instead of the
              total number of observations minus the number of explanatory
              variables. `df_resid` of the results instance is adjusted.
              If False, then `df_resid` of the results instance is not
              adjusted.

    - 'hac-groupsum' Driscoll and Kraay, heteroscedasticity and
        autocorrelation robust standard errors in panel data
        keywords

        - `time` array_like (required) : index of time periods
        - `maxlag` integer (required) : number of lags to use
        - `kernel` string (optional) : kernel, default is Bartlett
        - `use_correction` False or string in ['hac', 'cluster'] (optional) :
              If False the sandwich covariance is calculated without a
              small sample correction.
              If `use_correction = 'cluster'` (default), then the same
              small sample correction as in the case of `cov_type='cluster'`
              is used.
        - `df_correction` bool (optional)
              adjustment to df_resid, see cov_type 'cluster' above
              #TODO: we need more options here

    - 'hac-panel' heteroscedasticity and autocorrelation robust standard
        errors in panel data.
        The data needs to be sorted in this case, the time series for
        each panel unit or cluster need to be stacked. The membership to
        a timeseries of an individual or group can be either specified by
        group indicators or by increasing time periods.

        keywords

        - either `groups` or `time` : array_like (required)
          `groups` : indicator for groups
          `time` : index of time periods
        - `maxlag` integer (required) : number of lags to use
        - `kernel` string (optional) : kernel, default is Bartlett
        - `use_correction` False or string in ['hac', 'cluster'] (optional) :
              If False the sandwich covariance is calculated without a
              small sample correction.
        - `df_correction` bool (optional)
              adjustment to df_resid, see cov_type 'cluster' above
              #TODO: we need more options here

    Reminder:
    `use_correction` in "hac-groupsum" and "hac-panel" is not bool,
    needs to be in [False, 'hac', 'cluster']

    TODO: Currently there is no check for extra or misspelled keywords,
    except in the case of cov_type `HCx`

    """

    import statsmodels.stats.sandwich_covariance as sw

    #normalize names
    if cov_type == 'nw-panel':
        cov_type = 'hac-panel'
    if cov_type == 'nw-groupsum':
        cov_type = 'hac-groupsum'
    if 'kernel' in kwds:
        kwds['weights_func'] = kwds.pop('kernel')

    # pop because HCx raises if any kwds
    sc_factor = kwds.pop('scaling_factor', None)

    # TODO: make separate function that returns a robust cov plus info
    use_self = kwds.pop('use_self', False)
    if use_self:
        res = self
    else:
        # this doesn't work for most models, use raw instance instead from fit
        res = self.__class__(self.model, self.params,
                   normalized_cov_params=self.normalized_cov_params,
                   scale=self.scale)

    res.cov_type = cov_type
    # use_t might already be defined by the class, and already set
    if use_t is None:
        use_t = self.use_t
    res.cov_kwds = {'use_t':use_t}  # store for information
    res.use_t = use_t

    adjust_df = False
    if cov_type in ['cluster', 'hac-panel', 'hac-groupsum']:
        df_correction = kwds.get('df_correction', None)
        # TODO: check also use_correction, do I need all combinations?
        if df_correction is not False: # i.e. in [None, True]:
            # user didn't explicitly set it to False
            adjust_df = True

    res.cov_kwds['adjust_df'] = adjust_df

    # verify and set kwds, and calculate cov
    # TODO: this should be outsourced in a function so we can reuse it in
    #       other models
    # TODO: make it DRYer   repeated code for checking kwds
    if cov_type.upper() in ('HC0', 'HC1', 'HC2', 'HC3'):
        if kwds:
            raise ValueError('heteroscedasticity robust covariance ' +
                             'does not use keywords')
        res.cov_kwds['description'] = ('Standard Errors are heteroscedasticity ' +
                                       'robust ' + '(' + cov_type + ')')

        res.cov_params_default = getattr(self, 'cov_' + cov_type.upper(), None)
        if res.cov_params_default is None:
            # results classes that don't have cov_HCx attribute
            res.cov_params_default = sw.cov_white_simple(self,
                                                         use_correction=False)
    elif cov_type.lower() == 'hac':
        maxlags = kwds['maxlags']   # required?, default in cov_hac_simple
        res.cov_kwds['maxlags'] = maxlags
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        use_correction = kwds.get('use_correction', False)
        res.cov_kwds['use_correction'] = use_correction
        res.cov_kwds['description'] = ('Standard Errors are heteroscedasticity ' +
             'and autocorrelation robust (HAC) using %d lags and %s small ' +
             'sample correction') % (maxlags, ['without', 'with'][use_correction])

        res.cov_params_default = sw.cov_hac_simple(self, nlags=maxlags,
                                             weights_func=weights_func,
                                             use_correction=use_correction)
    elif cov_type.lower() == 'cluster':
        #cluster robust standard errors, one- or two-way
        groups = kwds['groups']
        if not hasattr(groups, 'shape'):
            groups = np.asarray(groups).T

        if groups.ndim >= 2:
            groups = groups.squeeze()

        res.cov_kwds['groups'] = groups
        use_correction = kwds.get('use_correction', True)
        res.cov_kwds['use_correction'] = use_correction
        if groups.ndim == 1:
            if adjust_df:
                # need to find number of groups
                # duplicate work
                self.n_groups = n_groups = len(np.unique(groups))
            res.cov_params_default = sw.cov_cluster(self, groups,
                                             use_correction=use_correction)

        elif groups.ndim == 2:
            if hasattr(groups, 'values'):
                groups = groups.values

            if adjust_df:
                # need to find number of groups
                # duplicate work
                n_groups0 = len(np.unique(groups[:,0]))
                n_groups1 = len(np.unique(groups[:, 1]))
                self.n_groups = (n_groups0, n_groups1)
                n_groups = min(n_groups0, n_groups1) # use for adjust_df

            # Note: sw.cov_cluster_2groups has 3 returns
            res.cov_params_default = sw.cov_cluster_2groups(self, groups,
                                         use_correction=use_correction)[0]
        else:
            raise ValueError('only two groups are supported')
        res.cov_kwds['description'] = ('Standard Errors are robust to ' +
                            'cluster correlation ' + '(' + cov_type + ')')

    elif cov_type.lower() == 'hac-panel':
        #cluster robust standard errors
        res.cov_kwds['time'] = time = kwds.get('time', None)
        res.cov_kwds['groups'] = groups = kwds.get('groups', None)
        #TODO: nlags is currently required
        #nlags = kwds.get('nlags', True)
        #res.cov_kwds['nlags'] = nlags
        #TODO: `nlags` or `maxlags`
        res.cov_kwds['maxlags'] = maxlags = kwds['maxlags']
        use_correction = kwds.get('use_correction', 'hac')
        res.cov_kwds['use_correction'] = use_correction
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        # TODO: clumsy time index in cov_nw_panel
        if groups is not None:
            groups = np.asarray(groups)
            tt = (np.nonzero(groups[:-1] != groups[1:])[0] + 1).tolist()
            nobs_ = len(groups)
        elif time is not None:
            # TODO: clumsy time index in cov_nw_panel
            time = np.asarray(time)
            tt = (np.nonzero(time[1:] < time[:-1])[0] + 1).tolist()
            nobs_ = len(time)
        else:
            raise ValueError('either time or groups needs to be given')
        groupidx = lzip([0] + tt, tt + [nobs_])
        self.n_groups = n_groups = len(groupidx)
        res.cov_params_default = sw.cov_nw_panel(self, maxlags, groupidx,
                                            weights_func=weights_func,
                                            use_correction=use_correction)
        res.cov_kwds['description'] = ('Standard Errors are robust to ' +
                            'cluster correlation ' + '(' + cov_type + ')')
    elif cov_type.lower() == 'hac-groupsum':
        # Driscoll-Kraay standard errors
        res.cov_kwds['time'] = time = kwds['time']
        #TODO: nlags is currently required
        #nlags = kwds.get('nlags', True)
        #res.cov_kwds['nlags'] = nlags
        #TODO: `nlags` or `maxlags`
        res.cov_kwds['maxlags'] = maxlags = kwds['maxlags']
        use_correction = kwds.get('use_correction', 'cluster')
        res.cov_kwds['use_correction'] = use_correction
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        if adjust_df:
            # need to find number of groups
            tt = (np.nonzero(time[1:] < time[:-1])[0] + 1)
            self.n_groups = n_groups = len(tt) + 1
        res.cov_params_default = sw.cov_nw_groupsum(self, maxlags, time,
                                        weights_func=weights_func,
                                        use_correction=use_correction)
        res.cov_kwds['description'] = (
                    'Driscoll and Kraay Standard Errors are robust to ' +
                    'cluster correlation ' + '(' + cov_type + ')')
    else:
        raise ValueError('cov_type not recognized. See docstring for ' +
                         'available options and spelling')

    # generic optional factor to scale covariance

    res.cov_kwds['scaling_factor'] = sc_factor
    if sc_factor is not None:
        res.cov_params_default *= sc_factor

    if adjust_df:
        # Note: df_resid is used for scale and others, add new attribute
        res.df_resid_inference = n_groups - 1

    return res
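A usage sketch for the two common entry points; requesting the covariance at fit time should give the same standard errors as calling this method afterwards (data simulated):

import numpy as np
import statsmodels.api as sm

x = np.random.randn(200)
y = x + np.random.randn(200)
res = sm.OLS(y, sm.add_constant(x)).fit()

# HAC (Newey-West) standard errors with 4 lags
res_hac = res.get_robustcov_results(cov_type='HAC', maxlags=4)
print(res_hac.bse)

# equivalently, request the robust covariance directly when fitting
res_hac2 = sm.OLS(y, sm.add_constant(x)).fit(
    cov_type='HAC', cov_kwds={'maxlags': 4})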
Example #23
    def __init__(self, group, name=''):
        super(self.__class__, self).__init__(group, name=name)

        idx = (np.nonzero(np.diff(group))[0] + 1).tolist()
        self.groupidx = lzip([0] + idx, idx + [len(group)])
Example #24
def qqline(ax, line, x=None, y=None, dist=None, fmt="r-", **lineoptions):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {"45","r","s","q"}
        Options for the reference line to which the data is compared.:

        - "45" - 45-degree line
        - "s"  - standardized line, the expected order statistics are scaled by
                 the standard deviation of the given sample and have the mean
                 added to them
        - "r"  - A regression line is fit
        - "q"  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : ndarray
        X data for plot. Not needed if line is "45".
    y : ndarray
        Y data for plot. Not needed if line is "45".
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is "q".
    fmt : str, optional
        Line format string passed to `plot`.
    **lineoptions
        Additional arguments to be passed to the `plot` command.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.

    Examples
    --------
    Import the food expenditure dataset.  Plot annual food expenditure on x-axis
    and household income on y-axis.  Use qqline to add regression line into the
    plot.

    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.gofplots import qqline

    >>> foodexp = sm.datasets.engel.load()
    >>> x = foodexp.exog
    >>> y = foodexp.endog
    >>> ax = plt.subplot(111)
    >>> plt.scatter(x, y)
    >>> ax.set_xlabel(foodexp.exog_name[0])
    >>> ax.set_ylabel(foodexp.endog_name)
    >>> qqline(ax, "r", x, y)
    >>> plt.show()

    .. plot:: plots/graphics_gofplots_qqplot_qqline.py
    """
    lineoptions = lineoptions.copy()
    for ls in ("-", "--", "-.", ":"):
        if ls in fmt:
            lineoptions.setdefault("linestyle", ls)
            fmt = fmt.replace(ls, "")
            break
    for marker in (
            ".",
            ",",
            "o",
            "v",
            "^",
            "<",
            ">",
            "1",
            "2",
            "3",
            "4",
            "8",
            "s",
            "p",
            "P",
            "*",
            "h",
            "H",
            "+",
            "x",
            "X",
            "D",
            "d",
            "|",
            "_",
    ):
        if marker in fmt:
            lineoptions.setdefault("marker", marker)
            fmt = fmt.replace(marker, "")
            break
    if fmt:
        lineoptions.setdefault("color", fmt)

    if line == "45":
        end_pts = lzip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, **lineoptions)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return  # does this have any side effects?
    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    x = np.array(x)
    y = np.array(y)
    if line == "r":
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are "clean"
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, **lineoptions)
    elif line == "s":
        m, b = np.std(y), np.mean(y)
        ref_line = x * m + b
        ax.plot(x, ref_line, **lineoptions)
    elif line == "q":
        _check_for(dist, "ppf")
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, **lineoptions)
Example #25
def pacf(x, nlags=40, method='ywunbiased', alpha=None):
    """
    Partial autocorrelation estimated

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        largest lag for which pacf is returned
    method : {'ywunbiased', 'ywmle', 'ols'}
        specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf. Default.
        - ywm or ywmle : yule walker without bias correction
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction
    alpha : float, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        1/sqrt(len(x))

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, nlags elements, including lag zero
    confint : array, optional
        Confidence intervals for the PACF. Returned if alpha is not None.

    Notes
    -----
    This solves yule_walker equations or ols for each desired lag
    and contains currently duplicate calculations.
    """

    if method == 'ols':
        ret = pacf_ols(x, nlags=nlags)
    elif method in ['yw', 'ywu', 'ywunbiased', 'yw_unbiased']:
        ret = pacf_yw(x, nlags=nlags, method='unbiased')
    elif method in ['ywm', 'ywmle', 'yw_mle']:
        ret = pacf_yw(x, nlags=nlags, method='mle')
    elif method in ['ld', 'ldu', 'ldunbiased', 'ld_unbiased']:
        acv = acovf(x, unbiased=True)
        ld_ = levinson_durbin(acv, nlags=nlags, isacov=True)
        ret = ld_[2]
    # inconsistent naming with ywmle
    elif method in ['ldb', 'ldbiased', 'ld_biased']:
        acv = acovf(x, unbiased=False)
        ld_ = levinson_durbin(acv, nlags=nlags, isacov=True)
        ret = ld_[2]
    else:
        raise ValueError('method not available')
    if alpha is not None:
        varacf = 1. / len(x)  # for all lags >=1
        interval = stats.norm.ppf(1. - alpha / 2.) * np.sqrt(varacf)
        confint = np.array(lzip(ret - interval, ret + interval))
        confint[0] = ret[0]  # fix confidence interval for lag 0 to varpacf=0
        return ret, confint
    else:
        return ret
Example #26
def genfromdta(fname,
               missing_flt=-999.,
               encoding=None,
               pandas=False,
               convert_dates=True):
    """
    Returns an ndarray or DataFrame from a Stata .dta file.

    Parameters
    ----------
    fname : str or filehandle
        Stata .dta file.
    missing_flt : numeric
        The numeric value to replace missing values with. Will be used for
        any numeric value.
    encoding : string, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`
    pandas : bool
        Optionally return a DataFrame instead of an ndarray
    convert_dates : bool
        If convert_dates is True, then Stata formatted dates will be converted
        to datetime types according to the variable's format.
    """
    if isinstance(fname, string_types):
        fhd = StataReader(open(fname, 'rb'),
                          missing_values=False,
                          encoding=encoding)
    elif not hasattr(fname, 'read'):
        raise TypeError("The input should be a string or a filehandle. "\
                        "(got %s instead)" % type(fname))
    else:
        fhd = StataReader(fname, missing_values=False, encoding=encoding)


#    validate_names = np.lib._iotools.NameValidator(excludelist=excludelist,
#                                    deletechars=deletechars,
#                                    case_sensitive=case_sensitive)

#TODO: This needs to handle the byteorder?
    header = fhd.file_headers()
    types = header['dtyplist']
    nobs = header['nobs']
    numvars = header['nvar']
    varnames = header['varlist']
    fmtlist = header['fmtlist']
    dataname = header['data_label']
    labels = header['vlblist']  # labels are thrown away unless DataArray
    # type is used
    data = np.zeros((nobs, numvars))
    stata_dta = fhd.dataset()

    dt = np.dtype(lzip(varnames, types))
    data = np.zeros((nobs), dtype=dt)  # init final array

    for rownum, line in enumerate(stata_dta):
        # doesn't handle missing value objects, just casts
        # None will only work without missing value object.
        if None in line:
            for i, val in enumerate(line):
                #NOTE: This will only be scalar types because missing strings
                # are empty not None in Stata
                if val is None:
                    line[i] = missing_flt
        data[rownum] = tuple(line)

    if pandas:
        from pandas import DataFrame
        data = DataFrame.from_records(data)
        if convert_dates:
            cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
            for col in cols:
                i = col
                col = data.columns[col]
                data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
                                            args=(fmtlist[i], ))
    elif convert_dates:
        # make the dtype for the datetime types
        cols = np.where(lmap(lambda x: x in _date_formats, fmtlist))[0]
        dtype = data.dtype.descr
        dtype = [(dt[0], object) if i in cols else dt
                 for i, dt in enumerate(dtype)]
        data = data.astype(dtype)  # have to copy
        for col in cols:

            def convert(x):
                return _stata_elapsed_date_to_datetime(x, fmtlist[col])

            data[data.dtype.names[col]] = lmap(convert,
                                               data[data.dtype.names[col]])
    return data
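`genfromdta` is the legacy statsmodels reader. For current code, pandas ships a maintained Stata reader; a hedged sketch (the file path is illustrative):

import pandas as pd

df = pd.read_stata("example.dta")   # converts Stata dates by default
print(df.head())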
Example #27
class StataReader(object):
    """
    Stata .dta file reader.

    Provides methods to return the metadata of a Stata .dta file and
    a generator for the data itself.

    Parameters
    ----------
    file : file-like
        A file-like object representing a Stata .dta file.
    missing_values : bool
        If missing_values is True, parse missing_values and return a
        Missing Values object instead of None.
    encoding : string, optional
        Used for Python 3 only. Encoding to use when reading the .dta file.
        Defaults to `locale.getpreferredencoding`

    See also
    --------
    statsmodels.lib.io.genfromdta

    Notes
    -----
    This is known only to work on file formats 113 (Stata 8/9), 114
    (Stata 10/11), and 115 (Stata 12).  Needs to be tested on older versions.
    Known not to work on format 104, 108. If you have the documentation for
    older formats, please contact the developers.

    For more information about the .dta format see
    http://www.stata.com/help.cgi?dta
    http://www.stata.com/help.cgi?dta_113
    """

    _header = {}
    _data_location = 0
    _col_sizes = ()
    _has_string_data = False
    _missing_values = False
    #type          code
    #--------------------
    #str1        1 = 0x01
    #str2        2 = 0x02
    #...
    #str244    244 = 0xf4
    #byte      251 = 0xfb  (sic)
    #int       252 = 0xfc
    #long      253 = 0xfd
    #float     254 = 0xfe
    #double    255 = 0xff
    #--------------------
    #NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    DTYPE_MAP = dict(lzip(lrange(1,245), ['a' + str(i) for i in range(1,245)]) + \
                    [(251, np.int16),(252, np.int32),(253, int),
                        (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251) + list('bhlfd')
    #NOTE: technically, some of these are wrong. there are more numbers
    # that can be represented. it's the 27 ABOVE and BELOW the max listed
    # numeric data type in [U] 12.2.2 of the 11.2 manual
    MISSING_VALUES = {
        'b': (-127, 100),
        'h': (-32767, 32740),
        'l': (-2147483647, 2147483620),
        'f': (-1.701e+38, +1.701e+38),
        'd': (-1.798e+308, +8.988e+307)
    }

    def __init__(self, fname, missing_values=False, encoding=None):
        if encoding is None:
            import locale
            self._encoding = locale.getpreferredencoding()
        else:
            self._encoding = encoding
        self._missing_values = missing_values
        self._parse_header(fname)

    def file_headers(self):
        """
        Returns all .dta file headers.

        out: dict
            Has keys typlist, data_label, lbllist, varlist, nvar, filetype,
            ds_format, nobs, fmtlist, vlblist, time_stamp, srtlist, byteorder
        """
        return self._header

    def file_format(self):
        """
        Returns the file format.

        Returns
        -------
        out : int

        Notes
        -----
        Format 113: Stata 8/9
        Format 114: Stata 10/11
        Format 115: Stata 12
        """
        return self._header['ds_format']

    def file_label(self):
        """
        Returns the dataset's label.

        Returns
        -------
        out: string
        """
        return self._header['data_label']

    def file_timestamp(self):
        """
        Returns the date and time Stata recorded on last file save.

        Returns
        -------
        out : str
        """
        return self._header['time_stamp']

    def variables(self):
        """
        Returns a list of the dataset's StataVariables objects.
        """
        return lmap(
            _StataVariable,
            zip(lrange(self._header['nvar']), self._header['typlist'],
                self._header['varlist'], self._header['srtlist'],
                self._header['fmtlist'], self._header['lbllist'],
                self._header['vlblist']))

    def dataset(self, as_dict=False):
        """
        Returns a Python generator object for iterating over the dataset.


        Parameters
        ----------
        as_dict : bool, optional
            If as_dict is True, yield each row of observations as a dict.
            If False, yields each row of observations as a list.

        Returns
        -------
        Generator object for iterating over the dataset.  Yields each row of
        observations as a list by default.

        Notes
        -----
        If missing_values is True during instantiation of StataReader then
        observations with _StataMissingValue(s) are not filtered and should
        be handled by your application.
        """

        try:
            self._file.seek(self._data_location)
        except Exception:
            pass

        if as_dict:
            vars = lmap(str, self.variables())
            for i in range(len(self)):
                yield dict(zip(vars, self._next()))
        else:
            for i in range(self._header['nobs']):
                yield self._next()

    ### Python special methods

    def __len__(self):
        """
        Return the number of observations in the dataset.

        This value is taken directly from the header and includes observations
        with missing values.
        """
        return self._header['nobs']

    def __getitem__(self, k):
        """
        Seek to an observation indexed k in the file and return it, ordered
        by Stata's output to the .dta file.

        k is zero-indexed.  Prefer iterating with dataset() for performance.
        """
        if not (isinstance(k, (int, long))) or k < 0 or k > len(self) - 1:
            raise IndexError(k)
        loc = self._data_location + sum(self._col_size()) * k
        if self._file.tell() != loc:
            self._file.seek(loc)
        return self._next()

    ### Private methods

    def _null_terminate(self, s, encoding):
        if PY3:  # have bytes not strings, so must decode
            null_byte = asbytes('\x00')
            try:
                s = s.lstrip(null_byte)[:s.index(null_byte)]
            except ValueError:
                # no null byte found; leave the string unchanged
                pass
            return s.decode(encoding)
        else:
            null_byte = asbytes('\x00')
            try:
                return s.lstrip(null_byte)[:s.index(null_byte)]
            except ValueError:
                # no null byte found; return the string as-is
                return s

    def _parse_header(self, file_object):
        self._file = file_object
        encoding = self._encoding

        # parse headers
        self._header['ds_format'] = unpack('b', self._file.read(1))[0]

        if self._header['ds_format'] not in [113, 114, 115]:
            raise ValueError("Only file formats >= 113 (Stata >= 9)"
                             " are supported.  Got format %s.  Please report "
                             "if you think this error is incorrect." %
                             self._header['ds_format'])
        byteorder = self._header['byteorder'] = unpack(
            'b', self._file.read(1))[0] == 0x1 and '>' or '<'
        self._header['filetype'] = unpack('b', self._file.read(1))[0]
        self._file.read(1)
        nvar = self._header['nvar'] = unpack(byteorder + 'h',
                                             self._file.read(2))[0]
        self._header['nobs'] = unpack(byteorder + 'i', self._file.read(4))[0]
        self._header['data_label'] = self._null_terminate(
            self._file.read(81), encoding)
        self._header['time_stamp'] = self._null_terminate(
            self._file.read(18), encoding)

        # parse descriptors
        typlist = [ord(self._file.read(1)) for i in range(nvar)]
        self._header['typlist'] = [self.TYPE_MAP[typ] for typ in typlist]
        self._header['dtyplist'] = [self.DTYPE_MAP[typ] for typ in typlist]
        self._header['varlist'] = [
            self._null_terminate(self._file.read(33), encoding)
            for i in range(nvar)
        ]
        self._header['srtlist'] = unpack(byteorder + ('h' * (nvar + 1)),
                                         self._file.read(2 * (nvar + 1)))[:-1]
        if self._header['ds_format'] <= 113:
            self._header['fmtlist'] = \
                    [self._null_terminate(self._file.read(12), encoding) \
                    for i in range(nvar)]
        else:
            self._header['fmtlist'] = \
                    [self._null_terminate(self._file.read(49), encoding) \
                    for i in range(nvar)]
        self._header['lbllist'] = [
            self._null_terminate(self._file.read(33), encoding)
            for i in range(nvar)
        ]
        self._header['vlblist'] = [
            self._null_terminate(self._file.read(81), encoding)
            for i in range(nvar)
        ]

        # ignore expansion fields
        # When reading, read five bytes; the last four bytes now tell you the
        # size of the next read, which you discard.  You then continue like
        # this until you read 5 bytes of zeros.

        while True:
            data_type = unpack(byteorder + 'b', self._file.read(1))[0]
            data_len = unpack(byteorder + 'i', self._file.read(4))[0]
            if data_type == 0:
                break
            self._file.read(data_len)

        # other state vars
        self._data_location = self._file.tell()
        self._has_string_data = len(
            lfilter(lambda x: isinstance(x, int), self._header['typlist'])) > 0
        self._col_size()

    def _calcsize(self, fmt):
        return isinstance(fmt, int) and fmt or \
                calcsize(self._header['byteorder']+fmt)

    def _col_size(self, k=None):
        """Calculate size of a data record."""
        if len(self._col_sizes) == 0:
            self._col_sizes = lmap(lambda x: self._calcsize(x),
                                   self._header['typlist'])
        if k is None:
            return self._col_sizes
        else:
            return self._col_sizes[k]

    def _unpack(self, fmt, byt):
        d = unpack(self._header['byteorder'] + fmt, byt)[0]
        if fmt[-1] in self.MISSING_VALUES:
            nmin, nmax = self.MISSING_VALUES[fmt[-1]]
            if d < nmin or d > nmax:
                if self._missing_values:
                    return _StataMissingValue(nmax, d)
                else:
                    return None
        return d

    def _next(self):
        typlist = self._header['typlist']
        if self._has_string_data:
            data = [None] * self._header['nvar']
            for i in range(len(data)):
                if isinstance(typlist[i], int):
                    data[i] = self._null_terminate(self._file.read(typlist[i]),
                                                   self._encoding)
                else:
                    data[i] = self._unpack(typlist[i],
                                           self._file.read(self._col_size(i)))
            return data
        else:
            return lmap(
                lambda i: self._unpack(typlist[i],
                                       self._file.read(self._col_size(i))),
                lrange(self._header['nvar']))
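
A minimal usage sketch for the reader above (hedged): the file name 'example.dta' is illustrative, and the sketch assumes StataReader is importable, e.g. from statsmodels.iolib.foreign.

# Hedged usage sketch; 'example.dta' is a hypothetical format-113/114 file.
with open('example.dta', 'rb') as f:
    reader = StataReader(f)
    print(reader.file_format())                  # e.g. 114 for Stata 10/11
    print(reader.file_headers()['varlist'])      # variable names
    rows = list(reader.dataset())                # each row is a list of values
    print(len(rows) == len(reader))              # nobs taken from the header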
Exemplo n.º 28
0
class StataWriter(object):
    """
    A class for writing Stata binary dta files from array-like objects

    Parameters
    ----------
    fname : file path or buffer
        Where to save the dta file.
    data : array-like
        Array-like input to save. Pandas objects are also accepted.
    convert_dates : dict
        Dictionary mapping column of datetime types to the stata internal
        format that you want to use for the dates. Options are
        'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
        number or a name.
    encoding : str
        Default is latin-1. Note that Stata does not support unicode.
    byteorder : str
        Can be ">", "<", "little", or "big". The default is None which uses
        `sys.byteorder`

    Returns
    -------
    writer : StataWriter instance
        The StataWriter instance has a write_file method, which will
        write the file to the given `fname`.

    Examples
    --------
    >>> writer = StataWriter('./data_file.dta', data)
    >>> writer.write_file()

    Or with dates

    >>> writer = StataWriter('./date_data_file.dta', data, {2 : 'tw'})
    >>> writer.write_file()
    """
    #type          code
    #--------------------
    #str1        1 = 0x01
    #str2        2 = 0x02
    #...
    #str244    244 = 0xf4
    #byte      251 = 0xfb  (sic)
    #int       252 = 0xfc
    #long      253 = 0xfd
    #float     254 = 0xfe
    #double    255 = 0xff
    #--------------------
    #NOTE: the byte type seems to be reserved for categorical variables
    # with a label, but the underlying variable is -127 to 100
    # we're going to drop the label and cast to int
    DTYPE_MAP = dict(lzip(lrange(1,245), ['a' + str(i) for i in range(1,245)]) + \
                    [(251, np.int16),(252, np.int32),(253, int),
                        (254, np.float32), (255, np.float64)])
    TYPE_MAP = lrange(251) + list('bhlfd')
    MISSING_VALUES = {
        'b': 101,
        'h': 32741,
        'l': 2147483621,
        'f': 1.7014118346046923e+38,
        'd': 8.98846567431158e+307
    }

    def __init__(self,
                 fname,
                 data,
                 convert_dates=None,
                 encoding="latin-1",
                 byteorder=None):

        self._convert_dates = convert_dates
        # attach nobs, nvars, data, varlist, typlist
        if data_util._is_using_pandas(data, None):
            self._prepare_pandas(data)

        elif data_util._is_array_like(data, None):
            data = np.asarray(data)
            if data_util._is_structured_ndarray(data):
                self._prepare_structured_array(data)
            else:
                if convert_dates is not None:
                    raise ValueError("Not able to convert dates in a plain"
                                     " ndarray.")
                self._prepare_ndarray(data)

        else:  # pragma : no cover
            raise ValueError("Type %s for data not understood" % type(data))

        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._encoding = encoding
        self._file = get_file_obj(fname, 'wb', encoding)

    def _write(self, to_write):
        """
        Helper to call asbytes before writing to file for Python 3 compat.
        """
        self._file.write(asbytes(to_write))

    def _prepare_structured_array(self, data):
        self.nobs = len(data)
        self.nvar = len(data.dtype)
        self.data = data
        self.datarows = iter(data)
        dtype = data.dtype
        descr = dtype.descr
        if dtype.names is None:
            varlist = _default_names(self.nvar)
        else:
            varlist = dtype.names

        # check for datetime and change the type
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates, varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                descr[key] = (descr[key][0],
                              _convert_datetime_to_stata_type(
                                  convert_dates[key]))
            dtype = np.dtype(descr)

        self.varlist = varlist
        self.typlist = [
            _dtype_to_stata_type(dtype[i]) for i in range(self.nvar)
        ]
        self.fmtlist = [
            _dtype_to_default_stata_fmt(dtype[i]) for i in range(self.nvar)
        ]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def _prepare_ndarray(self, data):
        if data.ndim == 1:
            data = data[:, None]
        self.nobs, self.nvar = data.shape
        self.data = data
        self.datarows = iter(data)
        #TODO: this should be user settable
        dtype = data.dtype
        self.varlist = _default_names(self.nvar)
        self.typlist = [_dtype_to_stata_type(dtype) for i in range(self.nvar)]
        self.fmtlist = [
            _dtype_to_default_stata_fmt(dtype) for i in range(self.nvar)
        ]

    def _prepare_pandas(self, data):
        #NOTE: we might need a different API / class for pandas objects so
        # we can set different semantics - handle this with a PR to pandas.io
        class DataFrameRowIter(object):
            def __init__(self, data):
                self.data = data

            def __iter__(self):
                for i, row in data.iterrows():
                    yield row

        data = data.reset_index()
        self.datarows = DataFrameRowIter(data)
        self.nobs, self.nvar = data.shape
        self.data = data
        self.varlist = data.columns.tolist()
        dtypes = data.dtypes
        convert_dates = self._convert_dates
        if convert_dates is not None:
            convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                       self.varlist)
            self._convert_dates = convert_dates
            for key in convert_dates:
                new_type = _convert_datetime_to_stata_type(convert_dates[key])
                dtypes[key] = np.dtype(new_type)
        self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes]
        self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes]
        # set the given format for the datetime cols
        if convert_dates is not None:
            for key in convert_dates:
                self.fmtlist[key] = convert_dates[key]

    def write_file(self):
        self._write_header()
        self._write_descriptors()
        self._write_variable_labels()
        # write 5 zeros for expansion fields
        self._write(_pad_bytes("", 5))
        if self._convert_dates is None:
            self._write_data_nodates()
        else:
            self._write_data_dates()
        #self._write_value_labels()

    def _write_header(self, data_label=None, time_stamp=None):
        byteorder = self._byteorder
        # ds_format - just use 114
        self._write(pack("b", 114))
        # byteorder
        self._write(byteorder == ">" and "\x01" or "\x02")
        # filetype
        self._write("\x01")
        # unused
        self._write("\x00")
        # number of vars, 2 bytes
        self._write(pack(byteorder + "h", self.nvar)[:2])
        # number of obs, 4 bytes
        self._write(pack(byteorder + "i", self.nobs)[:4])
        # data label 81 bytes, char, null terminated
        if data_label is None:
            self._write(
                self._null_terminate(_pad_bytes("", 80), self._encoding))
        else:
            self._write(
                self._null_terminate(_pad_bytes(data_label[:80], 80),
                                     self._encoding))
        # time stamp, 18 bytes, char, null terminated
        # format dd Mon yyyy hh:mm
        if time_stamp is None:
            time_stamp = datetime.datetime.now()
        elif not isinstance(time_stamp, datetime.datetime):
            raise ValueError("time_stamp should be datetime type")
        self._write(
            self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M"),
                                 self._encoding))

    def _write_descriptors(self,
                           typlist=None,
                           varlist=None,
                           srtlist=None,
                           fmtlist=None,
                           lbllist=None):
        nvar = self.nvar
        # typlist, length nvar, format byte array
        for typ in self.typlist:
            self._write(typ)

        # varlist, length 33*nvar, char array, null terminated
        for name in self.varlist:
            name = self._null_terminate(name, self._encoding)
            name = _pad_bytes(asstr(name[:32]), 33)
            self._write(name)

        # srtlist, 2*(nvar+1), int array, encoded by byteorder
        srtlist = _pad_bytes("", (2 * (nvar + 1)))
        self._write(srtlist)

        # fmtlist, 49*nvar, char array
        for fmt in self.fmtlist:
            self._write(_pad_bytes(fmt, 49))

        # lbllist, 33*nvar, char array
        #NOTE: this is where you could get fancy with pandas categorical type
        for i in range(nvar):
            self._write(_pad_bytes("", 33))

    def _write_variable_labels(self, labels=None):
        nvar = self.nvar
        if labels is None:
            for i in range(nvar):
                self._write(_pad_bytes("", 81))

    def _write_data_nodates(self):
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        typlist = self.typlist
        for row in data:
            #row = row.squeeze().tolist() # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                if typ <= 244:  # we've got a string
                    if len(var) < typ:
                        var = _pad_bytes(asstr(var), len(var) + 1)
                    self._write(var)
                else:
                    try:
                        self._write(pack(byteorder + TYPE_MAP[typ], var))
                    except struct_error:
                        # have to be strict about type pack won't do any
                        # kind of casting
                        self._write(
                            pack(byteorder + TYPE_MAP[typ],
                                 _type_converters[typ](var)))

    def _write_data_dates(self):
        convert_dates = self._convert_dates
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        MISSING_VALUES = self.MISSING_VALUES
        typlist = self.typlist
        for row in data:
            #row = row.squeeze().tolist() # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                #NOTE: If anyone finds this terribly slow, there is
                # a vectorized way to convert dates, see genfromdta for going
                # from int to datetime and reverse it. will copy data though
                if i in convert_dates:
                    var = _datetime_to_stata_elapsed(var, self.fmtlist[i])
                if typ <= 244:  # we've got a string
                    if isnull(var):
                        var = ""  # missing string
                    if len(var) < typ:
                        var = _pad_bytes(var, len(var) + 1)
                    self._write(var)
                else:
                    if isnull(var):  # this only matters for floats
                        var = MISSING_VALUES[typ]
                    self._write(pack(byteorder + TYPE_MAP[typ], var))

    def _null_terminate(self, s, encoding):
        null_byte = '\x00'
        if PY3:
            s += null_byte
            return s.encode(encoding)
        else:
            s += null_byte
            return s
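
As a complement to the docstring examples above, the following is a hedged sketch of the structured-array path; the dtype and file name are illustrative.

# Hedged sketch: write a NumPy structured array (illustrative data and path).
import numpy as np

arr = np.array([(1, 2.5), (2, 3.5), (3, 4.5)],
               dtype=[('id', 'i4'), ('value', 'f8')])
writer = StataWriter('example.dta', arr)   # variable names come from the dtype
writer.write_file()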
Exemplo n.º 29
0
def plot_partregress(endog,
                     exog_i,
                     exog_others,
                     data=None,
                     title_kwargs={},
                     obs_labels=True,
                     label_kwargs={},
                     ax=None,
                     ret_coords=False,
                     **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : {ndarray, str}
       endogenous or response variable. If a string is given, you can use
       arbitrary transformations as with a formula.
    exog_i : {ndarray, str}
        exogenous, explanatory variable. If a string is given, you can use
        arbitrary transformations as with a formula.
    exog_others : {ndarray, list[str]}
        other exogenous, explanatory variables. If a list of strings is given,
        each item is a term in the formula. You can use arbitrary
        transformations as with a formula. The effect of these variables will
        be removed by OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array_like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    **kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.

    Examples
    --------
    Load the Statewide Crime data set and plot partial regression of the rate
    of high school graduation (hs_grad) on the murder rate (murder).

    The effects of the percent of the population living in urban areas (urban),
    below the poverty line (poverty), and in a single person household (single)
    are removed by OLS regression.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
    ...                              exog_others=['urban', 'poverty', 'single'],
    ...                              data=crime_data.data, obs_labels=False)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress.py

    More detailed examples can be found in the Regression Plots notebook
    on the examples page.

    """
    #NOTE: there is no interaction between possible missing data and
    #obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, str):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, str):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others
    RHS_isempty = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_isempty = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isempty = True
    if isinstance(exog_i, str):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like

    if RHS_isempty:
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = 'x' if isinstance(exog_i,
                                              np.ndarray) else exog_i.name
        y_axis_endog_name = 'y' if isinstance(
            endog, np.ndarray) else endog.design_info.column_names[0]
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()

    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    #NOTE: if we want to get super fancy, we could annotate if a point is
    #clicked using this widget
    #http://stackoverflow.com/questions/4652439/
    #is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    #4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        #NOTE: row_labels can be None.
        #Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array_like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)),
                                 obs_labels,
                                 lzip(res_xaxis.resid, res_yaxis.resid),
                                 [(0, 5)] * len(obs_labels),
                                 "x-large",
                                 ax=ax,
                                 **label_kwargs)

    if ret_coords:
        return fig, (res_xaxis.resid, res_yaxis.resid)
    else:
        return fig
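
A short, hedged illustration of the `ret_coords` option documented above, reusing the statecrime example; the annotation text and placement are illustrative.

# Hedged sketch: use ret_coords to add a custom annotation.
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

crime_data = sm.datasets.statecrime.load_pandas()
fig, (xres, yres) = sm.graphics.plot_partregress(
    endog='murder', exog_i='hs_grad',
    exog_others=['urban', 'poverty', 'single'],
    data=crime_data.data, obs_labels=False, ret_coords=True)
xres, yres = np.asarray(xres), np.asarray(yres)
i = yres.argmax()                       # observation with the largest residual
fig.axes[0].annotate('largest residual', (xres[i], yres[i]))
plt.show()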
Exemplo n.º 30
0
    print(shannonentropy(Y))

    p = [1e-5, 1e-4, .001, .01, .1, .15, .2, .25, .3, .35, .4, .45, .5]

    plt.subplot(111)
    plt.ylabel("Information")
    plt.xlabel("Probability")
    x = np.linspace(0, 1, 100001)
    plt.plot(x, shannoninfo(x))
    #    plt.show()

    plt.subplot(111)
    plt.ylabel("Entropy")
    plt.xlabel("Probability")
    x = np.linspace(0, 1, 101)
    plt.plot(x, lmap(shannonentropy, lzip(x, 1 - x)))
    #    plt.show()

    # define a joint probability distribution
    # from Golan (2008) table 3.3
    w = np.array([[0, 0, 1. / 3], [1 / 9., 1 / 9., 1 / 9.],
                  [1 / 18., 1 / 9., 1 / 6.]])
    # table 3.4
    px = w.sum(0)
    py = w.sum(1)
    H_X = shannonentropy(px)
    H_Y = shannonentropy(py)
    H_XY = shannonentropy(w)
    H_XgivenY = condentropy(px, py, w)
    H_YgivenX = condentropy(py, px, w)
    # note that cross-entropy is not a distance measure as the following shows
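    # Hedged, self-contained illustration (plain numpy, not the helpers above):
    # the chain rule H(X, Y) = H(X) + H(Y|X) holds for the Golan table, while
    # cross-entropy is asymmetric, so it cannot serve as a distance measure.
    def _H(p):
        p = np.asarray(p, dtype=float).ravel()
        p = p[p > 0]
        return -(p * np.log2(p)).sum()

    def _crossent(p, q):
        p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
        mask = p > 0
        return -(p[mask] * np.log2(q[mask])).sum()

    # columns of w index X here, since px = w.sum(0)
    H_y_given_x = sum(px[j] * _H(w[:, j] / px[j]) for j in range(w.shape[1]))
    print(np.isclose(_H(w), _H(px) + H_y_given_x))      # chain rule -> True
    print(_crossent(px, py), _crossent(py, px))         # unequal -> asymmetric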
Exemplo n.º 31
0
def add_lag(x, col=None, lags=1, drop=False, insert=True):
    """
    Returns an array with lags included given an array.

    Parameters
    ----------
    x : array
        An array or NumPy ndarray subclass. Can be either a 1d or 2d array with
        observations in columns.
    col : 'string', int, or None
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column containing the variable. Or `col` can
        be an int of the zero-based column index. If it's a 1d array `col`
        can be None.
    lags : int
        The number of lags desired.
    drop : bool
        Whether to keep the contemporaneous variable for the data.
    insert : bool or int
        If True, inserts the lagged values after `col`. If False, appends
        the data. If int inserts the lags at int.

    Returns
    -------
    array : ndarray
        Array with lags

    Examples
    --------

    >>> import statsmodels.api as sm
    >>> data = sm.datasets.macrodata.load(as_pandas=False)
    >>> data = data.data[['year','quarter','realgdp','cpi']]
    >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2)

    Notes
    -----
    Trims the array both forward and backward, so that the length of the
    returned array is len(`X`) - lags. The lags are returned in increasing
    order, i.e., t-1, t-2, ..., t-lags
    """
    if x.dtype.names:
        names = x.dtype.names
        if not col and np.squeeze(x).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        elif len(names) == 1:
            col = names[0]
        if isinstance(col, (int, long)):
            col = x.dtype.names[col]
        if not PY3:
            # TODO: Get rid of this kludge.  See GH # 3658
            names = [bytes(name)
                     if isinstance(name, unicode)  # noqa:F821
                     else name for name in names]
            # Fail loudly if there is a non-ascii name.
            x.dtype.names = names
            if isinstance(col, unicode):  # noqa:F821
                col = bytes(col)

        contemp = x[col]

        # make names for lags
        tmp_names = [col + '_'+'L(%i)' % i for i in range(1, lags+1)]
        ndlags = lagmat(contemp, maxlag=lags, trim='Both')

        # get index for return
        if insert is True:
            ins_idx = list(names).index(col) + 1
        elif insert is False:
            ins_idx = len(names) + 1
        else: # insert is an int
            if insert > len(names):
                import warnings
                warnings.warn("insert > number of variables, inserting at the"
                              " last position", ValueWarning)
            ins_idx = insert

        first_names = list(names[:ins_idx])
        last_names = list(names[ins_idx:])

        if drop:
            if col in first_names:
                first_names.pop(first_names.index(col))
            else:
                last_names.pop(last_names.index(col))

        if first_names: # only do this if x isn't "empty"
            # Workaround to avoid NumPy FutureWarning
            _x = recarray_select(x, first_names)
            first_arr = nprf.append_fields(_x[lags:], tmp_names, ndlags.T,
                                           usemask=False)

        else:
            first_arr = np.zeros(len(x)-lags, dtype=lzip(tmp_names,
                (x[col].dtype,)*lags))
            for i,name in enumerate(tmp_names):
                first_arr[name] = ndlags[:,i]
        if last_names:
            return nprf.append_fields(first_arr, last_names,
                    [x[name][lags:] for name in last_names], usemask=False)
        else: # lags for last variable
            return first_arr

    else: # we have an ndarray

        if x.ndim == 1: # make 2d if 1d
            x = x[:,None]
        if col is None:
            col = 0

        # handle negative index
        if col < 0:
            col = x.shape[1] + col

        contemp = x[:,col]

        if insert is True:
            ins_idx = col + 1
        elif insert is False:
            ins_idx = x.shape[1]
        else:
            if insert < 0: # handle negative index
                insert = x.shape[1] + insert + 1
            if insert > x.shape[1]:
                insert = x.shape[1]
                import warnings
                warnings.warn("insert > number of variables, inserting at the"
                              " last position", ValueWarning)
            ins_idx = insert

        ndlags = lagmat(contemp, lags, trim='Both')
        first_cols = lrange(ins_idx)
        last_cols = lrange(ins_idx,x.shape[1])
        if drop:
            if col in first_cols:
                first_cols.pop(first_cols.index(col))
            else:
                last_cols.pop(last_cols.index(col))
        return np.column_stack((x[lags:,first_cols],ndlags,
                    x[lags:,last_cols]))
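
A hedged sketch of the plain-ndarray branch above; the data are illustrative, and add_lag is assumed to be the function defined here (also exposed as statsmodels.tsa.tsatools.add_lag).

# Hedged sketch: lag column 0 of a plain 2-d array by two periods.
import numpy as np

x = np.arange(20, dtype=float).reshape(10, 2)
lagged = add_lag(x, col=0, lags=2)
print(lagged.shape)   # (8, 4): 2 rows trimmed, 2 lag columns inserted after col 0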
Exemplo n.º 32
0
def add_lag(x, col=None, lags=1, drop=False, insert=True):
    """
    Returns an array with lags included given an array.

    Parameters
    ----------
    x : array
        An array or NumPy ndarray subclass. Can be either a 1d or 2d array with
        observations in columns.
    col : 'string', int, or None
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column containing the variable. Or `col` can
        be an int of the zero-based column index. If it's a 1d array `col`
        can be None.
    lags : int
        The number of lags desired.
    drop : bool
        Whether to keep the contemporaneous variable for the data.
    insert : bool or int
        If True, inserts the lagged values after `col`. If False, appends
        the data. If int inserts the lags at int.

    Returns
    -------
    array : ndarray
        Array with lags

    Examples
    --------

    >>> import statsmodels.api as sm
    >>> data = sm.datasets.macrodata.load()
    >>> data = data.data[['year','quarter','realgdp','cpi']]
    >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2)

    Notes
    -----
    Trims the array both forward and backward, so that the length of the
    returned array is len(`X`) - lags. The lags are returned in increasing
    order, i.e., t-1, t-2, ..., t-lags
    """
    if x.dtype.names:
        names = x.dtype.names
        if not col and np.squeeze(x).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        elif len(names) == 1:
            col = names[0]
        if isinstance(col, int):
            col = x.dtype.names[col]
        contemp = x[col]

        # make names for lags
        tmp_names = [col + '_'+'L(%i)' % i for i in range(1,lags+1)]
        ndlags = lagmat(contemp, maxlag=lags, trim='Both')

        # get index for return
        if insert is True:
            ins_idx = list(names).index(col) + 1
        elif insert is False:
            ins_idx = len(names) + 1
        else: # insert is an int
            if insert > len(names):
                import warnings
                warnings.warn("insert > number of variables, inserting at the"
                              " last position",
                              UserWarning)
            ins_idx = insert

        first_names = list(names[:ins_idx])
        last_names = list(names[ins_idx:])

        if drop:
            if col in first_names:
                first_names.pop(first_names.index(col))
            else:
                last_names.pop(last_names.index(col))

        if first_names: # only do this if x isn't "empty"
            first_arr = nprf.append_fields(x[first_names][lags:],tmp_names,
                        ndlags.T, usemask=False)
        else:
            first_arr = np.zeros(len(x)-lags, dtype=lzip(tmp_names,
                (x[col].dtype,)*lags))
            for i,name in enumerate(tmp_names):
                first_arr[name] = ndlags[:,i]
        if last_names:
            return nprf.append_fields(first_arr, last_names,
                    [x[name][lags:] for name in last_names], usemask=False)
        else: # lags for last variable
            return first_arr

    else: # we have an ndarray

        if x.ndim == 1: # make 2d if 1d
            x = x[:,None]
        if col is None:
            col = 0

        # handle negative index
        if col < 0:
            col = x.shape[1] + col

        contemp = x[:,col]

        if insert is True:
            ins_idx = col + 1
        elif insert is False:
            ins_idx = x.shape[1]
        else:
            if insert < 0: # handle negative index
                insert = x.shape[1] + insert + 1
            if insert > x.shape[1]:
                insert = x.shape[1]
                import warnings
                warnings.warn("insert > number of variables, inserting at the"
                              " last position",
                              UserWarning)
            ins_idx = insert

        ndlags = lagmat(contemp, lags, trim='Both')
        first_cols = lrange(ins_idx)
        last_cols = lrange(ins_idx,x.shape[1])
        if drop:
            if col in first_cols:
                first_cols.pop(first_cols.index(col))
            else:
                last_cols.pop(last_cols.index(col))
        return np.column_stack((x[lags:,first_cols],ndlags,
                    x[lags:,last_cols]))
Exemplo n.º 33
0
def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs={}, obs_labels=True, label_kwargs={},
                     ax=None, ret_coords=False, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray or string
       endogenous or response variable. If a string is given, you can use
       arbitrary transformations as with a formula.
    exog_i : ndarray or string
        exogenous, explanatory variable. If a string is given, you can use
        arbitrary transformations as with a formula.
    exog_others : ndarray or list of strings
        other exogenous, explanatory variables. If a list of strings is given,
        each item is a term in the formula. You can use arbitrary
        transformations as with a formula. The effect of these variables will
        be removed by OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array-like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.
    """
    #NOTE: there is no interaction between possible missing data and
    #obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, string_types):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, string_types):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others
    RHS_isempty = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_isempty = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isempty = True
    if isinstance(exog_i, string_types):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like

    if RHS_isempty:
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name
        y_axis_endog_name = 'y' if isinstance(endog, np.ndarray) else endog.design_info.column_names[0]
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()

    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    #NOTE: if we want to get super fancy, we could annotate if a point is
    #clicked using this widget
    #http://stackoverflow.com/questions/4652439/
    #is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    #4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        #NOTE: row_labels can be None.
        #Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array-like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(res_xaxis.resid, res_yaxis.resid),
                                 [(0, 5)] * len(obs_labels), "x-large", ax=ax,
                                 **label_kwargs)

    if ret_coords:
        return fig, (res_xaxis.resid, res_yaxis.resid)
    else:
        return fig
Exemplo n.º 34
0
def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds):
    """create new results instance with robust covariance as default

    Parameters
    ----------
    cov_type : str
        the type of robust sandwich estimator to use. see Notes below
    use_t : bool
        If true, then the t distribution is used for inference.
        If false, then the normal distribution is used.
    kwds : depends on cov_type
        Required or optional arguments for robust covariance calculation.
        see Notes below

    Returns
    -------
    results : results instance
        This method creates a new results instance with the requested
        robust covariance as the default covariance of the parameters.
        Inferential statistics like p-values and hypothesis tests will be
        based on this covariance matrix.

    Notes
    -----
    Warning: Some of the options and defaults in cov_kwds may be changed in a
    future version.

    The covariance keywords provide an option 'scaling_factor' to adjust the
    scaling of the covariance matrix, that is the covariance is multiplied by
    this factor if it is given and is not `None`. This allows the user to
    adjust the scaling of the covariance matrix to match other statistical
    packages.
    For example, `scaling_factor=(nobs - 1.) / (nobs - k_params)` provides a
    correction so that the robust covariance matrices match those of Stata in
    some models like GLM and discrete Models.

    The following covariance types and required or optional arguments are
    currently available:

    - 'HC0', 'HC1', 'HC2', 'HC3': heteroscedasticity robust covariance

      - no keyword arguments

    - 'HAC': heteroskedasticity-autocorrelation robust covariance

      ``maxlags`` :  integer, required
        number of lags to use

      ``kernel`` : {callable, str}, optional
        kernels currently available kernels are ['bartlett', 'uniform'],
        default is Bartlett

      ``use_correction``: bool, optional
        If true, use small sample correction

    - 'cluster': clustered covariance estimator

      ``groups`` : array_like[int], required :
        Integer-valued index of clusters or groups.

      ``use_correction``: bool, optional
        If True the sandwich covariance is calculated with a small
        sample correction.
        If False the sandwich covariance is calculated without
        small sample correction.

      ``df_correction``: bool, optional
        If True (default), then the degrees of freedom for the
        inferential statistics and hypothesis tests, such as
        pvalues, f_pvalue, conf_int, and t_test and f_test, are
        based on the number of groups minus one instead of the
        total number of observations minus the number of explanatory
        variables. `df_resid` of the results instance is also
        adjusted. When `use_t` is also True, then pvalues are
        computed using the Student's t distribution using the
        corrected values. These may differ substantially from
        p-values based on the normal distribution if the number of
        groups is small.
        If False, then `df_resid` of the results instance is not
        adjusted.


    - 'hac-groupsum': Driscoll and Kraay, heteroscedasticity and
      autocorrelation robust covariance for panel data
      # TODO: more options needed here

      ``time`` : array_like, required
        index of time periods
      ``maxlags`` : integer, required
        number of lags to use
      ``kernel`` : {callable, str}, optional
        The available kernels are ['bartlett', 'uniform']. The default is
        Bartlett.
      ``use_correction`` : {False, 'hac', 'cluster'}, optional
        If False the sandwich covariance is calculated without small
        sample correction. If `use_correction = 'cluster'` (default),
        then the same small sample correction as in the case of
        `covtype='cluster'` is used.
      ``df_correction`` : bool, optional
        The adjustment to df_resid, see cov_type 'cluster' above

    - 'hac-panel': heteroscedasticity and autocorrelation robust standard
      errors in panel data. The data needs to be sorted in this case, the
      time series for each panel unit or cluster need to be stacked. The
      membership to a time series of an individual or group can be either
      specified by group indicators or by increasing time periods. One of
      ``groups`` or ``time`` is required. # TODO: we need more options here

      ``groups`` : array_like[int]
        indicator for groups
      ``time`` : array_like[int]
        index of time periods
      ``maxlags`` : int, required
        number of lags to use
      ``kernel`` : {callable, str}, optional
        Available kernels are ['bartlett', 'uniform'], default
        is Bartlett
      ``use_correction`` : {False, 'hac', 'cluster'}, optional
        If False the sandwich covariance is calculated without
        small sample correction.
      ``df_correction`` : bool, optional
        Adjustment to df_resid, see cov_type 'cluster' above

    **Reminder**: ``use_correction`` in "hac-groupsum" and "hac-panel" is
    not bool, needs to be in {False, 'hac', 'cluster'}.

    .. todo:: Currently there is no check for extra or misspelled keywords,
         except in the case of cov_type `HCx`
    """

    import statsmodels.stats.sandwich_covariance as sw

    cov_type = normalize_cov_type(cov_type)

    if 'kernel' in kwds:
        kwds['weights_func'] = kwds.pop('kernel')
    if 'weights_func' in kwds and not callable(kwds['weights_func']):
        kwds['weights_func'] = sw.kernel_dict[kwds['weights_func']]

    # pop because HCx raises if any kwds
    sc_factor = kwds.pop('scaling_factor', None)

    # TODO: make separate function that returns a robust cov plus info
    use_self = kwds.pop('use_self', False)
    if use_self:
        res = self
    else:
        # this does not work for most models, use raw instance instead from fit
        res = self.__class__(self.model,
                             self.params,
                             normalized_cov_params=self.normalized_cov_params,
                             scale=self.scale)

    res.cov_type = cov_type
    # use_t might already be defined by the class, and already set
    if use_t is None:
        use_t = self.use_t
    res.cov_kwds = {'use_t': use_t}  # store for information
    res.use_t = use_t

    adjust_df = False
    if cov_type in ['cluster', 'hac-panel', 'hac-groupsum']:
        df_correction = kwds.get('df_correction', None)
        # TODO: check also use_correction, do I need all combinations?
        if df_correction is not False:  # i.e. in [None, True]:
            # user did not explicitly set it to False
            adjust_df = True

    res.cov_kwds['adjust_df'] = adjust_df

    # verify and set kwds, and calculate cov
    # TODO: this should be outsourced in a function so we can reuse it in
    #       other models
    # TODO: make it DRYer   repeated code for checking kwds
    if cov_type.upper() in ('HC0', 'HC1', 'HC2', 'HC3'):
        if kwds:
            raise ValueError('heteroscedasticity robust covariance '
                             'does not use keywords')
        res.cov_kwds['description'] = descriptions[cov_type.upper()]

        res.cov_params_default = getattr(self, 'cov_' + cov_type.upper(), None)
        if res.cov_params_default is None:
            # results classes that do not have cov_HCx attribute
            res.cov_params_default = sw.cov_white_simple(self,
                                                         use_correction=False)
    elif cov_type.lower() == 'hac':
        maxlags = kwds['maxlags']  # required?, default in cov_hac_simple
        res.cov_kwds['maxlags'] = maxlags
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        use_correction = kwds.get('use_correction', False)
        res.cov_kwds['use_correction'] = use_correction
        res.cov_kwds['description'] = descriptions['HAC'].format(
            maxlags=maxlags, correction=['without', 'with'][use_correction])

        res.cov_params_default = sw.cov_hac_simple(
            self,
            nlags=maxlags,
            weights_func=weights_func,
            use_correction=use_correction)
    elif cov_type.lower() == 'cluster':
        #cluster robust standard errors, one- or two-way
        groups = kwds['groups']
        if not hasattr(groups, 'shape'):
            groups = np.asarray(groups).T

        if groups.ndim >= 2:
            groups = groups.squeeze()

        res.cov_kwds['groups'] = groups
        use_correction = kwds.get('use_correction', True)
        res.cov_kwds['use_correction'] = use_correction
        if groups.ndim == 1:
            if adjust_df:
                # need to find number of groups
                # duplicate work
                self.n_groups = n_groups = len(np.unique(groups))
            res.cov_params_default = sw.cov_cluster(
                self, groups, use_correction=use_correction)

        elif groups.ndim == 2:
            if hasattr(groups, 'values'):
                groups = groups.values

            if adjust_df:
                # need to find number of groups
                # duplicate work
                n_groups0 = len(np.unique(groups[:, 0]))
                n_groups1 = len(np.unique(groups[:, 1]))
                self.n_groups = (n_groups0, n_groups1)
                n_groups = min(n_groups0, n_groups1)  # use for adjust_df

            # Note: sw.cov_cluster_2groups has 3 returns
            res.cov_params_default = sw.cov_cluster_2groups(
                self, groups, use_correction=use_correction)[0]
        else:
            raise ValueError('only two groups are supported')
        res.cov_kwds['description'] = descriptions['cluster']

    elif cov_type.lower() == 'hac-panel':
        #cluster robust standard errors
        res.cov_kwds['time'] = time = kwds.get('time', None)
        res.cov_kwds['groups'] = groups = kwds.get('groups', None)
        #TODO: nlags is currently required
        #nlags = kwds.get('nlags', True)
        #res.cov_kwds['nlags'] = nlags
        #TODO: `nlags` or `maxlags`
        res.cov_kwds['maxlags'] = maxlags = kwds['maxlags']
        use_correction = kwds.get('use_correction', 'hac')
        res.cov_kwds['use_correction'] = use_correction
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        # TODO: clumsy time index in cov_nw_panel
        if groups is not None:
            groups = np.asarray(groups)
            tt = (np.nonzero(groups[:-1] != groups[1:])[0] + 1).tolist()
            nobs_ = len(groups)
        elif time is not None:
            # TODO: clumsy time index in cov_nw_panel
            time = np.asarray(time)
            tt = (np.nonzero(time[1:] < time[:-1])[0] + 1).tolist()
            nobs_ = len(time)
        else:
            raise ValueError('either time or groups needs to be given')
        groupidx = lzip([0] + tt, tt + [nobs_])
        self.n_groups = n_groups = len(groupidx)
        res.cov_params_default = sw.cov_nw_panel(self,
                                                 maxlags,
                                                 groupidx,
                                                 weights_func=weights_func,
                                                 use_correction=use_correction)
        res.cov_kwds['description'] = descriptions['HAC-Panel']

    elif cov_type.lower() == 'hac-groupsum':
        # Driscoll-Kraay standard errors
        res.cov_kwds['time'] = time = kwds['time']
        #TODO: nlags is currently required
        #nlags = kwds.get('nlags', True)
        #res.cov_kwds['nlags'] = nlags
        #TODO: `nlags` or `maxlags`
        res.cov_kwds['maxlags'] = maxlags = kwds['maxlags']
        use_correction = kwds.get('use_correction', 'cluster')
        res.cov_kwds['use_correction'] = use_correction
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        if adjust_df:
            # need to find number of groups
            tt = (np.nonzero(time[1:] < time[:-1])[0] + 1)
            self.n_groups = n_groups = len(tt) + 1
        res.cov_params_default = sw.cov_nw_groupsum(
            self,
            maxlags,
            time,
            weights_func=weights_func,
            use_correction=use_correction)
        res.cov_kwds['description'] = descriptions['HAC-Groupsum']
    else:
        raise ValueError('cov_type not recognized. See docstring for ' +
                         'available options and spelling')

    # generic optional factor to scale covariance

    res.cov_kwds['scaling_factor'] = sc_factor
    if sc_factor is not None:
        res.cov_params_default *= sc_factor

    if adjust_df:
        # Note: df_resid is used for scale and others, add new attribute
        res.df_resid_inference = n_groups - 1

    return res
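
A hedged usage sketch for the cluster case described above; the simulated data and group structure are illustrative only.

# Hedged sketch: cluster-robust covariance via get_robustcov_results.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
groups = np.repeat(np.arange(20), 5)          # 20 clusters of 5 observations
x = sm.add_constant(rng.normal(size=100))
y = x @ np.array([1.0, 0.5]) + rng.normal(size=100)

res = sm.OLS(y, x).fit()
robust = res.get_robustcov_results(cov_type='cluster', groups=groups)
print(robust.bse)                             # cluster-robust standard errors
print(robust.cov_kwds['description'])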
Exemplo n.º 35
0
def summary_params_2d(result,
                      extras=None,
                      endog_names=None,
                      exog_names=None,
                      title=None):
    '''create summary table of regression parameters with several equations

    This allows interleaving of parameters with bse and/or tvalues

    Parameters
    ----------
    result : result instance
        the result instance with params and attributes in extras
    extras : list of strings
        additional attributes to add below a parameter row, e.g. bse or tvalues
    endog_names : None or list of strings
        names for rows of the parameter array (multivariate endog)
    exog_names : None or list of strings
        names for columns of the parameter array (exog)
    alpha : float
        level for confidence intervals, default 0.95
    title : None or string

    Returns
    -------
    tables : list of SimpleTable
        this contains a list of all separate Subtables
    table_all : SimpleTable
        the merged table with results concatenated for each row of the parameter
        array

    '''
    if endog_names is None:
        # TODO: note the [1:] is specific to current MNLogit
        endog_names = [
            'endog_%d' % i for i in np.unique(result.model.endog)[1:]
        ]
    if exog_names is None:
        exog_names = ['var%d' % i for i in range(len(result.params))]

    # TODO: check formatting options with different values
    res_params = [[forg(item, prec=4) for item in row]
                  for row in result.params]
    if extras:
        extras_list = [[[
            '%10s' % ('(' + forg(v, prec=3).strip() + ')') for v in col
        ] for col in getattr(result, what)] for what in extras]
        data = lzip(res_params, *extras_list)
        data = [i for j in data for i in j]  #flatten
        stubs = lzip(endog_names, *[[''] * len(endog_names)] * len(extras))
        stubs = [i for j in stubs for i in j]  #flatten
    else:
        data = res_params
        stubs = endog_names

    txt_fmt = copy.deepcopy(fmt_params)
    txt_fmt["data_fmts"] = ["%s"] * result.params.shape[1]

    return SimpleTable(data,
                       headers=exog_names,
                       stubs=stubs,
                       title=title,
                       txt_fmt=txt_fmt)
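
A hedged sketch with a tiny stand-in result object, purely to show the interleaving of params and bse; in practice a fitted MNLogit results instance would be passed. It assumes the same module-level imports (np, forg, lzip, copy, fmt_params, SimpleTable) used by the function above.

# Hedged sketch with a minimal fake result object (illustrative values only).
import numpy as np

class _FakeModel:
    endog = np.array([0, 1, 2, 1, 2])        # three categories -> two equations

class _FakeResult:
    model = _FakeModel()
    params = np.array([[0.5, -1.2, 0.3],      # one row of parameters per equation
                       [1.1, 0.0, -0.7]])
    bse = np.abs(params) / 10.0               # stand-in standard errors

print(summary_params_2d(_FakeResult(), extras=['bse'],
                        exog_names=['const', 'x1', 'x2']))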
Exemplo n.º 36
0
    kern = kernels.Gaussian(h=h)
    kde = KDE(x, kern)
    print(kde.density(1.015469))
    print(0.2034675)  # reference value for comparison
    Xs = np.arange(-10,10,0.1)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(Xs, kde(Xs), "-")
    ax.set_ylim(-10, 10)
    ax.set_ylim(0,0.4)


    # 2-D case
    x = lzip(kdetest.faithfulData["eruptions"], kdetest.faithfulData["waiting"])
    x = np.array(x)
    x = (x - x.mean(0))/x.std(0)
    nobs = x.shape[0]
    H = kdetest.Hpi
    kern = kernels.NdKernel(2)
    kde = KDE(x, kern)
    print(kde.density(np.matrix([1, 2])))  # .T
    plt.figure()
    plt.plot(x[:,0], x[:,1], 'o')


    n_grid = 50
    xsp = np.linspace(x.min(0)[0], x.max(0)[0], n_grid)
    ysp = np.linspace(x.min(0)[1], x.max(0)[1], n_grid)
#    xsorted = np.sort(x)
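    # A possible continuation (a sketch only): evaluate kde.density on the
    # grid, mirroring the single-point call above, and draw contour lines.
    dens = np.array([[kde.density(np.matrix([xi, yi])) for xi in xsp]
                     for yi in ysp])
    plt.contour(xsp, ysp, dens)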
Exemplo n.º 37
0
def summary(self,
            yname=None,
            xname=None,
            title=0,
            alpha=.05,
            returns='text',
            model_info=None):
    """
    Parameters
    ----------
    yname : string
            optional, default is `Y`
    xname : list of strings
            optional, default is `X.#` for # in p, the number of regressors
    alpha : float
            confidence level for the confidence intervals, in (0, 1); default is .05
    title : string
            optional, default is 'Generalized linear model'
    returns : string
              'text', 'table', 'csv', 'latex', 'html'

    Returns
    -------
    Default :
    returns='print'
            Prints the summarized results

    Option :
    returns='text'
            Prints the summarized results

    Option :
    returns='table'
             SimpleTable instance : summarizing the fit of a linear model.

    Option :
    returns='csv'
            returns a string of csv of the results, to import into a spreadsheet

    Option :
    returns='latex'
    Not implemented yet

    Option :
    returns='HTML'
    Not implemented yet


    Examples
    --------
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load(as_pandas=False)
    >>> data.exog = sm.add_constant(data.exog)
    >>> ols_results = sm.OLS(data.endog, data.exog).fit()
    >>> print(ols_results.summary())
    ...

    Notes
    -----
    conf_int calculated from normal dist.
    """

    # TODO: Make sure all self.model.__class__.__name__ are listed
    model_types = {
        'OLS': 'Ordinary least squares',
        'GLS': 'Generalized least squares',
        'GLSAR': 'Generalized least squares with AR(p)',
        'WLS': 'Weighted least squares',
        'RLM': 'Robust linear model',
        'GLM': 'Generalized linear model'
    }
    if title == 0:
        title = model_types[self.model.__class__.__name__]

    yname, xname = _getnames(self, yname, xname)

    time_now = time.localtime()
    time_of_day = [time.strftime("%H:%M:%S", time_now)]
    date = time.strftime("%a, %d %b %Y", time_now)
    modeltype = self.model.__class__.__name__
    nobs = self.nobs
    df_model = self.df_model
    df_resid = self.df_resid

    # General part of the summary table, applicable to all(?) models
    # --------------------------------------------------------------
    # TODO: define this generically, overwrite in model classes
    # replace definition of stubs data by single list
    # e.g.
    gen_left = [
        ('Model type:', [modeltype]),
        ('Date:', [date]),
        ('Dependent Variable:',
         yname),  # TODO: What happens with multiple names?
        ('df model', [df_model])
    ]
    gen_stubs_left, gen_data_left = zip_longest(*gen_left)  #transpose row col

    gen_title = title
    gen_header = None
    gen_table_left = SimpleTable(gen_data_left,
                                 gen_header,
                                 gen_stubs_left,
                                 title=gen_title,
                                 txt_fmt=gen_fmt)

    gen_stubs_right = ('Method:', 'Time:', 'Number of Obs:', 'df resid')
    gen_data_right = (
        [modeltype],  #was dist family need to look at more
        time_of_day,
        [nobs],
        [df_resid])
    gen_table_right = SimpleTable(gen_data_right,
                                  gen_header,
                                  gen_stubs_right,
                                  title=gen_title,
                                  txt_fmt=gen_fmt)
    gen_table_left.extend_right(gen_table_right)
    general_table = gen_table_left

    # Parameters part of the summary table
    # ------------------------------------
    # Note: this is not necessary since we standardized names,
    #  only t versus normal
    tstats = {
        'OLS': self.t(),
        'GLS': self.t(),
        'GLSAR': self.t(),
        'WLS': self.t(),
        'RLM': self.t(),
        'GLM': self.t()
    }
    prob_stats = {
        'OLS': self.pvalues,
        'GLS': self.pvalues,
        'GLSAR': self.pvalues,
        'WLS': self.pvalues,
        'RLM': self.pvalues,
        'GLM': self.pvalues
    }
    # Dictionary to store the header names for the parameter part of the
    # summary table. look up by modeltype
    alp = str((1 - alpha) * 100) + '%'
    param_header = {
        'OLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLSAR': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'WLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLM': ['coef', 'std err', 't', 'P>|t|',
                alp + ' Conf. Interval'],  # glm uses t-distribution
        'RLM': ['coef', 'std err', 'z', 'P>|z|',
                alp + ' Conf. Interval']  # check: z for RLM
    }
    params_stubs = xname
    params = self.params
    conf_int = self.conf_int(alpha)
    std_err = self.bse
    exog_len = lrange(len(xname))
    tstat = tstats[modeltype]
    prob_stat = prob_stats[modeltype]

    # SimpleTable should be able to handle the formatting
    params_data = lzip(["%#6.4g" % (params[i]) for i in exog_len],
                       ["%#6.4f" % (std_err[i]) for i in exog_len],
                       ["%#6.4f" % (tstat[i]) for i in exog_len],
                       ["%#6.4f" % (prob_stat[i]) for i in exog_len],
                       ["(%#5g, %#5g)" % tuple(conf_int[i]) for i in exog_len])
    parameter_table = SimpleTable(params_data,
                                  param_header[modeltype],
                                  params_stubs,
                                  title=None,
                                  txt_fmt=fmt_2)

    #special table
    #-------------
    #TODO: exists in linear_model, what about other models
    #residual diagnostics

    #output options
    #--------------
    #TODO: JP the rest needs to be fixed, similar to summary in linear_model

    def ols_printer():
        """
        print summary table for ols models
        """
        table = str(general_table) + '\n' + str(parameter_table)
        return table

    def glm_printer():
        table = str(general_table) + '\n' + str(parameter_table)
        return table

    printers = {'OLS': ols_printer, 'GLM': glm_printer}

    if returns == 'print':
        try:
            return printers[modeltype]()
        except KeyError:
            return printers['OLS']()
Exemplo n.º 38
0
    print(shannonentropy(Y))

    p = [1e-5,1e-4,.001,.01,.1,.15,.2,.25,.3,.35,.4,.45,.5]

    plt.subplot(111)
    plt.ylabel("Information")
    plt.xlabel("Probability")
    x = np.linspace(0,1,100001)
    plt.plot(x, shannoninfo(x))
#    plt.show()

    plt.subplot(111)
    plt.ylabel("Entropy")
    plt.xlabel("Probability")
    x = np.linspace(0,1,101)
    plt.plot(x, lmap(shannonentropy, lzip(x,1-x)))
#    plt.show()

    # define a joint probability distribution
    # from Golan (2008) table 3.3
    w = np.array([[0,0,1./3],[1/9.,1/9.,1/9.],[1/18.,1/9.,1/6.]])
    # table 3.4
    px = w.sum(0)
    py = w.sum(1)
    H_X = shannonentropy(px)
    H_Y = shannonentropy(py)
    H_XY = shannonentropy(w)
    H_XgivenY = condentropy(px,py,w)
    H_YgivenX = condentropy(py,px,w)
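    # A small follow-up sketch using the quantities above: the mutual
    # information I(X;Y) = H(X) + H(Y) - H(X,Y) is nonnegative and summarizes
    # the dependence encoded in the joint table w.
    I_XY = H_X + H_Y - H_XY
    print(I_XY)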
    # note that relative entropy (KL divergence) is not a distance measure
    # (it is not symmetric), as the following shows
    D_YX = logbasechange(2, np.e) * stats.entropy(px, py)
Exemplo n.º 39
0
    xihistory = np.zeros((2,nobs))
    for i in range(1,nobs):
        xihistory[:, i] = (np.dot(F, xihistory[:, i-1]) +
                           np.dot(cholQ, np.random.randn(2, 1)).squeeze())
        # this makes an ARMA process?
        # check notes, do the math
    y = np.dot(H.T, xihistory)
    y = y.T

    params = np.array([rho, sigma1, sigma2])
    penalty = 1e5
    upperbounds = np.array([.999, 100, 100])
    lowerbounds = np.array([-.999, .001, .001])
    xi10 = xihistory[:,0]
    ntrain = 1
    bounds = lzip(lowerbounds,upperbounds) # if you use fmin_l_bfgs_b
#    results = optimize.fmin_bfgs(updatematrices, params,
#        args=(y,xi10,ntrain,penalty,upperbounds,lowerbounds),
#        gtol = 1e-8, epsilon=1e-10)
#    array([ 0.83111567,  1.2695249 ,  0.61436685])

    F = lambda x : np.array([[x[0],0],[0,0]])

    def Q(x):
        cholQ = np.array([[x[1],0],[0,x[2]]])
        return np.dot(cholQ,cholQ.T)

    H = np.ones((2,1))
#    ssm_model = StateSpaceModel(y)  # need to pass in Xi10!
#    ssm_model.fit_kalman(start_params=params, xi10=xi10, F=F, Q=Q, H=H,
#            upperbounds=upperbounds, lowerbounds=lowerbounds)
Exemplo n.º 40
0
def categorical(data, col=None, dictnames=False, drop=False, ):
    '''
    Returns a dummy matrix given an array of categorical variables.

    Parameters
    ----------
    data : array
        A structured array, recarray, or array.  This can be either
        a 1d vector of the categorical variable or a 2d array with
        the column specifying the categorical variable specified by the col
        argument.
    col : {str, int, None}
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column that contains the variable.  For all
        arrays `col` can be an int that is the (zero-based) column index
        number.  `col` can only be None for a 1d array.  The default is None.
    dictnames : bool, optional
        If True, a dictionary mapping the column number to the categorical
        name is returned.  Used to have information about plain arrays.
    drop : bool
        Whether or not to keep the categorical variable in the returned matrix.

    Returns
    -------
    dummy_matrix, [dictnames, optional]
        A matrix of dummy (indicator/binary) float variables for the
        categorical data.  If dictnames is True, then the dictionary
        is returned as well.

    Notes
    -----
    This returns a dummy variable for EVERY distinct category.  If a
    structured array or recarray is provided, the name for each new variable
    is the old variable name - underscore - category name.  So if a variable
    'vote' had answers 'yes' or 'no', then the returned array would have two
    new variables -- 'vote_yes' and 'vote_no'.  There is currently
    no name checking.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    Univariate examples

    >>> import string
    >>> string_var = [string.ascii_lowercase[0:5], string.ascii_lowercase[5:10], \
                string.ascii_lowercase[10:15], string.ascii_lowercase[15:20],  \
                string.ascii_lowercase[20:25]]
    >>> string_var *= 5
    >>> string_var = np.asarray(sorted(string_var))
    >>> design = sm.tools.categorical(string_var, drop=True)

    Or for a numerical categorical variable

    >>> instr = np.floor(np.arange(10,60, step=2)/10)
    >>> design = sm.tools.categorical(instr, drop=True)

    With a structured array

    >>> num = np.random.randn(25,2)
    >>> struct_ar = np.zeros((25,1), dtype=[('var1', 'f4'),('var2', 'f4'),  \
                    ('instrument','f4'),('str_instr','a5')])
    >>> struct_ar['var1'] = num[:,0][:,None]
    >>> struct_ar['var2'] = num[:,1][:,None]
    >>> struct_ar['instrument'] = instr[:,None]
    >>> struct_ar['str_instr'] = string_var[:,None]
    >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)

    Or

    >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
    '''
    if isinstance(col, (list, tuple)):
        if len(col) == 1:
            col = col[0]
        else:
            raise ValueError("Can only convert one column at a time")

    # TODO: add a NameValidator function
    # catch recarrays and structured arrays
    if data.dtype.names or data.__class__ is np.recarray:
        if not col and np.squeeze(data).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        if isinstance(col, int):
            col = data.dtype.names[col]
        if col is None and data.dtype.names and len(data.dtype.names) == 1:
            col = data.dtype.names[0]

        tmp_arr = np.unique(data[col])

        # if the cols are shape (#,) vs (#,1) need to add an axis and flip
        _swap = True
        if data[col].ndim == 1:
            tmp_arr = tmp_arr[:, None]
            _swap = False
        tmp_dummy = (tmp_arr == data[col]).astype(float)
        if _swap:
            tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)

        if not tmp_arr.dtype.names:  # how do we get to this code path?
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)]
        elif tmp_arr.dtype.names:
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())]

        # prepend the varname and underscore, if col is numeric attribute
        # lookup is lost for recarrays...
        if col is None:
            try:
                col = data.dtype.names[0]
            except (TypeError, IndexError):
                col = 'var'
        # TODO: the above needs to be made robust because there could be many
        # var_yes, var_no variables for instance.
        tmp_arr = [col + '_' + item for item in tmp_arr]
        # TODO: test this for rec and structured arrays!!!

        if drop is True:
            if len(data.dtype) <= 1:
                if tmp_dummy.shape[0] < tmp_dummy.shape[1]:
                    tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)
                dt = lzip(tmp_arr, [tmp_dummy.dtype.str]*len(tmp_arr))
                # preserve array type
                return np.array(lmap(tuple, tmp_dummy.tolist()),
                                dtype=dt).view(type(data))

            data = nprf.drop_fields(data, col, usemask=False,
                                    asrecarray=type(data) is np.recarray)
        data = nprf.append_fields(data, tmp_arr, data=tmp_dummy,
                                  usemask=False,
                                  asrecarray=type(data) is np.recarray)
        return data

    # handle ndarrays and catch array-like for an error
    elif data.__class__ is np.ndarray or not isinstance(data, np.ndarray):
        if not isinstance(data, np.ndarray):
            raise NotImplementedError("Array-like objects are not supported")

        if isinstance(col, int):
            offset = data.shape[1]          # need error catching here?
            tmp_arr = np.unique(data[:, col])
            tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                offset -= 1
                data = np.delete(data, col, axis=1).astype(float)
            data = np.column_stack((data, tmp_dummy))
            if dictnames is True:
                col_map = _make_dictnames(tmp_arr, offset)
                return data, col_map
            return data
        elif col is None and np.squeeze(data).ndim == 1:
            tmp_arr = np.unique(data)
            tmp_dummy = (tmp_arr[:, None] == data).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr)
                    return tmp_dummy, col_map
                return tmp_dummy
            else:
                data = np.column_stack((data, tmp_dummy))
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr, offset=1)
                    return data, col_map
                return data
        else:
            raise IndexError("The index %s is not understood" % col)
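A minimal usage sketch for the plain-array path above; the exact form of the returned name mapping comes from _make_dictnames, so it is described in the comments rather than asserted:

import numpy as np
import statsmodels.api as sm

instr = np.floor(np.arange(10, 60, step=2) / 10)   # 25 obs, 5 numeric categories
dummies, names = sm.tools.categorical(instr, dictnames=True, drop=True)
# dummies is a (25, 5) float indicator matrix, one column per category;
# names maps each dummy column index back to its category value.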
Exemplo n.º 41
0
stop_on_error = True


filelist = ['example_glsar.py', 'example_wls.py', 'example_gls.py',
            'example_glm.py', 'example_ols_tftest.py', #'example_rpy.py',
            'example_ols.py', 'example_ols_minimal.py', 'example_rlm.py',
            'example_discrete.py', 'example_predict.py',
            'example_ols_table.py',
            'tut_ols.py', 'tut_ols_rlm.py', 'tut_ols_wls.py']

use_glob = True
if use_glob:
    import glob
    filelist = glob.glob('*.py')

print(lzip(range(len(filelist)), filelist))

for fname in ['run_all.py', 'example_rpy.py']:
    if fname in filelist:   # only present when the glob is used
        filelist.remove(fname)

#filelist = filelist[15:]



#temporarily disable show
plt_show = plt.show
def noop(*args):
    pass
plt.show = noop

cont = input("""Are you sure you want to run all of the examples?
Exemplo n.º 42
0
def read_ch(fname):
    with open(fname) as f:
        lines = f.readlines()
    ps, rs, vs, qs = lzip(*[line.split(',') for line in lines])
    return lmap(float, ps), lmap(float, rs), lmap(float, vs), lmap(float, qs)
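A usage sketch (the file name is hypothetical; the function expects a plain text file with four comma-separated numeric columns per line):

p, r, v, q = read_ch('channel_data.csv')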
Exemplo n.º 43
0
def summary_params(results,
                   yname=None,
                   xname=None,
                   alpha=.05,
                   use_t=True,
                   skip_header=False,
                   title=None):
    '''create a summary table for the parameters

    Parameters
    ----------
    results : results instance
        some required information is directly taken from the result
        instance
    yname : string or None
        optional name for the endogenous variable, default is "y"
    xname : list of strings or None
        optional names for the exogenous variables, default is "var_xx"
    alpha : float
        significance level for the confidence intervals
    use_t : bool
        indicator whether the p-values are based on the Student-t
        distribution (if True) or on the normal distribution (if False)
    skip_header : bool
        If False (default), then the header row is added. If True, then no
        header row is added.
    title : string or None
        optional title placed above the table

    Returns
    -------
    params_table : SimpleTable instance
    '''

    # Parameters part of the summary table
    # ------------------------------------
    # Note: this is not necessary since we standardized names,
    #   only t versus normal

    if isinstance(results, tuple):
        # for multivariate endog
        # TODO: check whether I don't want to refactor this
        #we need to give parameter alpha to conf_int
        results, params, std_err, tvalues, pvalues, conf_int = results
    else:
        params = results.params
        std_err = results.bse
        tvalues = results.tvalues  #is this sometimes called zvalues
        pvalues = results.pvalues
        conf_int = results.conf_int(alpha)

    # Dictionary to store the header names for the parameter part of the
    # summary table. look up by modeltype
    if use_t:
        param_header = [
            'coef', 'std err', 't', 'P>|t|', '[' + str(alpha / 2),
            str(1 - alpha / 2) + ']'
        ]
    else:
        param_header = [
            'coef', 'std err', 'z', 'P>|z|', '[' + str(alpha / 2),
            str(1 - alpha / 2) + ']'
        ]

    if skip_header:
        param_header = None

    _, xname = _getnames(results, yname=yname, xname=xname)

    if len(xname) != len(params):
        raise ValueError('xnames and params do not have the same length')

    params_stubs = xname

    exog_idx = lrange(len(xname))

    params_data = lzip([forg(params[i], prec=4) for i in exog_idx],
                       [forg(std_err[i]) for i in exog_idx],
                       [forg(tvalues[i]) for i in exog_idx],
                       ["%#6.3f" % (pvalues[i]) for i in exog_idx],
                       [forg(conf_int[i, 0]) for i in exog_idx],
                       [forg(conf_int[i, 1]) for i in exog_idx])
    parameter_table = SimpleTable(params_data,
                                  param_header,
                                  params_stubs,
                                  title=title,
                                  txt_fmt=fmt_params)

    return parameter_table
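A minimal usage sketch, assuming the module-level helpers used above (_getnames, forg, SimpleTable, fmt_params) are importable alongside summary_params, and that the model is fit on plain NumPy arrays so that conf_int returns an array:

import numpy as np
import statsmodels.api as sm

x = sm.add_constant(np.random.standard_normal((100, 2)))
y = x.dot([1.0, 0.5, -0.3]) + np.random.standard_normal(100)
res = sm.OLS(y, x).fit()
print(summary_params(res, alpha=0.05, use_t=True))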
Exemplo n.º 44
0
def acf(x, unbiased=False, nlags=40, confint=None, qstat=False, fft=False,
        alpha=None):
    '''
    Autocorrelation function for 1d arrays.

    Parameters
    ----------
    x : array
       Time series data
    unbiased : bool
       If True, then denominators for autocovariance are n-k, otherwise n
    nlags: int, optional
        Number of lags to return autocorrelation for.
    confint : scalar, optional
        The use of confint is deprecated. See `alpha`.
        If a number is given, the confidence intervals for the given level are
        returned. For instance if confint=95, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett\'s formula.
    qstat : bool, optional
        If True, returns the Ljung-Box q statistic for each autocorrelation
        coefficient.  See q_stat for more information.
    fft : bool, optional
        If True, computes the ACF via FFT.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett\'s formula.

    Returns
    -------
    acf : array
        autocorrelation function
    confint : array, optional
        Confidence intervals for the ACF. Returned if confint is not None.
    qstat : array, optional
        The Ljung-Box Q-Statistic.  Returned if q_stat is True.
    pvalues : array, optional
        The p-values associated with the Q-statistics.  Returned if q_stat is
        True.

    Notes
    -----
    The acf at lag 0 (i.e., 1) is returned.

    This is based on np.correlate, which does full convolution. For very long
    time series it is recommended to use fft convolution instead.

    If unbiased is True, the denominator for the autocovariance is adjusted
    but the autocorrelation is not an unbiased estimator.
    '''
    nobs = len(x)
    d = nobs  # changes if unbiased
    if not fft:
        avf = acovf(x, unbiased=unbiased, demean=True)
        #acf = np.take(avf/avf[0], range(1,nlags+1))
        acf = avf[:nlags + 1] / avf[0]
    else:
        #JP: move to acovf
        x0 = x - x.mean()
        # ensure that we always use a power of 2 or 3 for zero-padding,
        # this way we'll ensure O(n log n) runtime of the fft.
        n = _next_regular(2 * nobs + 1)
        Frf = np.fft.fft(x0, n=n)  # zero-pad for separability
        if unbiased:
            d = nobs - np.arange(nobs)
        acf = np.fft.ifft(Frf * np.conjugate(Frf))[:nobs] / d
        acf /= acf[0]
        #acf = np.take(np.real(acf), range(1,nlags+1))
        acf = np.real(acf[:nlags + 1])   # keep lag 0
    if not (confint or qstat or alpha):
        return acf
    if confint is not None:
        import warnings
        warnings.warn("confint is deprecated. Please use the alpha keyword",
                      FutureWarning)
        varacf = np.ones(nlags + 1) / nobs
        varacf[0] = 0
        varacf[1] = 1. / nobs
        varacf[2:] *= 1 + 2 * np.cumsum(acf[1:-1]**2)
        interval = stats.norm.ppf(1 - (100 - confint) / 200.) * np.sqrt(varacf)
        confint = np.array(lzip(acf - interval, acf + interval))
        if not qstat:
            return acf, confint
    if alpha is not None:
        varacf = np.ones(nlags + 1) / nobs
        varacf[0] = 0
        varacf[1] = 1. / nobs
        varacf[2:] *= 1 + 2 * np.cumsum(acf[1:-1]**2)
        interval = stats.norm.ppf(1 - alpha / 2.) * np.sqrt(varacf)
        confint = np.array(lzip(acf - interval, acf + interval))
        if not qstat:
            return acf, confint
    if qstat:
        qstat, pvalue = q_stat(acf[1:], nobs=nobs)  # drop lag 0
        if (confint is not None or alpha is not None):
            return acf, confint, qstat, pvalue
        else:
            return acf, qstat, pvalue
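A usage sketch for this (older, confint-deprecated) version, using the alpha keyword instead; it assumes the module imports that acf relies on (acovf, stats, q_stat) are available:

import numpy as np

np.random.seed(0)
x = np.random.standard_normal(200).cumsum()     # a random-walk series
r, ci = acf(np.diff(x), nlags=20, alpha=0.05)   # ACF of the differenced series
# r has nlags + 1 entries (lag 0 included); ci holds the Bartlett confidence bands.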
Exemplo n.º 45
0
                      freq=_freq_to_pandas[freq])) - 1


_quarter_to_day = {
    "1": (3, 31),
    "2": (6, 30),
    "3": (9, 30),
    "4": (12, 31),
    "I": (3, 31),
    "II": (6, 30),
    "III": (9, 30),
    "IV": (12, 31)
}

_mdays = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
_months_with_days = lzip(lrange(1, 13), _mdays)
_month_to_day = dict(zip(map(str, lrange(1, 13)), _months_with_days))
_month_to_day.update(
    dict(
        zip([
            "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI",
            "XII"
        ], _months_with_days)))
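# Illustrative lookups (added for clarity; the values follow directly from the
# tables defined above):
#   _quarter_to_day['III'] -> (9, 30)   last day of the third quarter
#   _month_to_day['3']     -> (3, 31)   last day of March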

# regex patterns
_y_pattern = r'^\d?\d?\d?\d$'

_q_pattern = r'''
^               # beginning of string
\d?\d?\d?\d     # match any number 1-9999, includes leading zeros
Exemplo n.º 46
0
def acf(x,
        unbiased=False,
        nlags=40,
        qstat=False,
        fft=False,
        alpha=None,
        missing='none'):
    """
    Autocorrelation function for 1d arrays.

    Parameters
    ----------
    x : array
       Time series data
    unbiased : bool
       If True, then denominators for autocovariance are n-k, otherwise n
    nlags: int, optional
        Number of lags to return autocorrelation for.
    qstat : bool, optional
        If True, returns the Ljung-Box q statistic for each autocorrelation
        coefficient.  See q_stat for more information.
    fft : bool, optional
        If True, computes the ACF via FFT.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett\'s formula.
    missing : str, optional
        A string in ['none', 'raise', 'conservative', 'drop'] specifying how the NaNs
        are to be treated.

    Returns
    -------
    acf : array
        autocorrelation function
    confint : array, optional
        Confidence intervals for the ACF. Returned if alpha is not None.
    qstat : array, optional
        The Ljung-Box Q-Statistic.  Returned if q_stat is True.
    pvalues : array, optional
        The p-values associated with the Q-statistics.  Returned if q_stat is
        True.

    Notes
    -----
    The acf at lag 0 (i.e., 1) is returned.

    This is based on np.correlate, which does full convolution. For very long
    time series it is recommended to use fft convolution instead.

    If unbiased is True, the denominator for the autocovariance is adjusted
    but the autocorrelation is not an unbiased estimator.

    References
    ----------
    .. [*] Parzen, E., 1963. On spectral analysis with missing observations
       and amplitude modulation. Sankhya: The Indian Journal of
       Statistics, Series A, pp.383-392.

    """
    nobs = len(x)  # should this shrink for missing='drop' and NaNs in x?
    avf = acovf(x, unbiased=unbiased, demean=True, fft=fft, missing=missing)
    acf = avf[:nlags + 1] / avf[0]
    if not (qstat or alpha):
        return acf
    if alpha is not None:
        varacf = np.ones(nlags + 1) / nobs
        varacf[0] = 0
        varacf[1] = 1. / nobs
        varacf[2:] *= 1 + 2 * np.cumsum(acf[1:-1]**2)
        interval = stats.norm.ppf(1 - alpha / 2.) * np.sqrt(varacf)
        confint = np.array(lzip(acf - interval, acf + interval))
        if not qstat:
            return acf, confint
    if qstat:
        qstat, pvalue = q_stat(acf[1:], nobs=nobs)  # drop lag 0
        if alpha is not None:
            return acf, confint, qstat, pvalue
        else:
            return acf, qstat, pvalue
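A usage sketch for the Ljung-Box output path of this version (assumes the surrounding module imports such as acovf and q_stat):

import numpy as np

np.random.seed(12345)
x = np.random.standard_normal(500)
r, q, p = acf(x, nlags=10, qstat=True)   # no alpha -> (acf, Q-statistics, p-values)
# q and p have one entry per lag 1..nlags; large p-values are consistent with
# white noise.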
Exemplo n.º 47
0
def categorical(data, col=None, dictnames=False, drop=False):
    '''
    Returns a dummy matrix given an array of categorical variables.

    Parameters
    ----------
    data : array
        A structured array, recarray, array, Series or DataFrame.  This can be
        either a 1d vector of the categorical variable or a 2d array with
        the column specifying the categorical variable specified by the col
        argument.
    col : {str, int, None}
        If data is a DataFrame, col must be the name of a column in data. If
        data is a Series, col must be either the name of the Series or None.
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column that contains the variable.  For all
        other arrays `col` can be an int that is the (zero-based) column index
        number.  `col` can only be None for a 1d array.  The default is None.
    dictnames : bool, optional
        If True, a dictionary mapping the column number to the categorical
        name is returned.  Used to have information about plain arrays.
    drop : bool
        Whether or not to keep the categorical variable in the returned matrix.

    Returns
    -------
    dummy_matrix, [dictnames, optional]
        A matrix of dummy (indicator/binary) float variables for the
        categorical data.  If dictnames is True, then the dictionary
        is returned as well.

    Notes
    -----
    This returns a dummy variable for EVERY distinct category.  If a
    structured array or recarray is provided, the name for each new variable
    is the old variable name - underscore - category name.  So if a variable
    'vote' had answers 'yes' or 'no', then the returned array would have two
    new variables -- 'vote_yes' and 'vote_no'.  There is currently
    no name checking.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    Univariate examples

    >>> import string
    >>> string_var = [string.ascii_lowercase[0:5], \
                      string.ascii_lowercase[5:10], \
                      string.ascii_lowercase[10:15], \
                      string.ascii_lowercase[15:20],   \
                      string.ascii_lowercase[20:25]]
    >>> string_var *= 5
    >>> string_var = np.asarray(sorted(string_var))
    >>> design = sm.tools.categorical(string_var, drop=True)

    Or for a numerical categorical variable

    >>> instr = np.floor(np.arange(10,60, step=2)/10)
    >>> design = sm.tools.categorical(instr, drop=True)

    With a structured array

    >>> num = np.random.randn(25,2)
    >>> struct_ar = np.zeros((25,1), dtype=[('var1', 'f4'),('var2', 'f4'),  \
                    ('instrument','f4'),('str_instr','a5')])
    >>> struct_ar['var1'] = num[:,0][:,None]
    >>> struct_ar['var2'] = num[:,1][:,None]
    >>> struct_ar['instrument'] = instr[:,None]
    >>> struct_ar['str_instr'] = string_var[:,None]
    >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)

    Or

    >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
    '''
    # TODO: add a NameValidator function
    if isinstance(col, (list, tuple)):
        if len(col) == 1:
            col = col[0]
        else:
            raise ValueError("Can only convert one column at a time")
    if (not isinstance(data, (pd.DataFrame, pd.Series))
            and not isinstance(col, (string_types, int)) and col is not None):
        raise TypeError('col must be a str, int or None')

    # Pull out a Series from a DataFrame if provided
    if isinstance(data, pd.DataFrame):
        if col is None:
            raise TypeError('col must be a str or int when using a DataFrame')
        elif col not in data:
            raise ValueError('Column \'{0}\' not found in data'.format(col))
        data = data[col]
        # Set col to None since we now have a Series
        col = None

    if isinstance(data, pd.Series):
        if col is not None and data.name != col:
            raise ValueError('data.name does not match col '
                             '\'{0}\''.format(col))
        data_cat = pd.Categorical(data)
        dummies = pd.get_dummies(data_cat)
        col_map = {
            i: cat
            for i, cat in enumerate(data_cat.categories) if cat in dummies
        }
        if not drop:
            dummies.columns = list(dummies.columns)
            dummies = pd.concat([dummies, data], axis=1)
        if dictnames:
            return dummies, col_map
        return dummies
    # catch recarrays and structured arrays
    elif data.dtype.names or data.__class__ is np.recarray:
        if not col and np.squeeze(data).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        if isinstance(col, (int, long)):
            col = data.dtype.names[col]
        if col is None and data.dtype.names and len(data.dtype.names) == 1:
            col = data.dtype.names[0]

        tmp_arr = np.unique(data[col])

        # if the cols are shape (#,) vs (#,1) need to add an axis and flip
        _swap = True
        if data[col].ndim == 1:
            tmp_arr = tmp_arr[:, None]
            _swap = False
        tmp_dummy = (tmp_arr == data[col]).astype(float)
        if _swap:
            tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)

        if not tmp_arr.dtype.names:  # how do we get to this code path?
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)]
        elif tmp_arr.dtype.names:
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())]

        # prepend the varname and underscore, if col is numeric attribute
        # lookup is lost for recarrays...
        if col is None:
            try:
                col = data.dtype.names[0]
            except (TypeError, IndexError):
                col = 'var'
        # TODO: the above needs to be made robust because there could be many
        # var_yes, var_no variables for instance.
        tmp_arr = [col + '_' + item for item in tmp_arr]
        # TODO: test this for rec and structured arrays!!!

        if drop is True:
            if len(data.dtype) <= 1:
                if tmp_dummy.shape[0] < tmp_dummy.shape[1]:
                    tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)
                dt = lzip(tmp_arr, [tmp_dummy.dtype.str] * len(tmp_arr))
                # preserve array type
                return np.array(lmap(tuple, tmp_dummy.tolist()),
                                dtype=dt).view(type(data))

            data = nprf.drop_fields(data,
                                    col,
                                    usemask=False,
                                    asrecarray=type(data) is np.recarray)
        data = nprf.append_fields(data,
                                  tmp_arr,
                                  data=tmp_dummy,
                                  usemask=False,
                                  asrecarray=type(data) is np.recarray)
        return data

    # Catch array_like for an error
    elif not isinstance(data, np.ndarray):
        raise NotImplementedError("array_like objects are not supported")
    else:
        if isinstance(col, (int, long)):
            offset = data.shape[1]  # need error catching here?
            tmp_arr = np.unique(data[:, col])
            tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                offset -= 1
                data = np.delete(data, col, axis=1).astype(float)
            data = np.column_stack((data, tmp_dummy))
            if dictnames is True:
                col_map = _make_dictnames(tmp_arr, offset)
                return data, col_map
            return data
        elif col is None and np.squeeze(data).ndim == 1:
            tmp_arr = np.unique(data)
            tmp_dummy = (tmp_arr[:, None] == data).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr)
                    return tmp_dummy, col_map
                return tmp_dummy
            else:
                data = np.column_stack((data, tmp_dummy))
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr, offset=1)
                    return data, col_map
                return data
        else:
            raise IndexError("The index %s is not understood" % col)
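A usage sketch for the pandas path of this version (a sketch only; it assumes the function above is importable as categorical together with its pandas/compat imports):

import pandas as pd

ser = pd.Series(['yes', 'no', 'yes', 'no', 'yes'], name='vote')
only_dummies = categorical(ser, drop=True)    # one 0/1 column per category
with_original = categorical(ser, drop=False)  # dummies plus the original column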