Example #1
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.
        """
        none_array_names = []

        if exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog,)
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        if len(kwargs):
            for key, value_array in iteritems(kwargs):
                if value_array is None or value_array.ndim == 0:
                    none_array_names += [key]
                    continue
                # grab 1d arrays
                if value_array.ndim == 1:
                    combined += (value_array,)
                    combined_names += [key]
                elif value_array.squeeze().ndim == 1:
                    combined += (value_array,)
                    combined_names += [key]

                # grab 2d arrays that are _assumed_ to be symmetric
                elif value_array.ndim == 2:
                    combined_2d += (value_array,)
                    combined_2d_names += [key]
                else:
                    raise ValueError("Arrays with more than 2 dimensions "
                            "aren't yet handled")

        nan_mask = _nan_rows(*combined)
        if combined_2d:
            nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

        if missing == 'raise' and np.any(nan_mask):
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            nan_mask = ~nan_mask
            drop_nans = lambda x: cls._drop_nans(x, nan_mask)
            drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
            combined = dict(zip(combined_names, lmap(drop_nans, combined)))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names,
                                          lmap(drop_nans_2d, combined_2d))))
            if none_array_names:
                combined.update(dict(zip(none_array_names,
                                          [None] * len(none_array_names))))
            return combined, np.where(~nan_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)
Example #2
def test__reduce_dict():
    data = OrderedDict(zip(list(product('mf', 'oy', 'wn')), [1] * 8))
    eq(_reduce_dict(data, ('m',)), 4)
    eq(_reduce_dict(data, ('m', 'o')), 2)
    eq(_reduce_dict(data, ('m', 'o', 'w')), 1)
    data = OrderedDict(zip(list(product('mf', 'oy', 'wn')), lrange(8)))
    eq(_reduce_dict(data, ('m',)), 6)
    eq(_reduce_dict(data, ('m', 'o')), 1)
    eq(_reduce_dict(data, ('m', 'o', 'w')), 0)
Example #3
    def simulate(self):

        group_effect_var = self.dep_params[0]

        vcomp = self.dep_params[1:]
        vcomp.append(0)

        endog, exog, group, id_matrix = [], [], [], []

        for i in range(self.ngroups):

            iterators = [lrange(n) for n in self.nest_sizes]

            # The random effects
            variances = [np.sqrt(v)*np.random.normal(size=n)
                         for v,n in zip(vcomp, self.nest_sizes)]

            gpe = np.random.normal() * np.sqrt(group_effect_var)

            nest_all = []
            for j in self.nest_sizes:
                nest_all.append(set())

            for nest in product(*iterators):

                group.append(i)

                # The sum of all random effects that apply to this
                # unit
                ref = gpe + sum([v[j] for v,j in zip(variances, nest)])

                exog1 = np.random.normal(size=5)
                exog1[0] = 1
                exog.append(exog1)

                error = ref + self.error_sd * np.random.normal()

                endog1 = np.dot(exog1, self.params) + error
                endog.append(endog1)

                for j in range(len(nest)):
                    nest_all[j].add(tuple(nest[0:j+1]))

                nest1 = [len(x)-1 for x in nest_all]
                id_matrix.append(nest1[0:-1])

        self.exog = np.array(exog)
        self.endog = np.array(endog)
        self.group = np.array(group)
        self.id_matrix = np.array(id_matrix)
        self.time = np.zeros_like(self.endog)
Example #4
    def _band_quantiles(band, use_brute=use_brute, seed=seed):
        """
        Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``
        use_brute : bool
            Use the brute force optimizer instead of the default differential
            evolution to find the curves. Default is False.
        seed : {None, int, np.random.RandomState}
            Seed value to pass to scipy.optimize.differential_evolution. Can
            be an integer or RandomState instance. If None, then the default
            RandomState provided by np.random is used.


        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)

        """
        min_pdf = pvalues[alpha.index(band[0])]
        try:
            max_pdf = pvalues[alpha.index(band[1])]
        except IndexError:
            max_pdf = 1E6
        band = [min_pdf, max_pdf]

        pool = Pool()
        data = zip(range(dim), itertools.repeat((band, pca,
                                                 bounds, ks_gaussian,
                                                 seed, use_brute)))
        band_quantiles = pool.map(_min_max_band, data)
        pool.terminate()
        pool.close()

        band_quantiles = list(zip(*band_quantiles))

        return band_quantiles
Example #5
def test_freq_to_period():
    from pandas.tseries.frequencies import to_offset
    freqs = ['A', 'AS-MAR', 'Q', 'QS', 'QS-APR', 'W', 'W-MON', 'B']
    expected = [1, 1, 4, 4, 4, 52, 52, 52]
    for i, j in zip(freqs, expected):
        assert_equal(tools.freq_to_period(i), j)
        assert_equal(tools.freq_to_period(to_offset(i)), j)
Example #6
def test_ic():
    #test information criteria
    #consistency check

    ics = [aic, aicc, bic, hqic]
    ics_sig = [aic_sigma, aicc_sigma, bic_sigma, hqic_sigma]

    for ic, ic_sig in zip(ics, ics_sig):
        assert_(ic(np.array(2),10,2).dtype == np.float, msg=repr(ic))
        assert_(ic_sig(np.array(2),10,2).dtype == np.float, msg=repr(ic_sig) )

        assert_almost_equal(ic(-10./2.*np.log(2.),10,2)/10,
                            ic_sig(2, 10, 2),
                            decimal=14)

        assert_almost_equal(ic_sig(np.log(2.),10,2, islog=True),
                            ic_sig(2, 10, 2),
                            decimal=14)


    #examples penalty directly from formula
    n, k = 10, 2
    assert_almost_equal(aic(0, 10, 2), 2*k, decimal=14)
    #next see Wikipedia
    assert_almost_equal(aicc(0, 10, 2),
                        aic(0, n, k) + 2*k*(k+1.)/(n-k-1.), decimal=14)
    assert_almost_equal(bic(0, 10, 2), np.log(n)*k, decimal=14)
    assert_almost_equal(hqic(0, 10, 2), 2*np.log(np.log(n))*k, decimal=14)
Example #7
    def as_string(self, output_format='txt', **fmt_dict):
        """Return string: the formatted row.
        This is the default formatter for rows.
        Override this to get different formatting.
        A row formatter must accept as arguments
        a row (self) and an output format,
        one of ('html', 'txt', 'csv', 'latex').
        """
        fmt = self._get_fmt(output_format, **fmt_dict)

        # get column widths
        try:
            colwidths = self.table.get_colwidths(output_format, **fmt)
        except AttributeError:
            colwidths = fmt.get('colwidths')
        if colwidths is None:
            colwidths = (0,) * len(self)

        colsep = fmt['colsep']
        row_pre = fmt.get('row_pre', '')
        row_post = fmt.get('row_post', '')
        formatted_cells = []
        for cell, width in zip(self, colwidths):
            content = cell.format(width, output_format=output_format, **fmt)
            formatted_cells.append(content)
        formatted_row = row_pre + colsep.join(formatted_cells) + row_post
        formatted_row = self._decorate_below(formatted_row, output_format,
                                             **fmt)
        return formatted_row
Example #8
    def pooled_odds_ratio(self, tables):
        """
        Returns the pooled odds ratio for a list of 2x2 tables.

        The pooled odds ratio is the inverse variance weighted average
        of the sample odds ratios of the tables.
        """

        if len(tables) == 0:
            return 1.

        # Get the sampled odds ratios and variances
        log_oddsratio, var = [], []
        for table in tables:
            lor = np.log(table[1, 1]) + np.log(table[0, 0]) -\
                np.log(table[0, 1]) - np.log(table[1, 0])
            log_oddsratio.append(lor)
            var.append((1 / table.astype(np.float64)).sum())

        # Calculate the inverse variance weighted average
        wts = [1 / v for v in var]
        wtsum = sum(wts)
        wts = [w / wtsum for w in wts]
        log_pooled_or = sum([w * e for w, e in zip(wts, log_oddsratio)])

        return np.exp(log_pooled_or)
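As a standalone check of the inverse-variance weighting described above (the 2x2 tables below are made up for illustration):

import numpy as np

tables = [np.array([[10., 5.], [4., 12.]]),
          np.array([[20., 8.], [6., 15.]])]

log_or, var = [], []
for t in tables:
    # log odds ratio and its (Woolf) variance for one 2x2 table
    log_or.append(np.log(t[1, 1]) + np.log(t[0, 0])
                  - np.log(t[0, 1]) - np.log(t[1, 0]))
    var.append((1 / t).sum())

wts = np.array([1 / v for v in var])
wts /= wts.sum()
print(np.exp(np.dot(wts, log_or)))  # pooled odds ratio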
Example #9
def test_mosaic_simple():
    # display a simple plot of 4 categories of data, split into four
    # levels with increasing size for each group
    # creation of the levels
    key_set = (['male', 'female'], ['old', 'adult', 'young'],
               ['worker', 'unemployed'], ['healty', 'ill'])
    # the cartesian product of all the categories is
    # the complete set of categories
    keys = list(product(*key_set))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    # which colours should I use for the various categories?
    # put it into a dict
    props = {}
    #males and females in blue and red
    props[('male',)] = {'color': 'b'}
    props[('female',)] = {'color': 'r'}
    # all the groups corresponding to ill groups have a different color
    for key in keys:
        if 'ill' in key:
            if 'male' in key:
                props[key] = {'color': 'BlueViolet' , 'hatch': '+'}
            else:
                props[key] = {'color': 'Crimson' , 'hatch': '+'}
    # mosaic of the data, with given gaps and colors
    mosaic(data, gap=0.05, properties=props, axes_label=False)
    pylab.suptitle('syntetic data, 4 categories (plot 2 of 4)')
    #pylab.show()
    pylab.close('all')
Example #10
    def dataset(self, as_dict=False):
        """
        Returns a Python generator object for iterating over the dataset.


        Parameters
        ----------
        as_dict : bool, optional
            If as_dict is True, yield each row of observations as a dict.
            If False, yields each row of observations as a list.

        Returns
        -------
        Generator object for iterating over the dataset.  Yields each row of
        observations as a list by default.

        Notes
        -----
        If missing_values is True during instantiation of StataReader then
        observations with _StataMissingValue(s) are not filtered and should
        be handled by your application.
        """

        try:
            self._file.seek(self._data_location)
        except Exception:
            pass

        if as_dict:
            vars = lmap(str, self.variables())
            for i in range(len(self)):
                yield dict(zip(vars, self._next()))
        else:
            for i in range(self._header['nobs']):
                yield self._next()
Example #11
    def plot2d(self,ix=0,iy=1,clf=True):
        """
        Generates a 2-dimensional plot of the data set and principal components
        using matplotlib.

        ix specifies which p-dimension to put on the x-axis of the plot
        and iy specifies which to put on the y-axis (0-indexed)
        """
        import matplotlib.pyplot as plt
        x,y=self.N[:,ix],self.N[:,iy]
        if clf:
            plt.clf()
        plt.scatter(x,y)
        vals,evs=self.getEigensystem()
        #evx,evy=evs[:,ix],evs[:,iy]
        xl,xu=plt.xlim()
        yl,yu=plt.ylim()
        dx,dy=(xu-xl),(yu-yl)
        for val,vec,c in zip(vals,evs.T,self._colors):
            plt.arrow(0,0,val*vec[ix],val*vec[iy],head_width=0.05*(dx*dy/4)**0.5,fc=c,ec=c)
        #plt.arrow(0,0,vals[ix]*evs[ix,ix],vals[ix]*evs[iy,ix],head_width=0.05*(dx*dy/4)**0.5,fc='g',ec='g')
        #plt.arrow(0,0,vals[iy]*evs[ix,iy],vals[iy]*evs[iy,iy],head_width=0.05*(dx*dy/4)**0.5,fc='r',ec='r')
        if self.names is not None:
            plt.xlabel('$'+self.names[ix]+'/\\sigma$')
            plt.ylabel('$'+self.names[iy]+'/\\sigma$')
Example #12
 def evaluate(self, xeval, order=None):
     xeval = self._transform(xeval)
     if order is None:
         order = len(self.polys)
     res = sum(c*p(xeval) for c, p in list(zip(self.coeffs, self.polys))[:order])
     res = self._correction(res)
     return res
Example #13
 def setup_class(cls):
     data = sm.datasets.macrodata.load_pandas()
     cls.macro_df = data.data[['year', 'quarter', 'realgdp', 'cpi']]
     cls.random_data = np.random.randn(100)
     index = [str(int(yr)) + '-Q' + str(int(qu))
              for yr, qu in zip(cls.macro_df.year, cls.macro_df.quarter)]
     cls.macro_df.index = index
     cls.series = cls.macro_df.cpi
Example #14
def test_freq_to_period():
    from pandas.tseries.frequencies import to_offset

    freqs = ["A", "AS-MAR", "Q", "QS", "QS-APR", "W", "W-MON", "B", "D", "H"]
    expected = [1, 1, 4, 4, 4, 52, 52, 5, 7, 24]
    for i, j in zip(freqs, expected):
        assert_equal(tools.freq_to_period(i), j)
        assert_equal(tools.freq_to_period(to_offset(i)), j)
Example #15
def test_recursive_split():
    keys = list(product('mf'))
    data = OrderedDict(zip(keys, [1] * len(keys)))
    res = _hierarchical_split(data, gap=0)
    assert_(list(iterkeys(res)) == keys)
    res[('m',)] = (0.0, 0.0, 0.5, 1.0)
    res[('f',)] = (0.5, 0.0, 0.5, 1.0)
    keys = list(product('mf', 'yao'))
    data = OrderedDict(zip(keys, [1] * len(keys)))
    res = _hierarchical_split(data, gap=0)
    assert_(list(iterkeys(res)) == keys)
    res[('m', 'y')] = (0.0, 0.0, 0.5, 1 / 3)
    res[('m', 'a')] = (0.0, 1 / 3, 0.5, 1 / 3)
    res[('m', 'o')] = (0.0, 2 / 3, 0.5, 1 / 3)
    res[('f', 'y')] = (0.5, 0.0, 0.5, 1 / 3)
    res[('f', 'a')] = (0.5, 1 / 3, 0.5, 1 / 3)
    res[('f', 'o')] = (0.5, 2 / 3, 0.5, 1 / 3)
Example #16
def contrast_labels(contrasts, names, reverse=False):
    if reverse:
        sl = slice(None, None, -1)
    else:
        sl = slice(None)
    labels = [''.join(['%s%s' % (signstr(c, noplus=True),v)
                          for c,v in zip(row, names)[sl] if c != 0])
                             for row in contrasts]
    return labels
Example #17
 def variables(self):
     """
     Returns a list of the dataset's StataVariables objects.
     """
     return lmap(_StataVariable, zip(lrange(self._header['nvar']),
         self._header['typlist'], self._header['varlist'],
         self._header['srtlist'],
         self._header['fmtlist'], self._header['lbllist'],
         self._header['vlblist']))
Example #18
def _categories_level(keys):
    """use the Ordered dict to implement a simple ordered set
    return each level of each category
    [[key_1_level_1,key_2_level_1],[key_1_level_2,key_2_level_2]]
    """
    res = []
    for i in zip(*(keys)):
        tuplefied = _tuplify(i)
        res.append(list(OrderedDict([(j, None) for j in tuplefied])))
    return res
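For intuition, the OrderedDict trick above applied to hypothetical (gender, age) keys:

from collections import OrderedDict

keys = [('male', 'old'), ('male', 'young'),
        ('female', 'old'), ('female', 'young')]

levels = []
for level_values in zip(*keys):
    # OrderedDict keeps first-seen order while dropping duplicates
    levels.append(list(OrderedDict((v, None) for v in level_values)))

print(levels)  # [['male', 'female'], ['old', 'young']]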
Example #19
 def setUpClass(cls):
     data = sm.datasets.macrodata.load()
     cls.macro_data = data.data[["year", "quarter", "realgdp", "cpi"]]
     cls.random_data = np.random.randn(100)
     year = cls.macro_data["year"]
     quarter = cls.macro_data["quarter"]
     cls.macro_df = pd.DataFrame.from_records(cls.macro_data)
     index = [str(int(yr)) + "-Q" + str(int(qu)) for yr, qu in zip(cls.macro_df.year, cls.macro_df.quarter)]
     cls.macro_df.index = index
     cls.series = cls.macro_df.cpi
Example #20
 def setUpClass(cls):
     data = sm.datasets.macrodata.load()
     cls.macro_df = pd.DataFrame.from_records(data.data)
     cls.macro_df = cls.macro_df[['year', 'quarter', 'realgdp', 'cpi']]
     cls.macro_data = cls.macro_df.to_records(index=False)
     cls.random_data = np.random.randn(100)
     index = [str(int(yr)) + '-Q' + str(int(qu))
              for yr, qu in zip(cls.macro_df.year, cls.macro_df.quarter)]
     cls.macro_df.index = index
     cls.series = cls.macro_df.cpi
Example #21
    def setUpClass(cls):
        data = sm.datasets.macrodata.load()
        cls.macro_df = pd.DataFrame.from_records(data.data)
        cls.macro_df = cls.macro_df[['year', 'quarter', 'realgdp', 'cpi']]
        cls.macro_data = cls.macro_df.to_records(index=False)
        cls.random_data = np.random.randn(100)
        year = cls.macro_data['year']
        quarter = cls.macro_data['quarter']

        index = [str(int(yr)) + '-Q' + str(int(qu))
                 for yr, qu in zip(cls.macro_df.year, cls.macro_df.quarter)]
        cls.macro_df.index = index
        cls.series = cls.macro_df.cpi
Example #22
 def extend_right(self, table):
     """Return None.
     Extend each row of `self` with corresponding row of `table`.
     Does **not** import formatting from ``table``.
     This generally makes sense only if the two tables have
     the same number of rows, but that is not enforced.
     :note: To append a table below, just use `extend`,
     which is the ordinary list method.  This generally makes sense
     only if the two tables have the same number of columns,
     but that is not enforced.
     """
     for row1, row2 in zip(self, table):
         row1.extend(row2)
Example #23
    def smooth(self, xs, ys, x):
        """Returns the kernel smoothing estimate for point x based on x-values
        xs and y-values ys.
        Not expected to be called by the user.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs)>0:
            w = np.sum(self((xs-x)/self.h))
            #TODO: change the below to broadcasting when shape is sorted
            v = np.sum([yy*self((xx-x)/self.h) for xx, yy in zip(xs, ys)])
            return v / w
        else:
            return np.nan
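The per-point loop above can be vectorized once the shapes are settled (as the TODO notes); a standalone sketch of the same Nadaraya-Watson estimate, assuming 1-D data and a Gaussian kernel:

import numpy as np
from scipy import stats

def nw_smooth(xs, ys, x, h):
    # kernel weights for every sample point, then a weighted average
    w = stats.norm.pdf((xs - x) / h)
    return np.dot(w, ys) / w.sum()

xs = np.linspace(0, 10, 50)
ys = np.sin(xs) + 0.1 * np.random.randn(50)
print(nw_smooth(xs, ys, 5.0, h=0.5))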
Example #24
    def smoothvar(self, xs, ys, x):
        """Returns the kernel smoothing estimate of the variance at point x.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            sqresid = square(subtract(ys, fittedvals))
            w = np.sum(self((xs - x) / self.h))
            v = np.sum(
                [rr * self((xx - x) / self.h) for xx, rr in zip(xs, sqresid)])
            return v / w
        else:
            return np.nan
Example #25
def test_mosaic_very_complex():
    # make a scattermatrix of mosaic plots to show the correlations between
    # each pair of variable in a dataset. Could be easily converted into a
    # new function that does this automatically based on the type of data
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old',
                                     'young'], ['healty',
                                                'ill'], ['work', 'unemployed'])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {}
    props[('male', 'old')] = {'color': 'r'}
    props[('female', )] = {'color': 'pink'}
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i in range(L):
        for j in range(L):
            m = set(range(L)).difference(set((i, j)))
            if i == j:
                axes[i, i].text(0.5,
                                0.5,
                                key_name[i],
                                ha='center',
                                va='center')
                axes[i, i].set_xticks([])
                axes[i, i].set_xticklabels([])
                axes[i, i].set_yticks([])
                axes[i, i].set_yticklabels([])
            else:
                ji = max(i, j)
                ij = min(i, j)
                temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r]
                                                                 for r in m),
                                          v) for k, v in iteritems(data)])

                keys = list(iterkeys(temp_data))
                for k in keys:
                    value = _reduce_dict(temp_data, k[:2])
                    temp_data[k[:2]] = value
                    del temp_data[k]
                mosaic(temp_data,
                       ax=axes[i, j],
                       axes_label=False,
                       properties=props,
                       gap=0.05,
                       horizontal=i > j)
    pylab.suptitle('old males should look bright red,  (plot 4 of 4)')
    #pylab.show()
    pylab.close('all')
Example #26
def test_axes_labeling(close_figures):
    from numpy.random import rand
    key_set = (['male', 'female'], ['old', 'adult', 'young'],
               ['worker', 'unemployed'], ['yes', 'no'])
    # the cartesian product of all the categories is
    # the complete set of categories
    keys = list(product(*key_set))
    data = OrderedDict(zip(keys, rand(len(keys))))
    lab = lambda k: ''.join(s[0] for s in k)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
    mosaic(data, ax=ax1, labelizer=lab, horizontal=True, label_rotation=45)
    mosaic(data, ax=ax2, labelizer=lab, horizontal=False,
           label_rotation=[0, 45, 90, 0])
    #fig.tight_layout()
    fig.suptitle("correct alignment of the axes labels")
Example #27
    def summary(self):
        '''returns text summarizing the results

        uses the default pvalue correction of the instance stored in
        ``self.multitest_method``
        '''
        import statsmodels.stats.multitest as smt
        maxlevel = max((len(ss) for ss in self.all_pairs_names))

        text = 'Corrected p-values using %s p-value correction\n\n' % \
                        smt.multitest_methods_names[self.multitest_method]
        text += 'Pairs' + (' ' * (maxlevel - 5 + 1)) + 'p-values\n'
        text += '\n'.join(('%s  %6.4g' % (pairs, pv) for (pairs, pv) in
                zip(self.all_pairs_names, self.pval_corrected())))
        return text
Example #28
    def setup_class(cls):
        data = sm.datasets.macrodata.load_pandas()
        cls.macro_df = data.data[['year', 'quarter', 'realgdp', 'cpi']]
        cols = list(cls.macro_df.columns)
        cls.realgdp_loc = cols.index('realgdp')
        cls.cpi_loc = cols.index('cpi')
        cls.random_data = np.random.randn(100)
        year = cls.macro_df['year'].values
        quarter = cls.macro_df['quarter'].values

        index = [
            str(int(yr)) + '-Q' + str(int(qu))
            for yr, qu in zip(cls.macro_df.year, cls.macro_df.quarter)
        ]
        cls.macro_df.index = index
        cls.series = cls.macro_df.cpi
Example #29
    def ftest_summary(self):
        '''run all ftests on the joint model

        Returns
        -------
        fres : str
           a string that lists the results of all individual f-tests
        summarytable : list of tuples
           contains (pair, (fvalue, pvalue,df_denom, df_num)) for each f-test

        Note
        ----
        These are the raw results, not formatted for nice printing.

        '''
        if not hasattr(self, 'lsjoint'):
            self.fitjoint()
        txt = []
        summarytable = []

        txt.append('F-test for equality of coefficients across groups')
        fres = self.lsjoint.f_test(self.contrasts['all'])
        txt.append(fres.__str__())
        summarytable.append(
            ('all', (fres.fvalue, fres.pvalue, fres.df_denom, fres.df_num)))

        #        for group in self.unique[1:]:  #replace with group1, group2 in sorted(keys)
        #            txt.append('F-test for equality of coefficients between group'
        #                       ' %s and group %s' % (group, '0'))
        #            fres = self.lsjoint.f_test(self.contrasts[group])
        #            txt.append(fres.__str__())
        #            summarytable.append((group,(fres.fvalue, fres.pvalue, fres.df_denom, fres.df_num)))
        pairs = np.triu_indices(len(self.unique), 1)
        for ind1, ind2 in zip(
                *pairs):  #replace with group1, group2 in sorted(keys)
            g1 = self.unique[ind1]
            g2 = self.unique[ind2]
            txt.append('F-test for equality of coefficients between group'
                       ' %s and group %s' % (g1, g2))
            group = (g1, g2)
            fres = self.lsjoint.f_test(self.contrasts[group])
            txt.append(fres.__str__())
            summarytable.append((group, (fres.fvalue, fres.pvalue,
                                         fres.df_denom, fres.df_num)))

        self.summarytable = summarytable
        return '\n'.join(txt), summarytable
Example #30
def _key_splitting(rect_dict, keys, values, key_subset, horizontal, gap):
    """
    Given a dictionary where each entry is a rectangle, and a list of keys and
    values (counts of elements in each category), split each rectangle
    accordingly, as long as its key starts with the tuple key_subset.  The
    other keys are returned without modification.
    """
    result = OrderedDict()
    L = len(key_subset)
    for name, (x, y, w, h) in iteritems(rect_dict):
        if key_subset == name[:L]:
            # split based on the values given
            divisions = _split_rect(x, y, w, h, values, horizontal, gap)
            for key, rect in zip(keys, divisions):
                result[name + (key, )] = rect
        else:
            result[name] = (x, y, w, h)
    return result
Example #31
def density_orthopoly(x, polybase, order=5, xeval=None):
    #polybase = legendre  #chebyt #hermitenorm#
    #polybase = chebyt
    #polybase = FPoly
    #polybase = ChtPoly
    #polybase = hermite
    #polybase = HPoly

    if xeval is None:
        xeval = np.linspace(x.min(), x.max(), 50)

    #polys = [legendre(i) for i in range(order)]
    polys = [polybase(i) for i in range(order)]
    #coeffs = [(p(x)*(1-x**2)**(-1/2.)).mean() for p in polys]
    #coeffs = [(p(x)*np.exp(-x*x)).mean() for p in polys]
    coeffs = [(p(x)).mean() for p in polys]
    res = sum(c * p(xeval) for c, p in zip(coeffs, polys))
    #res *= (1-xeval**2)**(-1/2.)
    #res *= np.exp(-xeval**2./2)
    return res, xeval, coeffs, polys
Example #32
 def _get_colwidths(self, output_format, **fmt_dict):
     """Return list, the calculated widths of each column."""
     output_format = get_output_format(output_format)
     fmt = self.output_formats[output_format].copy()
     fmt.update(fmt_dict)
     ncols = max(len(row) for row in self)
     request = fmt.get('colwidths')
     if request == 0:  # assume no extra space desired (e.g, CSV)
         return [0] * ncols
     elif request is None:  # assume no extra space desired (e.g, CSV)
         request = [0] * ncols
     elif isinstance(request, (int, long)):
         request = [request] * ncols
     elif len(request) < ncols:
         request = [request[i % len(request)] for i in range(ncols)]
     min_widths = []
     for col in zip(*self):
         maxwidth = max(len(c.format(0, output_format, **fmt)) for c in col)
         min_widths.append(maxwidth)
     result = lmap(max, min_widths, request)
     return result
Example #33
def _split_rect(x, y, width, height, proportion, horizontal=True, gap=0.05):
    """
    Split the given rectangle into n segments whose proportions are specified
    along the given axis.  If a gap is inserted, the segments will be
    separated by a certain amount of space, retaining the relative proportions
    between them; a gap of 1 corresponds to a plot that is half void, with the
    remaining half of the space proportionally divided among the pieces.
    """
    x, y, w, h = float(x), float(y), float(width), float(height)
    if (w < 0) or (h < 0):
        raise ValueError("dimension of the square less than"
                         "zero w={} h=()".format(w, h))
    proportions = _normalize_split(proportion)

    # extract the starting point and the dimension of each subdivision
    # in respect to the unit square
    starting = proportions[:-1]
    amplitude = proportions[1:] - starting

    # how much each extrema is going to be displaced due to gaps
    starting += gap * np.arange(len(proportions) - 1)

    # how much the squares plus the gaps are extended
    extension = starting[-1] + amplitude[-1] - starting[0]

    # normalize everything for fit again in the original dimension
    starting /= extension
    amplitude /= extension

    # bring everything to the original square
    starting = (x if horizontal else y) + starting * (w if horizontal else h)
    amplitude = amplitude * (w if horizontal else h)

    # create each 4-tuple for each new block
    results = [(s, y, a, h) if horizontal else (x, s, w, a)
               for s, a in zip(starting, amplitude)]
    return results
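A standalone sketch of the gap logic described above, applied to one axis only (the counts below are made up; _normalize_split and the 2-D bookkeeping are left out):

import numpy as np

def split_interval(start, length, counts, gap=0.05):
    # cumulative fractions of the unit interval for each piece
    fractions = np.cumsum([0.0] + list(counts)) / float(sum(counts))
    starting = fractions[:-1]
    amplitude = fractions[1:] - starting
    # displace each piece by the gaps, then renormalize to fit again
    starting = starting + gap * np.arange(len(counts))
    extension = starting[-1] + amplitude[-1] - starting[0]
    starting, amplitude = starting / extension, amplitude / extension
    # map back to the original interval
    return [(start + s * length, a * length)
            for s, a in zip(starting, amplitude)]

print(split_interval(0.0, 1.0, [1, 2, 1], gap=0.05))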
Example #34
    def smoothconf(self, xs, ys, x, alpha=0.05):
        """Returns the kernel smoothing estimate with confidence 1sigma bounds
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            #fittedvals = self.smooth(xs, ys, x) # x or xs in Haerdle
            sqresid = square(
                subtract(ys, fittedvals)
            )
            w = np.sum(self((xs-x)/self.h))
            #var = sqresid.sum() / (len(sqresid) - 0)  # nonlocal var ? JP just trying
            v = np.sum([rr*self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
            var = v / w
            sd = np.sqrt(var)
            K = self.L2Norm
            yhat = self.smooth(xs, ys, x)
            from scipy import stats
            crit = stats.norm.isf(alpha / 2)
            err = crit * sd * np.sqrt(K) / np.sqrt(w * self.h * self.norm_const)
            return (yhat - err, yhat, yhat + err)
        else:
            return (np.nan, np.nan, np.nan)
Example #35
def arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic', trend='c',
                         model_kw={}, fit_kw={}):
    """
    Returns information criteria for many ARMA models

    Parameters
    ----------
    y : array-like
        Time-series data
    max_ar : int
        Maximum number of AR lags to use. Default 4.
    max_ma : int
        Maximum number of MA lags to use. Default 2.
    ic : str, list
        Information criteria to report. Either a single string or a list
        of different criteria is possible.
    trend : str
        The trend to use when fitting the ARMA models.
    model_kw : dict
        Keyword arguments to be passed to the ``ARMA`` model
    fit_kw : dict
        Keyword arguments to be passed to ``ARMA.fit``.

    Returns
    -------
    obj : Results object
        Each ic is an attribute with a DataFrame for the results. The AR order
        used is the row index. The ma order used is the column index. The
        minimum orders are available as ``ic_min_order``.

    Examples
    --------

    >>> from statsmodels.tsa.arima_process import arma_generate_sample
    >>> import statsmodels.api as sm
    >>> import numpy as np

    >>> arparams = np.array([.75, -.25])
    >>> maparams = np.array([.65, .35])
    >>> arparams = np.r_[1, -arparams]
    >>> maparams = np.r_[1, maparams]
    >>> nobs = 250
    >>> np.random.seed(2014)
    >>> y = arma_generate_sample(arparams, maparams, nobs)
    >>> res = sm.tsa.arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc')
    >>> res.aic_min_order
    >>> res.bic_min_order

    Notes
    -----
    This method can be used to tentatively identify the order of an ARMA
    process, provided that the time series is stationary and invertible. This
    function computes the full exact MLE estimate of each model and can
    therefore be a little slow. An implementation using approximate estimates
    will be provided in the future. In the meantime, consider passing
    {method : 'css'} to fit_kw.
    """
    from pandas import DataFrame

    ar_range = lrange(0, max_ar + 1)
    ma_range = lrange(0, max_ma + 1)
    if isinstance(ic, string_types):
        ic = [ic]
    elif not isinstance(ic, (list, tuple)):
        raise ValueError("Need a list or a tuple for ic if not a string.")

    results = np.zeros((len(ic), max_ar + 1, max_ma + 1))

    for ar in ar_range:
        for ma in ma_range:
            if ar == 0 and ma == 0 and trend == 'nc':
                results[:, ar, ma] = np.nan
                continue

            mod = _safe_arma_fit(y, (ar, ma), model_kw, trend, fit_kw)
            if mod is None:
                results[:, ar, ma] = np.nan
                continue

            for i, criteria in enumerate(ic):
                results[i, ar, ma] = getattr(mod, criteria)

    dfs = [DataFrame(res, columns=ma_range, index=ar_range) for res in results]

    res = dict(zip(ic, dfs))

    # add the minimums to the results dict
    min_res = {}
    for i, result in iteritems(res):
        mins = np.where(result.min().min() == result)
        min_res.update({i + '_min_order' : (mins[0][0], mins[1][0])})
    res.update(min_res)

    return Bunch(**res)
Example #36
import numpy as np

_quarter_to_day = {
    "1": (3, 31),
    "2": (6, 30),
    "3": (9, 30),
    "4": (12, 31),
    "I": (3, 31),
    "II": (6, 30),
    "III": (9, 30),
    "IV": (12, 31)
}

_mdays = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
_months_with_days = lzip(lrange(1, 13), _mdays)
_month_to_day = dict(zip(map(str, lrange(1, 13)), _months_with_days))
_month_to_day.update(
    dict(
        zip([
            "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI",
            "XII"
        ], _months_with_days)))

# regex patterns
_y_pattern = r'^\d?\d?\d?\d$'

_q_pattern = r'''
^               # beginning of string
\d?\d?\d?\d     # match any number 1-9999, includes leading zeros

(:?q)           # use q or a : as a separator
Example #37
    endog = 5  # dummy place holder

    ##############  Example similar to Greene

    #get pickled data
    #endog3, xifloat3 = cPickle.load(open('xifloat2.pickle','rb'))

    tree0 = ('top', [('Fly', ['Air']), ('Ground', ['Train', 'Car', 'Bus'])])
    ''' this is with real data from Greene's clogit example
    datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'],
                        [xifloat[i]for i in range(4)]))
    '''

    # for testing only (mock that returns its own name)
    datadict = dict(
        zip(['Air', 'Train', 'Bus', 'Car'],
            ['Airdata', 'Traindata', 'Busdata', 'Cardata']))

    if testxb:
        datadict = dict(zip(['Air', 'Train', 'Bus', 'Car'], np.arange(4)))

    datadict.update({'top': [], 'Fly': [], 'Ground': []})

    paramsind = {
        'top': [],
        'Fly': [],
        'Ground': [],
        'Air': ['GC', 'Ttme', 'ConstA', 'Hinc'],
        'Train': ['GC', 'Ttme', 'ConstT'],
        'Bus': ['GC', 'Ttme', 'ConstB'],
        'Car': ['GC', 'Ttme']
    }
Example #38
    plt.figure()
    plt.plot(exog)
    #plt.plot(p, '.', lw=2)
    plt.plot(y_true, lw=2)

    y_pred = m.results.mu  # + m.results.alpha #m.results.predict(d)
    plt.figure()
    plt.subplot(2, 2, 1)
    plt.plot(p, '.')
    plt.plot(yp, 'b-', label='true')
    plt.plot(y_pred, 'r-', label='GAM')
    plt.legend(loc='upper left')
    plt.title('gam.GAM Poisson')

    counter = 2
    for ii, xx in zip(['z', 'x1', 'x2'], [z, x[:, 0], x[:, 1]]):
        sortidx = np.argsort(xx)
        #plt.figure()
        plt.subplot(2, 2, counter)
        plt.plot(xx[sortidx], p[sortidx], 'k.', alpha=0.5)
        plt.plot(xx[sortidx], yp[sortidx], 'b.', label='true')
        plt.plot(xx[sortidx], y_pred[sortidx], 'r.', label='GAM')
        plt.legend(loc='upper left')
        plt.title('gam.GAM Poisson ' + ii)
        counter += 1

    res = GLM(p, exog_reduced, family=f).fit()

    #plot component, compared to true component
    x1 = x[:, 0]
    x2 = x[:, 1]
Example #39
def summary_col(results,
                float_format='%.4f',
                model_names=[],
                stars=False,
                info_dict=None,
                regressor_order=[]):
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : string
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list of str
        Must have length len(results). If the names are not unique, a roman
        numeral will be appended to all model names.
    stars : bool
        print significance stars
    info_dict : dict
        dict of lambda functions to be applied to results instances to retrieve
        model info. To use specific information for different models, add a
        (nested) info_dict with model name as the key.
        Example: `info_dict = {"N":..., "R2": ..., "OLS":{"R2":...}}` would
        only show `R2` for OLS regression models, but additionally `N` for
        all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list of strings
        list of names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
    """

    if not isinstance(results, list):
        results = [results]

    cols = [
        _col_params(x, stars=stars, float_format=float_format) for x in results
    ]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i in range(len(cols)):
        cols[i].columns = [colnames[i]]

    merg = lambda x, y: x.merge(
        y, how='outer', right_index=True, left_index=True)
    summ = reduce(merg, cols)

    if regressor_order:
        varnames = summ.index.get_level_values(0).tolist()
        ordered = [x for x in regressor_order if x in varnames]
        unordered = [x for x in varnames if x not in regressor_order + ['']]
        order = ordered + list(np.unique(unordered))

        f = lambda idx: sum([[x + 'coef', x + 'stde'] for x in idx], [])
        summ.index = f(np.unique(varnames))
        summ = summ.reindex(f(order))
        summ.index = [x[:-4] for x in summ.index]

    idx = pd.Series(lrange(summ.shape[0])) % 2 == 1
    summ.index = np.where(idx, '', summ.index.get_level_values(0))

    # add infos about the models.
    if info_dict:
        cols = [
            _col_info(x, info_dict.get(x.model.__class__.__name__, info_dict))
            for x in results
        ]
    else:
        cols = [
            _col_info(x, getattr(x, "default_model_infos", None))
            for x in results
        ]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]
    merg = lambda x, y: x.merge(
        y, how='outer', right_index=True, left_index=True)
    info = reduce(merg, cols)
    dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
    dat.columns = summ.columns
    dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
    summ = dat

    summ = summ.fillna('')

    smry = Summary()
    smry.add_df(summ, header=True, align='l')
    smry.add_text('Standard errors in parentheses.')
    if stars:
        smry.add_text('* p<.1, ** p<.05, ***p<.01')

    return smry
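A hypothetical usage sketch (the data and models are made up; summary_col is assumed to be importable from statsmodels.iolib.summary2):

import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

np.random.seed(0)
X = sm.add_constant(np.random.randn(50, 2))
y = X @ np.array([1.0, 0.5, -0.3]) + np.random.randn(50)

res1 = sm.OLS(y, X[:, :2]).fit()   # constant + first regressor only
res2 = sm.OLS(y, X).fit()          # full model
print(summary_col([res1, res2], stars=True,
                  info_dict={'N': lambda r: '%d' % int(r.nobs)}))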
Example #40
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.
        """
        none_array_names = []

        # patsy's already dropped NaNs in y/X
        missing_idx = kwargs.pop('missing_idx', None)

        if missing_idx is not None:
            # y, X already handled by patsy. add back in later.
            combined = ()
            combined_names = []
            if exog is None:
                none_array_names += ['exog']
        elif exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog, )
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        if len(kwargs):
            for key, value_array in iteritems(kwargs):
                if value_array is None or value_array.ndim == 0:
                    none_array_names += [key]
                    continue
                # grab 1d arrays
                if value_array.ndim == 1:
                    combined += (np.asarray(value_array), )
                    combined_names += [key]
                elif value_array.squeeze().ndim == 1:
                    combined += (np.asarray(value_array), )
                    combined_names += [key]

                # grab 2d arrays that are _assumed_ to be symmetric
                elif value_array.ndim == 2:
                    combined_2d += (np.asarray(value_array), )
                    combined_2d_names += [key]
                else:
                    raise ValueError("Arrays with more than 2 dimensions "
                                     "aren't yet handled")

        if missing_idx is not None:
            nan_mask = missing_idx
            updated_row_mask = None
            if combined:  # there were extra arrays not handled by patsy
                combined_nans = _nan_rows(*combined)
                if combined_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra arrays given to model.")
                # for going back and updating endog/exog
                updated_row_mask = combined_nans[~nan_mask]
                nan_mask |= combined_nans  # for updating extra arrays only
            if combined_2d:
                combined_2d_nans = _nan_rows(combined_2d)
                if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra 2d arrays given to model.")
                if updated_row_mask is not None:
                    updated_row_mask |= combined_2d_nans[~nan_mask]
                else:
                    updated_row_mask = combined_2d_nans[~nan_mask]
                nan_mask |= combined_2d_nans

        else:
            nan_mask = _nan_rows(*combined)
            if combined_2d:
                nan_mask = _nan_rows(*(nan_mask[:, None], ) + combined_2d)

        if not np.any(nan_mask):  # no missing don't do anything
            combined = dict(zip(combined_names, combined))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names, combined_2d)))
            if none_array_names:
                combined.update(
                    dict(zip(none_array_names,
                             [None] * len(none_array_names))))

            if missing_idx is not None:
                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            return combined, []

        elif missing == 'raise':
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            nan_mask = ~nan_mask
            drop_nans = lambda x: cls._drop_nans(x, nan_mask)
            drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
            combined = dict(zip(combined_names, lmap(drop_nans, combined)))

            if missing_idx is not None:
                if updated_row_mask is not None:
                    updated_row_mask = ~updated_row_mask
                    # update endog/exog with this new information
                    endog = cls._drop_nans(endog, updated_row_mask)
                    if exog is not None:
                        exog = cls._drop_nans(exog, updated_row_mask)

                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            if combined_2d:
                combined.update(
                    dict(
                        zip(combined_2d_names, lmap(drop_nans_2d,
                                                    combined_2d))))
            if none_array_names:
                combined.update(
                    dict(zip(none_array_names,
                             [None] * len(none_array_names))))

            return combined, np.where(~nan_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)
Example #41
>>> contrast.Contrast(formula.Term('(ff==a)'), fac).matrix
Traceback (most recent call last):
  File "c:\...\scikits\statsmodels\sandbox\contrast_old.py", line 112, in _get_matrix
    self.compute_matrix()
  File "c:\...\scikits\statsmodels\sandbox\contrast_old.py", line 91, in compute_matrix
    T = np.transpose(np.array(t(*args, **kw)))
  File "c:\...\scikits\statsmodels\sandbox\formula.py", line 150, in __call__
    If the term has no 'func' attribute, it returns
KeyError: '(ff==a)'
'''

#convert factor to formula

f7 = formula.Formula(fac)
# explicit updating of namespace with
f7.namespace.update(dict(zip(fac.names(), fac())))

# contrast matrix with 2 of 3 terms
contrast.Contrast(formula.Term('(ff==b)') + formula.Term('(ff==a)'), f7).matrix
#array([[ 1.,  0.,  0.],
#       [ 0.,  1.,  0.]])

# contrast matrix for all terms
contrast.Contrast(f7, f7).matrix
#array([[ 1.,  0.,  0.],
#       [ 0.,  1.,  0.],
#       [ 0.,  0.,  1.]])

# contrast matrix for difference groups 1,2 versus group 0
contrast.Contrast(formula.Term('(ff==b)') + formula.Term('(ff==c)'),
                  f7).matrix - contrast.Contrast(formula.Term('(ff==a)'),
Example #42
sige = 0.1
nobs, k_vars = 500, 3
x = np.random.uniform(-1, 1, size=nobs)
x.sort()
exog = np.vander(x, k_vars + 1)[:, ::-1]
mix = 0.1 * stats.norm.pdf(
    x[:, None], loc=np.linspace(-0.5, 0.75, 4), scale=0.01).sum(1)
y = exog.sum(1) + mix + sige * (np.random.randn(nobs) / 2 + 1)**3

p = 0.5
res_qr = QuantReg(y, exog).fit(p)
res_qr2 = QuantReg(y, exog).fit(0.1)
res_qr3 = QuantReg(y, exog).fit(0.75)
res_ols = sm.OLS(y, exog).fit()

params = [res_ols.params, res_qr2.params, res_qr.params, res_qr3.params]
labels = ['ols', 'qr 0.1', 'qr 0.5', 'qr 0.75']

plt.figure()
plt.plot(x, y, '.', alpha=0.5)
for lab, beta in zip(['ols', 'qr 0.1', 'qr 0.5', 'qr 0.75'], params):
    print('%-8s' % lab, np.round(beta, 4))
    fitted = np.dot(exog, beta)
    lw = 2
    plt.plot(x, fitted, lw=lw, label=lab)
plt.legend()
plt.title('Quantile Regression')

plt.show()
Example #43
def hdrboxplot(data,
               ncomp=2,
               alpha=None,
               threshold=0.95,
               bw=None,
               xdata=None,
               labels=None,
               ax=None,
               use_brute=False,
               seed=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, use as many as the smaller of
        the number of rows or columns in data.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None
    threshold : float between 0 and 1, optional
        Percentile threshold value for outlier detection. A high value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw: array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    use_brute : bool
        Use the brute force optimizer instead of the default differential
        evolution to find the curves. Default is False.
    seed : {None, int, np.random.RandomState}
        Seed value to pass to scipy.optimize.differential_evolution. Can be an
        integer or RandomState instance. If None, then the default RandomState
        provided by np.random is used.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf]
            curves.
         - 'extra_quantiles', list of array. Extra quantile band.
            [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scenes, the dataset is represented as a matrix, each row
    corresponding to a 1D curve. This matrix is then decomposed using
    Principal Components Analysis (PCA), which makes it possible to represent
    the data using a finite number of modes, or components. This compression
    turns the functional representation into a scalar representation of the
    matrix; in other words, each curve can be visualized from its components
    and is thus a point in this reduced space. With 2 components, this is
    called a bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space the curves are similar. Finding the
    median curve then means finding the high density region (HDR) in the
    reduced space. Moreover, the farther a curve is from this HDR, the less
    likely it is to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the probability density linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r,
                                  bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level
    if alpha is None:
        alpha = [threshold, 0.9, 0.5]
    else:
        alpha.extend([threshold, 0.9, 0.5])
        alpha = list(set(alpha))
    alpha.sort(reverse=True)

    n_quantiles = len(alpha)
    pdf_r = ks_gaussian.pdf(data_r).flatten()
    pvalues = [
        np.percentile(pdf_r, (1 - alpha[i]) * 100, interpolation='linear')
        for i in range(n_quantiles)
    ]

    # Find mean, outliers curves
    if have_de_optim and not use_brute:
        median = differential_evolution(lambda x: -ks_gaussian.pdf(x),
                                        bounds=bounds,
                                        maxiter=5,
                                        seed=seed).x
    else:
        median = brute(lambda x: -ks_gaussian.pdf(x),
                       ranges=bounds,
                       finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band, use_brute=use_brute, seed=seed):
        """
        Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``
        use_brute : bool
            Use the brute force optimizer instead of the default differential
            evolution to find the curves. Default is False.
        seed : {None, int, np.random.RandomState}
            Seed value to pass to scipy.optimize.differential_evolution. Can
            be an integer or RandomState instance. If None, then the default
            RandomState provided by np.random is used.


        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)

        """
        min_pdf = pvalues[alpha.index(band[0])]
        try:
            max_pdf = pvalues[alpha.index(band[1])]
        except IndexError:
            max_pdf = 1E6
        band = [min_pdf, max_pdf]

        # Search the min and max curves of the band for each feature
        # dimension in parallel, one worker task per dimension.
        pool = Pool()
        data = zip(
            range(dim),
            itertools.repeat(
                (band, pca, bounds, ks_gaussian, seed, use_brute)))
        band_quantiles = pool.map(_min_max_band, data)
        pool.terminate()
        pool.close()

        band_quantiles = list(zip(*band_quantiles))

        return band_quantiles

    extra_alpha = [i for i in alpha
                   if i not in (0.5, 0.9, threshold)]
    extra_quantiles = [y for x in extra_alpha
                       for y in _band_quantiles([x], use_brute=use_brute,
                                                 seed=seed)]

    # Inverse transform from the n-variate space back to the dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5], use_brute=use_brute, seed=seed)
    hdr_50 = _band_quantiles([0.5], use_brute=use_brute, seed=seed)

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T,
            data.T,
            c='c',
            alpha=.1,
            label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(
        ax.fill_between(xdata,
                        *hdr_50,
                        color='gray',
                        alpha=.4,
                        label='50% HDR'))
    fill_betweens.append(
        ax.fill_between(xdata,
                        *hdr_90,
                        color='gray',
                        alpha=.3,
                        label='90% HDR'))

    if len(extra_quantiles) != 0:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y',
                ls='-.',
                alpha=.4,
                label='Extra quantiles')

    if len(outliers) != 0:
        for ii, outlier in enumerate(outliers):
            if labels_outlier is None:
                label = 'Outliers'
            else:
                label = str(labels_outlier[ii])
            ax.plot(xdata, outlier, ls='--', alpha=0.7, label=label)

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See http://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1, fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
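
# The outlier detection above reduces to comparing each curve's KDE density
# with a percentile threshold. A minimal, self-contained sketch of that idea
# on hypothetical 2-D scores (using scipy's gaussian_kde instead of
# KDEMultivariate):
import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
scores = rng.normal(size=(100, 2))             # stand-in for the PCA factors
kde = gaussian_kde(scores.T)                   # gaussian_kde expects (dim, n_obs)
density = kde(scores.T)                        # density at each observation

level = 0.95
cutoff = np.percentile(density, (1 - level) * 100)
outlier_idx = np.where(density < cutoff)[0]    # curves outside the 95% HDR
print(outlier_idx)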
Пример #44
0
        ax.axhline(0, color='k')
        ax.set_ylim([-1, 1])

        # hack?
        ax.set_xlim([-1, xs[-1] + 1])

    mpl.rcParams['font.size'] = old_size

#Example TSA descriptive

data = sm.datasets.macrodata.load()
mdata = data.data
df = DataFrame.from_records(mdata)
quarter_end = frequencies.BQuarterEnd()
df.index = [quarter_end.rollforward(datetime(int(y), int(q) * 3, 1))
            for y, q in zip(df.pop('year'), df.pop('quarter'))]
logged = np.log(df.loc[:, ['m1', 'realgdp', 'cpi']])
logged.plot(subplots=True)

log_difference = logged.diff().dropna()
plot_acf_multiple(log_difference.values)

#Example TSA VAR

model = tsa.VAR(log_difference, freq='D')
print(model.select_order())

res = model.fit(2)
print(res.summary())
print(res.is_stable())
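
# The fitted VAR results can also be used for forecasting and impulse
# responses; a short sketch reusing `res` from model.fit(2) above:
lag_order = res.k_ar
forecast = res.forecast(log_difference.values[-lag_order:], steps=4)
print(forecast)                 # 4 steps ahead for m1, realgdp, cpi

irf = res.irf(10)               # impulse responses over 10 periods
irf.plot(orth=False)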
        print('final branch with', tree, ''.join(tree), leavessum)  #sum(tree)
        if testxb:
            return leavessum  #sum(xb[tree])
        else:
            return ''.join(tree)  #sum(tree)

    print('working on branch', tree, branchsum)
    return branchsum


tree = [[0, 1], [[2, 3], [4, 5, 6]], [7]]
tree2 = ('top', [('B1', ['a', 'b']),
                 ('B2', [('B21', ['c', 'd']), ('B22', ['e', 'f', 'g'])]),
                 ('B3', ['h'])])

data2 = dict(zip('abcdefgh', lrange(8)))
#data2.update({'top':1000, 'B1':100, 'B2':200, 'B21':300,'B22':400, 'B3':400})
data2.update({
    'top': 1000,
    'B1': 100,
    'B2': 200,
    'B21': 21,
    'B22': 22,
    'B3': 300
})

#data2
#{'a': 0, 'c': 2, 'b': 1, 'e': 4, 'd': 3, 'g': 6, 'f': 5, 'h': 7,
#'top': 1000, 'B22': 22, 'B21': 21, 'B1': 100, 'B2': 200, 'B3': 300}

print('\n tree with dictionary data')
Пример #46
0
def grangercausalitytests(x, mxlg, autolag=None, alpha=0.0001, max_iter=1e5, addconst=True, verbose=True):
    """four tests for granger non causality of 2 timeseries

    all four tests give similar results
    `params_ftest` and `ssr_ftest` are equivalent based on F test which is
    identical to lmtest:grangertest in R

    Parameters
    ----------
    x : array_like, 2d
        data for testing whether the time series in the second column Granger
        causes the time series in the first column
    mxlg : int
        maximum number of lags used to build the lag matrix
    autolag : str, optional
        currently unused in this modified version
    alpha : float
        regularization strength passed to the Lasso fit
    max_iter : int
        maximum number of iterations for the Lasso solver
    addconst : bool
        include a constant in the lag matrix (False is not implemented)
    verbose : bool
        print progress information if true

    Returns
    -------
    rmse : float
        mean of the windowed sums of absolute prediction errors
    non_zero_vars : dict
        for each variable block, the lag position of its largest non-zero
        Lasso coefficient
    best_vars : dict
        for each variable block, the value of that largest coefficient

    Notes
    -----
    TODO: convert to class and attach results properly

    The Null hypothesis for grangercausalitytests is that the time series in
    the second column, x2, does NOT Granger cause the time series in the first
    column, x1. Granger causality means that past values of x2 have a
    statistically significant effect on the current value of x1, taking past
    values of x1 into account as regressors. We reject the null hypothesis
    that x2 does not Granger cause x1 if the pvalues are below a desired size
    of the test.

    The null hypothesis for all four tests is that the coefficients
    corresponding to past values of the second time series are zero.

    'params_ftest', 'ssr_ftest' are based on F distribution

    'ssr_chi2test', 'lrtest' are based on chi-square distribution

    References
    ----------
    http://en.wikipedia.org/wiki/Granger_causality
    Greene: Econometric Analysis

    """
    from scipy import stats
    from sklearn.linear_model import Lasso
    
    x = np.asarray(x)

    if x.shape[0] <= 3 * mxlg + int(addconst):
        raise ValueError("Insufficient observations. Maximum allowable "
                         "lag is {0}".format(int((x.shape[0] - int(addconst)) /
                                                 3) - 1))
    result = {}
    if verbose:
        print('\nGranger Causality')
        print('number of lags (no zero)', mxlg)

    # create lagmat of both time series
    dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

    #add constant
    if addconst:
        dtajoint = add_constant(dta[:, 1:], prepend=False)
    else:
        raise NotImplementedError('Not Implemented')

    # Run Lasso on all lagged variables.
    # Note: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # with a recent scikit-learn, standardize the design matrix beforehand
    # (e.g. with StandardScaler) instead of passing normalize=True.
    lassoreg = Lasso(alpha=alpha, normalize=True, max_iter=int(max_iter))
    lassoreg.fit(dtajoint[:, 1:], dta[:, 0])
    pred = lassoreg.predict(dtajoint[:, 1:])
    actual = dta[:, 0]
    errors = [abs(i - j) for i, j in zip(actual, pred)]
    # Sum absolute errors over sliding windows (length 30, step 10) and
    # average the window sums; despite the name, this is not a true RMSE.
    step_size = 10
    window_size = 30
    avg_errors = []
    for i in range(len(actual) // step_size):
        err = 0
        for j in range(i * step_size, i * step_size + window_size):
            if j >= len(actual):
                break
            err += errors[j]
        avg_errors += [err]
    rmse = np.mean(avg_errors)
    result = lassoreg.coef_
    non_zeros = [(i,x) for i, x in enumerate(result) if x != 0]
    non_zero_vars = {}
    best_vars = {}
    for (i, x) in non_zeros:
        k = (i + 1) // mxlg      # variable block this coefficient belongs to
        if k not in non_zero_vars or abs(x) > abs(best_vars[k]):
            non_zero_vars[k] = (i + 1) % mxlg    # lag position within the block
            best_vars[k] = x
    return (rmse , non_zero_vars, best_vars)
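
# For comparison, a small sketch of the upstream statsmodels test (on which
# this Lasso variant is based), using synthetic data where the second column
# Granger-causes the first; the import is aliased to avoid shadowing the
# modified function above.
import numpy as np
from statsmodels.tsa.stattools import grangercausalitytests as sm_granger

rng = np.random.default_rng(12345)
x2 = rng.normal(size=201)
x1 = 0.5 * x2[:-1] + rng.normal(scale=0.1, size=200)   # x1 driven by lagged x2
sim = np.column_stack([x1, x2[1:]])

gc_res = sm_granger(sim, maxlag=2)           # prints a report for each lag
print(gc_res[1][0]['ssr_ftest'])             # (F statistic, p-value, df_denom, df_num)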
Пример #47
0
def contrast_product(names1, names2, intgroup1=None, intgroup2=None, pairs=False):
    '''build contrast matrices for products of two categorical variables

    this is an experimental script and should be converted to a class

    Parameters
    ----------
    names1, names2 : lists of strings
        contains the list of level labels for each categorical variable
    intgroup1, intgroup2 : ndarray     TODO: this part is not tested or finished yet
        categorical variable


    Notes
    -----
    This creates a full rank matrix. It does not do all pairwise comparisons,
    parameterization is using contrast_all_one to get differences with first
    level.

    ? does contrast_all_pairs work as a plugin to get all pairs ?

    '''

    n1 = len(names1)
    n2 = len(names2)
    names_prod = ['%s_%s' % (i,j) for i in names1 for j in names2]
    ee1 = np.zeros((1,n1))
    ee1[0,0] = 1
    if not pairs:
        dd = np.r_[ee1, -contrast_all_one(n1)]
    else:
        dd = np.r_[ee1, -contrast_allpairs(n1)]

    contrast_prod = np.kron(dd[1:], np.eye(n2))
    names_contrast_prod0 = contrast_labels(contrast_prod, names_prod, reverse=True)
    names_contrast_prod = [''.join(['%s%s' % (signstr(c, noplus=True), v)
                              for c, v in list(zip(row, names_prod))[::-1] if c != 0])
                                 for row in contrast_prod]

    ee2 = np.zeros((1,n2))
    ee2[0,0] = 1
    #dd2 = np.r_[ee2, -contrast_all_one(n2)]
    if not pairs:
        dd2 = np.r_[ee2, -contrast_all_one(n2)]
    else:
        dd2 = np.r_[ee2, -contrast_allpairs(n2)]

    contrast_prod2 = np.kron(np.eye(n1), dd2[1:])
    names_contrast_prod2 = [''.join(['%s%s' % (signstr(c, noplus=True), v)
                              for c, v in list(zip(row, names_prod))[::-1] if c != 0])
                                 for row in contrast_prod2]

    if intgroup1 is not None and intgroup2 is not None:
        d1, _ = dummy_1d(intgroup1)
        d2, _ = dummy_1d(intgroup2)
        dummy = dummy_product(d1, d2)
    else:
        dummy = None

    return (names_prod, contrast_prod, names_contrast_prod,
                        contrast_prod2, names_contrast_prod2, dummy)
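
# The Kronecker-product construction above can be illustrated on a tiny
# hypothetical case (a sketch of the idea, not a call to contrast_product):
# contrasts of each level against the first level of a 3-level factor,
# expanded over the 2 levels of a second factor.
import numpy as np

n1, n2 = 3, 2
dd = np.column_stack([-np.ones(n1 - 1), np.eye(n1 - 1)])  # rows: level_k - level_0
contrast_prod_demo = np.kron(dd, np.eye(n2))              # shape (4, 6)
print(contrast_prod_demo)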
Пример #48
0
def summary_col(results, float_format='%.4f', model_names=[], stars=True,
                more_info=None, regressor_order=[],show='t',title=None): 
    # I added the parameter 'show', changed the default of 'stars' to True,
    # renamed the dict parameter 'info_dict' to a list parameter 'more_info',
    # and gave regressor_order an initial value of ['const'].
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : string
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list of strings
        must have length len(results); if the names are not unique, a Roman
        numeral is appended to each model name
    stars : bool
        print significance stars
    show : str
        statistic shown in parentheses under the coefficients: 't' for
        t statistics, 'se' for standard errors, 'p' for p-values.
        Default : 't'
    more_info : list, optional
        additional model information to append below the coefficients,
        passed through to `_col_info`
    regressor_order : list of strings
        list of names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
        Default : ['const']
    title : str, optional
        title placed above the summary table
    """
    
    if not isinstance(results, list):
        results = [results]

    cols = [_col_params(x, stars=stars, float_format=float_format,show=show) for x in
            results]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i in range(len(cols)):
        cols[i].columns = [colnames[i]]

    merg = lambda x, y: x.merge(y, how='outer', right_index=True,
                                left_index=True)
    summ = reduce(merg, cols)

    # default the regressor order to ['const'] when none is given
    if not regressor_order:
        regressor_order = ['const']
    
    varnames = summ.index.get_level_values(0).tolist()
    ordered = [x for x in regressor_order if x in varnames]
    unordered = [x for x in varnames if x not in regressor_order + ['']]
    
    # Note: np.unique can disrupt the original order of 'unordered',
    # so pd.Series(...).unique(), which preserves order, is used instead.

    # order = ordered + list(np.unique(unordered))
    order = ordered + list(pd.Series(unordered).unique())

    f = lambda idx: sum([[x + 'coef', x + 'stde'] for x in idx], [])
    # summ.index = f(np.unique(varnames))
    summ.index = f(pd.Series(varnames).unique())
    summ = summ.reindex(f(order))
    summ.index = [x[:-4] for x in summ.index]

    idx = pd.Series(lrange(summ.shape[0])) % 2 == 1
    summ.index = np.where(idx, '', summ.index.get_level_values(0))
    summ = summ.fillna('')
    
    # add infos about the models.
#     if info_dict:
#         cols = [_col_info(x, info_dict.get(x.model.__class__.__name__,
#                                            info_dict)) for x in results]
#     else:
#         cols = [_col_info(x, getattr(x, "default_model_infos", None)) for x in
#                 results]
    cols = [_col_info(x,more_info=more_info) for x in results]
    
    # use unique column names, otherwise the merge will not succeed
    for df , name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]
    merg = lambda x, y: x.merge(y, how='outer', right_index=True,
                                left_index=True)
    info = reduce(merg, cols)
    info.columns = summ.columns
    info = info.fillna('')
#     dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
#     dat.columns = summ.columns
#     dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
#     summ = dat

#     summ = summ.fillna('')

#     smry = Summary()
#     smry.add_df(summ, header=True, align='l')
#     smry.add_text('Standard errors in parentheses.')
#     if stars:
#         smry.add_text('* p<.1, ** p<.05, ***p<.01')
#     return smry

    if show == 't':
        note = ['\t t statistics in parentheses.']
    elif show == 'se':
        note = ['\t Std. error in parentheses.']
    elif show == 'p':
        note = ['\t pvalues in parentheses.']
    else:
        note = []
    if stars:
        note += ['\t * p<.1, ** p<.05, ***p<.01']
    # Two ways of placing the extra text were tried (in the index or in the
    # columns); putting it in the index works better.
#     note_df = pd.DataFrame(note,index=['note']+['']*(len(note)-1),columns=[summ.columns[0]])
    note_df = pd.DataFrame('', index=['note:'] + note, columns=summ.columns)
#     summ_all = pd.concat([summ,info,note_df],axis=0)
    
    # Construct a one-row title DataFrame to place above the summary table.
    if title is not None:
        title = str(title)
    else:
        title = '\t Results Summary'
        
    # An attempt to centre the title over the columns (commented out below)
    # was abandoned because the printed result looked poor.

    # col_len = len(summ.columns)
    # fake_data = ['']*col_len
    # if col_len % 2 == 1:
    #     from math  import ceil
    #     i = ceil(col_len/2)
    # else:
    #     i = int(col_len/2)
    # fake_data[i-1] = title
    # title_df = pd.DataFrame([fake_data],index=[''],columns=summ.columns).fillna('')
    title_df = pd.DataFrame('', index=[title], columns=summ.columns)
    
    smry = Summary()
    smry.add_df(title_df,header=False,align='l')
    smry.add_df(summ, header=True, align='l')
    smry.add_df(info, header=False, align='l')
    smry.add_df(note_df, header=False, align='l')
    return smry
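
# A usage sketch with the upstream statsmodels summary_col (on which the
# modified version above is based), comparing two OLS fits side by side;
# the import is aliased so it does not shadow the function defined above.
import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col as sm_summary_col

rng = np.random.default_rng(0)
exog = sm.add_constant(rng.normal(size=(100, 2)))
y1 = exog @ [1.0, 0.5, -0.3] + rng.normal(size=100)
y2 = exog @ [2.0, 0.0, 0.8] + rng.normal(size=100)

res1 = sm.OLS(y1, exog).fit()
res2 = sm.OLS(y2, exog).fit()
print(sm_summary_col([res1, res2], stars=True, model_names=['m1', 'm2']))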
Пример #49
0
    kde = KDE(x, kern)
    print(kde.density(np.matrix([1, 2])))  #.T
    plt.figure()
    plt.plot(x[:, 0], x[:, 1], 'o')

    n_grid = 50
    xsp = np.linspace(x.min(0)[0], x.max(0)[0], n_grid)
    ysp = np.linspace(x.min(0)[1], x.max(0)[1], n_grid)
    #    xsorted = np.sort(x)
    #    xlow = xsorted[nobs/4]
    #    xupp = xsorted[3*nobs/4]
    #    xsp = np.linspace(xlow[0], xupp[0], n_grid)
    #    ysp = np.linspace(xlow[1], xupp[1], n_grid)
    xr, yr = np.meshgrid(xsp, ysp)
    kde_vals = np.array([
        kde.density(np.matrix([xi, yi]))
        for xi, yi in zip(xr.ravel(), yr.ravel())
    ])
    plt.contour(xsp, ysp, kde_vals.reshape(n_grid, n_grid))

    plt.show()

    # 5 D case
#    random.seed(142)
#    mu = [1.0, 4.0, 3.5, -2.4, 0.0]
#    sigma = np.matrix(
#        [[ 0.6 - 0.1*abs(i-j) if i != j else 1.0 for j in xrange(5)] for i in xrange(5)])
#    x = random.multivariate_normal(mu, sigma, size = 100)
#    kern = kernel.Gaussian()
#    kde = KernelEstimate( x, kern )
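
# For comparison, the same kind of two-dimensional density contour can be
# produced with scipy's gaussian_kde; a small self-contained sketch on
# hypothetical data:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

rng = np.random.default_rng(142)
pts = rng.multivariate_normal([1.0, 4.0], [[1.0, 0.6], [0.6, 1.0]], size=500)

kde2 = gaussian_kde(pts.T)                       # expects shape (dim, n_obs)
gx = np.linspace(pts[:, 0].min(), pts[:, 0].max(), 50)
gy = np.linspace(pts[:, 1].min(), pts[:, 1].max(), 50)
mx, my = np.meshgrid(gx, gy)
vals = kde2(np.vstack([mx.ravel(), my.ravel()])).reshape(mx.shape)

plt.figure()
plt.plot(pts[:, 0], pts[:, 1], 'o', alpha=0.3)
plt.contour(mx, my, vals)
plt.show()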
Пример #50
0
def interaction_plot(x,
                     trace,
                     response,
                     func=np.mean,
                     ax=None,
                     plottype='b',
                     xlabel=None,
                     ylabel=None,
                     colors=None,
                     markers=None,
                     linestyles=None,
                     legendloc='best',
                     legendtitle=None,
                     **kwargs):
    """
    Interaction plot for factor level statistics.

    Note. If categorical factors are supplied, levels will be internally
    recoded to integers. This ensures matplotlib compatibility.

    Uses pandas.DataFrame to calculate an `aggregate` statistic for each
    level of the factor or group given by `trace`.

    Parameters
    ----------
    x : array-like
        The `x` factor levels constitute the x-axis. If a `pandas.Series` is
        given its name will be used in `xlabel` if `xlabel` is None.
    trace : array-like
        The `trace` factor levels will be drawn as lines in the plot.
        If `trace` is a `pandas.Series` its name will be used as the
        `legendtitle` if `legendtitle` is None.
    response : array-like
        The response or dependent variable. If a `pandas.Series` is given
        its name will be used in `ylabel` if `ylabel` is None.
    func : function
        Anything accepted by `pandas.DataFrame.aggregate`. This is applied to
        the response variable grouped by the trace levels.
    plottype : str {'line', 'scatter', 'both'}, optional
        The type of plot to return. Can be 'l', 's', or 'b'
    ax : axes, optional
        Matplotlib axes instance
    xlabel : str, optional
        Label to use for `x`. Default is 'X'. If `x` is a `pandas.Series` it
        will use the series name.
    ylabel : str, optional
        Label to use for `response`. Default is 'func of response'. If
        `response` is a `pandas.Series` it will use the series name.
    colors : list, optional
        If given, must have length == number of levels in trace.
    linestyles : list, optional
        If given, must have length == number of levels in trace.
    markers : list, optional
        If given, must have length == number of levels in trace.
    kwargs
        These will be passed to the plot command used, either `plot` or
        `scatter`. Use them to control the overall plotting options.

    Returns
    -------
    fig : Figure
        The figure given by `ax.figure` or a new instance.

    Examples
    --------
    >>> import numpy as np
    >>> np.random.seed(12345)
    >>> weight = np.random.randint(1,4,size=60)
    >>> duration = np.random.randint(1,3,size=60)
    >>> days = np.log(np.random.randint(1,30, size=60))
    >>> fig = interaction_plot(weight, duration, days,
    ...             colors=['red','blue'], markers=['D','^'], ms=10)
    >>> import matplotlib.pyplot as plt
    >>> plt.show()

    .. plot::

       import numpy as np
       from statsmodels.graphics.factorplots import interaction_plot
       np.random.seed(12345)
       weight = np.random.randint(1,4,size=60)
       duration = np.random.randint(1,3,size=60)
       days = np.log(np.random.randint(1,30, size=60))
       fig = interaction_plot(weight, duration, days,
                   colors=['red','blue'], markers=['D','^'], ms=10)
       import matplotlib.pyplot as plt
       #plt.show()
    """

    from pandas import DataFrame
    fig, ax = utils.create_mpl_ax(ax)

    response_name = ylabel or getattr(response, 'name', 'response')
    ylabel = '%s of %s' % (get_function_name(func), response_name)
    xlabel = xlabel or getattr(x, 'name', 'X')
    legendtitle = legendtitle or getattr(trace, 'name', 'Trace')

    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    x_values = x_levels = None
    if isinstance(x[0], str):
        x_levels = [l for l in np.unique(x)]
        x_values = lrange(len(x_levels))
        x = _recode(x, dict(zip(x_levels, x_values)))

    data = DataFrame(dict(x=x, trace=trace, response=response))
    plot_data = data.groupby(['trace', 'x']).aggregate(func).reset_index()

    # return data
    # check plot args
    n_trace = len(plot_data['trace'].unique())

    linestyles = ['-'] * n_trace if linestyles is None else linestyles
    markers = ['.'] * n_trace if markers is None else markers
    colors = rainbow(n_trace) if colors is None else colors

    if len(linestyles) != n_trace:
        raise ValueError("Must be a linestyle for each trace level")
    if len(markers) != n_trace:
        raise ValueError("Must be a marker for each trace level")
    if len(colors) != n_trace:
        raise ValueError("Must be a color for each trace level")

    if plottype == 'both' or plottype == 'b':
        for i, (values, group) in enumerate(plot_data.groupby(['trace'])):
            # trace label
            label = str(group['trace'].values[0])
            ax.plot(group['x'],
                    group['response'],
                    color=colors[i],
                    marker=markers[i],
                    label=label,
                    linestyle=linestyles[i],
                    **kwargs)
    elif plottype == 'line' or plottype == 'l':
        for i, (values, group) in enumerate(plot_data.groupby(['trace'])):
            # trace label
            label = str(group['trace'].values[0])
            ax.plot(group['x'],
                    group['response'],
                    color=colors[i],
                    label=label,
                    linestyle=linestyles[i],
                    **kwargs)
    elif plottype == 'scatter' or plottype == 's':
        for i, (values, group) in enumerate(plot_data.groupby(['trace'])):
            # trace label
            label = str(group['trace'].values[0])
            ax.scatter(group['x'],
                       group['response'],
                       color=colors[i],
                       label=label,
                       marker=markers[i],
                       **kwargs)

    else:
        raise ValueError("Plot type %s not understood" % plottype)
    ax.legend(loc=legendloc, title=legendtitle)
    ax.margins(.1)

    if all([x_levels, x_values]):
        ax.set_xticks(x_values)
        ax.set_xticklabels(x_levels)
    return fig