Example #1
 def getrlm(self):
     self.k2 = self.results['k2']
     if isinstance(self.results['w'], dict):
         tmp = np.zeros((len(list(iterkeys(self.results['w'])))))
         for i in iterkeys(self.results['w']):
             tmp[int(i)-1] = self.results['w'][i]
         self.weights = tmp
     else: self.weights = self.results['w']
     self.stddev = self.rsum['stddev'] # Don't know what this is yet
     self.wresid = None # these equal resids always?
Example #2
 def getrlm(self):
     self.k2 = self.results['k2']
     if isinstance(self.results['w'], dict):
         tmp = np.zeros((len(list(iterkeys(self.results['w'])))))
         for i in iterkeys(self.results['w']):
             tmp[int(i) - 1] = self.results['w'][i]
         self.weights = tmp
     else:
         self.weights = self.results['w']
     self.stddev = self.rsum['stddev']  # Don't know what this is yet
     self.wresid = None  # these equal resids always?
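
Both getrlm variants above convert an R-style results dict, keyed by 1-based string indices, into a flat numpy array. A minimal self-contained sketch of that pattern with hypothetical values (on Python 3, plain dict iteration replaces iterkeys):

import numpy as np

def dict_to_array(d):
    # one slot per key; each value lands at its 0-based position
    out = np.zeros(len(d))
    for key in d:
        out[int(key) - 1] = d[key]
    return out

print(dict_to_array({'1': 0.5, '2': 1.0, '3': 0.25}))  # [0.5  1.   0.25]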
Example #3
    def test_pickle_wrapper(self):

        fh = BytesIO()  # use pickle with binary content (cPickle is Python 2 only)

        # test unwrapped results load save pickle
        self.results._results.save(fh)
        fh.seek(0, 0)
        res_unpickled = self.results._results.__class__.load(fh)
        assert_(type(res_unpickled) is type(self.results._results))

        # test wrapped results load save
        fh.seek(0, 0)
        self.results.save(fh)
        fh.seek(0, 0)
        res_unpickled = self.results.__class__.load(fh)
        fh.close()
        # print type(res_unpickled)
        assert_(type(res_unpickled) is type(self.results))

        before = sorted(iterkeys(self.results.__dict__))
        after = sorted(iterkeys(res_unpickled.__dict__))
        assert_(before == after, msg='not equal %r and %r' % (before, after))

        before = sorted(iterkeys(self.results._results.__dict__))
        after = sorted(iterkeys(res_unpickled._results.__dict__))
        assert_(before == after, msg='not equal %r and %r' % (before, after))

        before = sorted(iterkeys(self.results.model.__dict__))
        after = sorted(iterkeys(res_unpickled.model.__dict__))
        assert_(before == after, msg='not equal %r and %r' % (before, after))

        before = sorted(iterkeys(self.results._cache))
        after = sorted(iterkeys(res_unpickled._cache))
        assert_(before == after, msg='not equal %r and %r' % (before, after))
Example #4
    def test_pickle_wrapper(self):

        fh = BytesIO()  # use pickle with binary content

        # test unwrapped results load save pickle
        self.results._results.save(fh)
        fh.seek(0, 0)
        res_unpickled = self.results._results.__class__.load(fh)
        assert type(res_unpickled) is type(self.results._results)  # noqa: E721

        # test wrapped results load save
        fh.seek(0, 0)
        self.results.save(fh)
        fh.seek(0, 0)
        res_unpickled = self.results.__class__.load(fh)
        fh.close()
        assert type(res_unpickled) is type(self.results)  # noqa: E721

        before = sorted(iterkeys(self.results.__dict__))
        after = sorted(iterkeys(res_unpickled.__dict__))
        assert_(before == after, msg='not equal %r and %r' % (before, after))

        before = sorted(iterkeys(self.results._results.__dict__))
        after = sorted(iterkeys(res_unpickled._results.__dict__))
        assert_(before == after, msg='not equal %r and %r' % (before, after))

        before = sorted(iterkeys(self.results.model.__dict__))
        after = sorted(iterkeys(res_unpickled.model.__dict__))
        assert_(before == after, msg='not equal %r and %r' % (before, after))

        before = sorted(iterkeys(self.results._cache))
        after = sorted(iterkeys(res_unpickled._cache))
        assert_(before == after, msg='not equal %r and %r' % (before, after))
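
The save/load round trip exercised by this test reduces to ordinary pickling into an in-memory buffer. A minimal sketch of the same pattern with plain pickle, using a hypothetical Dummy stand-in for the results object:

import pickle
from io import BytesIO

class Dummy:
    def __init__(self):
        self.a, self.b = 1, [2, 3]

obj = Dummy()
fh = BytesIO()
pickle.dump(obj, fh)        # results.save(fh) boils down to this
fh.seek(0, 0)
restored = pickle.load(fh)  # ...and load(fh) to this
fh.close()

assert type(restored) is type(obj)
assert sorted(restored.__dict__) == sorted(obj.__dict__)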
Example #5
    def score(self, b, ties='breslow'):

        score = 0
        for t in iterkeys(self.failures):
            fail = self.failures[t]
            d = len(fail)
            risk = self.risk[t]
            Z = self.design[t]

            score += Z[fail].sum()

            if ties == 'breslow':
                w = np.exp(np.dot(Z, b))
                rv = Discrete(Z[risk], w=w[risk])
                score -= rv.mean() * d
            elif ties == 'efron':
                w = np.exp(np.dot(Z, b))
                score += Z[fail].sum()  # note: Z[fail].sum() was already added above; this looks like a double count
                for j in range(d):
                    efron_w = w.copy()  # copy so w is not mutated across iterations
                    efron_w[fail] -= j * w[fail] / float(d)  # was `i`, an undefined name
                    rv = Discrete(Z[risk], w=efron_w[risk])
                    score -= rv.mean()
            elif ties == 'cox':
                raise NotImplementedError('Cox tie breaking method not \
implemented')
            else:
                raise NotImplementedError('tie breaking method not recognized')
        return np.array([score])
Example #6
    def loglike(self, b, ties='breslow'):

        logL = 0
        for t in iterkeys(self.failures):
            fail = self.failures[t]
            d = len(fail)
            risk = self.risk[t]
            Zb = np.dot(self.design[t], b)

            logL += Zb[fail].sum()

            if ties == 'breslow':
                s = np.exp(Zb[risk]).sum()
                logL -= np.log(s) * d  # reuse s rather than recomputing the sum
            elif ties == 'efron':
                s = np.exp(Zb[risk]).sum()
                r = np.exp(Zb[fail]).sum()
                for j in range(d):
                    logL -= np.log(s - j * r / d)
            elif ties == 'cox':
                raise NotImplementedError('Cox tie breaking method not \
implemented')
            else:
                raise NotImplementedError('tie breaking method not recognized')
        return logL
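
For reference, the quantity the 'breslow' branch accumulates is the Breslow approximation to the Cox partial log-likelihood; reading it off the code, with D_t the failure set at time t, d_t = |D_t|, and R_t the risk set:

    \ell(\beta) = \sum_t \Big[ \sum_{i \in D_t} Z_i^T \beta
                  - d_t \log \sum_{j \in R_t} e^{Z_j^T \beta} \Big]

The 'efron' branch replaces the second term with \sum_{j=0}^{d_t-1} \log(s_t - \frac{j}{d_t} r_t), where s_t = \sum_{k \in R_t} e^{Z_k^T \beta} and r_t = \sum_{k \in D_t} e^{Z_k^T \beta}.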
Example #7
    def cache(self):
        if self.time_dependent:
            self.cachedir = tempfile.mkdtemp()

        self.design = {}
        self.risk = {}
        first = True

        for t in iterkeys(self.failures):
            if self.time_dependent:
                d = np.array([s(self.formula, time=t)
                             for s in self.subjects]).astype(float)[:,None]
                dshape = d.shape
                dfile = open(tempfile.mkstemp(dir=self.cachedir)[1], 'wb')  # open() in binary mode for tofile(); was the Python 2 file(..., 'w')
                d.tofile(dfile)
                dfile.close()
                del d
                self.design[t] = np.memmap(dfile.name,
                                           dtype=np.dtype(float),
                                           shape=dshape)
            elif first:
                d = np.array([s(self.formula, time=t)
                             for s in self.subjects]).astype(np.float64)
                self.design[t] = d
                first = False  # compute the design once and reuse it for later times
            else:
                self.design[t] = d
            self.risk[t] = np.compress([s.atrisk(t) for s in self.subjects],
                                       np.arange(self.design[t].shape[0]),
                                       axis=-1)
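
The time-dependent branch writes each design matrix to disk and maps it back with np.memmap. A self-contained sketch of that round trip (unlike the original, it also closes the file descriptor that mkstemp returns):

import os
import tempfile
import numpy as np

d = np.arange(6, dtype=float)[:, None]  # a (6, 1) design column
fd, path = tempfile.mkstemp()
with os.fdopen(fd, 'wb') as fh:         # binary mode: tofile() writes raw bytes
    d.tofile(fh)
m = np.memmap(path, dtype=np.dtype(float), shape=d.shape)
assert (m == d).all()
del m                                   # release the mapping before unlinking
os.remove(path)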
Example #8
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can add a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, and the formula
        object.

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        if data_util._is_using_pandas(Y, X):
            result = dmatrices(formula, (Y, X),
                               depth,
                               return_type='dataframe',
                               NA_action=na_action)
        else:
            result = dmatrices(formula, (Y, X),
                               depth,
                               return_type='dataframe',
                               NA_action=na_action)
    else:
        if data_util._is_using_pandas(Y, None):
            result = dmatrices(formula,
                               Y,
                               depth,
                               return_type='dataframe',
                               NA_action=na_action)
        else:
            result = dmatrices(formula,
                               Y,
                               depth,
                               return_type='dataframe',
                               NA_action=na_action)

    # if missing == 'raise' there is no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    return result, missing_mask
Example #9
    def information(self, b, ties='breslow'):

        info = 0 #np.zeros((len(b),len(b))) #0
        score = 0
        for t in iterkeys(self.failures):
            fail = self.failures[t]
            d = len(fail)
            risk = self.risk[t]
            Z = self.design[t]

            if ties == 'breslow':
                w = np.exp(np.dot(Z, b))
                rv = Discrete(Z[risk], w=w[risk])
                info += rv.cov()
            elif ties == 'efron':
                w = np.exp(np.dot(Z, b))
                score += Z[fail].sum()
                for j in range(d):
                    efron_w = w.copy()  # copy so w is not mutated across iterations
                    efron_w[fail] -= j * w[fail] / d  # was `i`, an undefined name
                    rv = Discrete(Z[risk], w=efron_w[risk])
                    info += rv.cov()
            elif ties == 'cox':
                raise NotImplementedError('Cox tie breaking method not \
implemented')
            else:
                raise NotImplementedError('tie breaking method not recognized')
        return info  # the accumulated information matrix (was: return score)
Example #10
    def score(self, b, ties='breslow'):

        score = 0
        for t in iterkeys(self.failures):
            fail = self.failures[t]
            d = len(fail)
            risk = self.risk[t]
            Z = self.design[t]

            score += Z[fail].sum()

            if ties == 'breslow':
                w = np.exp(np.dot(Z, b))
                rv = Discrete(Z[risk], w=w[risk])
                score -= rv.mean() * d
            elif ties == 'efron':
                w = np.exp(np.dot(Z, b))
                score += Z[fail].sum()  # note: Z[fail].sum() was already added above; this looks like a double count
                for j in range(d):
                    efron_w = w.copy()  # copy so w is not mutated across iterations
                    efron_w[fail] -= j * w[fail] / float(d)  # was `i`, an undefined name
                    rv = Discrete(Z[risk], w=efron_w[risk])
                    score -= rv.mean()
            elif ties == 'cox':
                raise NotImplementedError('Cox tie breaking method not \
implemented')
            else:
                raise NotImplementedError('tie breaking method not recognized')
        return np.array([score])
Example #11
    def loglike(self, b, ties='breslow'):

        logL = 0
        for t in iterkeys(self.failures):
            fail = self.failures[t]
            d = len(fail)
            risk = self.risk[t]
            Zb = np.dot(self.design[t], b)

            logL += Zb[fail].sum()

            if ties == 'breslow':
                s = np.exp(Zb[risk]).sum()
                logL -= np.log(s) * d  # reuse s rather than recomputing the sum
            elif ties == 'efron':
                s = np.exp(Zb[risk]).sum()
                r = np.exp(Zb[fail]).sum()
                for j in range(d):
                    logL -= np.log(s - j * r / d)
            elif ties == 'cox':
                raise NotImplementedError('Cox tie breaking method not \
implemented')
            else:
                raise NotImplementedError('tie breaking method not recognized')
        return logL
Example #12
    def cache(self):
        if self.time_dependent:
            self.cachedir = tempfile.mkdtemp()

        self.design = {}
        self.risk = {}
        first = True

        for t in iterkeys(self.failures):
            if self.time_dependent:
                d = np.array([s(self.formula, time=t)
                             for s in self.subjects]).astype(float)[:,None]
                dshape = d.shape
                dfile = open(tempfile.mkstemp(dir=self.cachedir)[1], 'wb')  # open() in binary mode for tofile(); was the Python 2 file(..., 'w')
                d.tofile(dfile)
                dfile.close()
                del d
                self.design[t] = np.memmap(dfile.name,
                                           dtype=np.dtype(float),
                                           shape=dshape)
            elif first:
                d = np.array([s(self.formula, time=t)
                             for s in self.subjects]).astype(np.float64)
                self.design[t] = d
                first = False  # compute the design once and reuse it for later times
            else:
                self.design[t] = d
            self.risk[t] = np.compress([s.atrisk(t) for s in self.subjects],
                                       np.arange(self.design[t].shape[0]),
                                       axis=-1)
Example #13
    def information(self, b, ties='breslow'):

        info = 0 #np.zeros((len(b),len(b))) #0
        score = 0
        for t in iterkeys(self.failures):
            fail = self.failures[t]
            d = len(fail)
            risk = self.risk[t]
            Z = self.design[t]

            if ties == 'breslow':
                w = np.exp(np.dot(Z, b))
                rv = Discrete(Z[risk], w=w[risk])
                info += rv.cov()
            elif ties == 'efron':
                w = np.exp(np.dot(Z, b))
                score += Z[fail].sum()
                for j in range(d):
                    efron_w = w.copy()  # copy so w is not mutated across iterations
                    efron_w[fail] -= j * w[fail] / d  # was `i`, an undefined name
                    rv = Discrete(Z[risk], w=efron_w[risk])
                    info += rv.cov()
            elif ties == 'cox':
                raise NotImplementedError('Cox tie breaking method not \
implemented')
            else:
                raise NotImplementedError('tie breaking method not recognized')
        return info  # the accumulated information matrix (was: return score)
Example #14
    def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
        '''Add the contents of a Dict to summary table

        Parameters
        ----------
        d : dict
            Keys and values are automatically coerced to strings with str().
            Users are encouraged to format them before using add_dict.
        ncols : int
            Number of columns of the output table
        align : str
            Data alignment (l/c/r)
        '''

        keys = [_formatter(x, float_format) for x in iterkeys(d)]
        vals = [_formatter(x, float_format) for x in itervalues(d)]
        data = np.array(lzip(keys, vals))

        if data.shape[0] % ncols != 0:
            pad = ncols - (data.shape[0] % ncols)
            data = np.vstack([data, np.array(pad * [['', '']])])

        data = np.split(data, ncols)
        data = reduce(lambda x, y: np.hstack([x, y]), data)
        self.add_array(data, align=align)
Example #15
    def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
        '''Add the contents of a Dict to summary table

        Parameters
        ----------
        d : dict
            Keys and values are automatically coerced to strings with str().
            Users are encouraged to format them before using add_dict.
        ncols : int
            Number of columns of the output table
        align : str
            Data alignment (l/c/r)
        '''

        keys = [_formatter(x, float_format) for x in iterkeys(d)]
        vals = [_formatter(x, float_format) for x in itervalues(d)]
        data = np.array(lzip(keys, vals))

        if data.shape[0] % ncols != 0:
            pad = ncols - (data.shape[0] % ncols)
            data = np.vstack([data, np.array(pad * [['', '']])])

        data = np.split(data, ncols)
        data = reduce(lambda x, y: np.hstack([x, y]), data)
        self.add_array(data, align=align)
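
The padding and column-splitting in add_dict can be exercised on its own. A sketch with a hypothetical, already string-formatted dict standing in for the _formatter output:

import numpy as np
from functools import reduce

d = {'N': '100', 'R-squared': '0.850', 'AIC': '12.3'}
ncols = 2
data = np.array(list(d.items()))                     # (3, 2) array of strings
if data.shape[0] % ncols != 0:
    pad = ncols - (data.shape[0] % ncols)            # pad to a multiple of ncols
    data = np.vstack([data, np.array(pad * [['', '']])])
data = np.split(data, ncols)                         # ncols row-blocks...
data = reduce(lambda x, y: np.hstack([x, y]), data)  # ...placed side by side
print(data)                                          # 2 rows x 4 columns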
Example #16
def test_recursive_split():
    keys = list(product('mf'))
    data = OrderedDict(zip(keys, [1] * len(keys)))
    res = _hierarchical_split(data, gap=0)
    assert_(list(iterkeys(res)) == keys)
    # note: these assignments look like intended eq() checks (cf. test__key_splitting)
    res[('m',)] = (0.0, 0.0, 0.5, 1.0)
    res[('f',)] = (0.5, 0.0, 0.5, 1.0)
    keys = list(product('mf', 'yao'))
    data = OrderedDict(zip(keys, [1] * len(keys)))
    res = _hierarchical_split(data, gap=0)
    assert_(list(iterkeys(res)) == keys)
    res[('m', 'y')] = (0.0, 0.0, 0.5, 1 / 3)
    res[('m', 'a')] = (0.0, 1 / 3, 0.5, 1 / 3)
    res[('m', 'o')] = (0.0, 2 / 3, 0.5, 1 / 3)
    res[('f', 'y')] = (0.5, 0.0, 0.5, 1 / 3)
    res[('f', 'a')] = (0.5, 1 / 3, 0.5, 1 / 3)
    res[('f', 'o')] = (0.5, 2 / 3, 0.5, 1 / 3)
Example #17
def test_recursive_split():
    keys = list(product('mf'))
    data = OrderedDict(zip(keys, [1] * len(keys)))
    res = _hierarchical_split(data, gap=0)
    assert_(list(iterkeys(res)) == keys)
    # note: these assignments look like intended eq() checks (cf. test__key_splitting)
    res[('m', )] = (0.0, 0.0, 0.5, 1.0)
    res[('f', )] = (0.5, 0.0, 0.5, 1.0)
    keys = list(product('mf', 'yao'))
    data = OrderedDict(zip(keys, [1] * len(keys)))
    res = _hierarchical_split(data, gap=0)
    assert_(list(iterkeys(res)) == keys)
    res[('m', 'y')] = (0.0, 0.0, 0.5, 1 / 3)
    res[('m', 'a')] = (0.0, 1 / 3, 0.5, 1 / 3)
    res[('m', 'o')] = (0.0, 2 / 3, 0.5, 1 / 3)
    res[('f', 'y')] = (0.5, 0.0, 0.5, 1 / 3)
    res[('f', 'a')] = (0.5, 1 / 3, 0.5, 1 / 3)
    res[('f', 'o')] = (0.5, 2 / 3, 0.5, 1 / 3)
Example #18
def test__key_splitting():
    # subdivide starting with an empty tuple
    base_rect = {tuple(): (0, 0, 1, 1)}
    res = _key_splitting(base_rect, ['a', 'b'], [1, 1], tuple(), True, 0)
    assert_(list(iterkeys(res)) == [('a',), ('b',)])
    eq(res[('a',)], (0, 0, 0.5, 1))
    eq(res[('b',)], (0.5, 0, 0.5, 1))
    # subdivide a in two sublevel
    res_bis = _key_splitting(res, ['c', 'd'], [1, 1], ('a',), False, 0)
    assert_(list(iterkeys(res_bis)) == [('a', 'c'), ('a', 'd'), ('b',)])
    eq(res_bis[('a', 'c')], (0.0, 0.0, 0.5, 0.5))
    eq(res_bis[('a', 'd')], (0.0, 0.5, 0.5, 0.5))
    eq(res_bis[('b',)], (0.5, 0, 0.5, 1))
    # starting with a non empty tuple and uneven distribution
    base_rect = {('total',): (0, 0, 1, 1)}
    res = _key_splitting(base_rect, ['a', 'b'], [1, 2], ('total',), True, 0)
    assert_(list(iterkeys(res)) == [('total',) + (e,) for e in ['a', 'b']])
    eq(res[('total', 'a')], (0, 0, 1 / 3, 1))
    eq(res[('total', 'b')], (1 / 3, 0, 2 / 3, 1))
Example #19
def test__key_splitting():
    # subdivide starting with an empty tuple
    base_rect = {tuple(): (0, 0, 1, 1)}
    res = _key_splitting(base_rect, ['a', 'b'], [1, 1], tuple(), True, 0)
    assert_(list(iterkeys(res)) == [('a', ), ('b', )])
    eq(res[('a', )], (0, 0, 0.5, 1))
    eq(res[('b', )], (0.5, 0, 0.5, 1))
    # subdivide a in two sublevel
    res_bis = _key_splitting(res, ['c', 'd'], [1, 1], ('a', ), False, 0)
    assert_(list(iterkeys(res_bis)) == [('a', 'c'), ('a', 'd'), ('b', )])
    eq(res_bis[('a', 'c')], (0.0, 0.0, 0.5, 0.5))
    eq(res_bis[('a', 'd')], (0.0, 0.5, 0.5, 0.5))
    eq(res_bis[('b', )], (0.5, 0, 0.5, 1))
    # starting with a non empty tuple and uneven distribution
    base_rect = {('total', ): (0, 0, 1, 1)}
    res = _key_splitting(base_rect, ['a', 'b'], [1, 2], ('total', ), True, 0)
    assert_(list(iterkeys(res)) == [('total', ) + (e, ) for e in ['a', 'b']])
    eq(res[('total', 'a')], (0, 0, 1 / 3, 1))
    eq(res[('total', 'b')], (1 / 3, 0, 2 / 3, 1))
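
The expected tuples in these tests are (x, y, width, height) rectangles in unit coordinates. A sketch of the proportional-split geometry the assertions encode (not the actual _key_splitting implementation, just the arithmetic it must satisfy):

def split_rect_horizontal(rect, weights):
    # divide rect left to right, one piece per weight, widths proportional
    x, y, w, h = rect
    total = float(sum(weights))
    pieces, x0 = [], x
    for wt in weights:
        frac = wt / total
        pieces.append((x0, y, w * frac, h))
        x0 += w * frac
    return pieces

print(split_rect_horizontal((0, 0, 1, 1), [1, 2]))
# [(0, 0, 1/3, 1), (1/3, 0, 2/3, 1)] up to float rounding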
Example #20
    def verify(self):
        '''load the saved module and verify the data

        This tries several ways of comparing the saved and the attached data,
        but might not work for all possible data structures.

        Returns
        -------
        all_correct : bool
            True if no differences are found; for floating point numbers,
            rtol=1e-16 and atol=1e-16 are used to determine equality (allclose)
        correctli : list
            list of attribute names that compare as equal
        incorrectli : list
            list of attribute names that did not compare as equal, either
            because they differ or because the comparison does not handle the
            data structure correctly

        '''
        module = __import__(self._filename.replace('.py', ''))
        if not self._useinstance:
            raise NotImplementedError('currently only implemented when '
                                      'useinstance is true')
        data = getattr(module, self.name)
        correctli = []
        incorrectli = []

        for d in self._what:
            self_item = getattr(data, d)
            saved_item = getattr(data, d)  # note: reads the same attribute as self_item
            #print(d)
            #try simple equality
            correct = np.all(self_item == saved_item)  # was self.item: undefined attribute
            #try allclose
            if not correct and not self_item.dtype == np.dtype('object'):
                correct = np.allclose(self_item,
                                      saved_item,
                                      rtol=1e-16,
                                      atol=1e-16)
                if not correct:
                    import warnings
                    warnings.warn("inexact precision in " + d, RuntimeWarning)
            #try iterating, if object array
            if not correct:
                correlem = [
                    np.all(data[d].item()[k] == getattr(
                        testsave.var_results, d).item()[k])
                    for k in iterkeys(data[d].item())
                ]
                if not all(correlem):  # was: if not correlem (true only for an empty list)
                    #print(d, "wrong")
                    incorrectli.append(d)
            correctli.append(d)

        return len(incorrectli) == 0, correctli, incorrectli
Example #21
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can add a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, and the formula
        object.

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y,X
    exog : array-like
        Should preserve the input type of Y,X. Could be None.
    """
    # half ass attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    if X is not None:
        if data_util._is_using_pandas(Y, X):
            result = dmatrices(formula, (Y, X), depth,
                               return_type='dataframe', NA_action=na_action)
        else:
            result = dmatrices(formula, (Y, X), depth,
                               return_type='dataframe', NA_action=na_action)
    else:
        if data_util._is_using_pandas(Y, None):
            result = dmatrices(formula, Y, depth, return_type='dataframe',
                               NA_action=na_action)
        else:
            result = dmatrices(formula, Y, depth, return_type='dataframe',
                               NA_action=na_action)

    # if missing == 'raise' there is no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    if len(result) > 1:  # have RHS design
        design_info = result[1].design_info  # detach it from DataFrame
    else:
        design_info = None
    # NOTE: is there ever a case where we'd need LHS design_info?
    return result, missing_mask, design_info
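
Under the hood both branches delegate to patsy's dmatrices. A minimal usage sketch with plain patsy; note that the NAAction used above is statsmodels' own subclass, which additionally records missing_mask, while base patsy just drops or raises:

import pandas as pd
from patsy import dmatrices

df = pd.DataFrame({'y': [1.0, 2.0, None, 4.0],
                   'x': [0.1, 0.2, 0.3, 0.4]})
y, X = dmatrices('y ~ x', df, return_type='dataframe', NA_action='drop')
# y and X come back as DataFrames with the incomplete row dropped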
Example #22
def interactions(terms, order=[1, 2]):
    """
    Output all pairwise interactions of given order of a
    sequence of terms.

    The argument order is a sequence specifying which order
    of interactions should be generated -- the default
    creates main effects and two-way interactions. If order
    is an integer, it is changed to range(1,order+1), so
    order=3 is equivalent to order=[1,2,3], generating
    all one, two and three-way interactions.

    If any entry of order is greater than len(terms), it is
    effectively treated as len(terms).

    >>> print(interactions([Term(l) for l in ['a', 'b', 'c']]))
    <formula: a*b + a*c + b*c + a + b + c>

    >>> print(interactions([Term(l) for l in ['a', 'b', 'c']], order=list(range(5))))
    <formula: a*b + a*b*c + a*c + b*c + a + b + c>

    """
    n_terms = len(terms)

    values = {}

    if np.asarray(order).shape == ():
        order = lrange(1, int(order) + 1)

    # First order

    for o in order:
        indices = np.indices((n_terms, ) * (o))
        indices.shape = (indices.shape[0], np.prod(indices.shape[1:]))  # np.product is removed in numpy 2.0
        for m in range(indices.shape[1]):

            # only keep combinations that have unique entries

            if (np.unique(indices[:, m]).shape == indices[:, m].shape
                    and np.all(
                        np.equal(np.sort(indices[:, m]), indices[:, m]))):
                ll = [terms[j] for j in indices[:, m]]
                v = ll[0]
                for ii in range(len(ll) - 1):
                    v *= ll[ii + 1]
                values[tuple(indices[:, m])] = v

    key = list(iterkeys(values))[0]
    value = values[key]
    del values[key]

    for v in itervalues(values):
        value += v
    return value
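
The uniqueness-and-sortedness filter inside the loop keeps exactly the strictly increasing index tuples, so the same enumeration can be written with itertools.combinations. A sketch of the equivalence:

from itertools import combinations

terms = ['a', 'b', 'c']
for o in [1, 2]:                    # order=[1, 2], the default
    for combo in combinations(range(len(terms)), o):
        print([terms[j] for j in combo])
# ['a'], ['b'], ['c'], ['a', 'b'], ['a', 'c'], ['b', 'c']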
Example #23
def interactions(terms, order=[1,2]):
    """
    Output all pairwise interactions of given order of a
    sequence of terms.

    The argument order is a sequence specifying which order
    of interactions should be generated -- the default
    creates main effects and two-way interactions. If order
    is an integer, it is changed to range(1,order+1), so
    order=3 is equivalent to order=[1,2,3], generating
    all one, two and three-way interactions.

    If any entry of order is greater than len(terms), it is
    effectively treated as len(terms).

    >>> print(interactions([Term(l) for l in ['a', 'b', 'c']]))
    <formula: a*b + a*c + b*c + a + b + c>

    >>> print(interactions([Term(l) for l in ['a', 'b', 'c']], order=list(range(5))))
    <formula: a*b + a*b*c + a*c + b*c + a + b + c>

    """
    l = len(terms)

    values = {}

    if np.asarray(order).shape == ():
        order = lrange(1, int(order)+1)

    # First order

    for o in order:
        I = np.indices((l,)*(o))
        I.shape = (I.shape[0], np.prod(I.shape[1:]))  # np.product is removed in numpy 2.0
        for m in range(I.shape[1]):

            # only keep combinations that have unique entries

            if (np.unique(I[:,m]).shape == I[:,m].shape and
                    np.all(np.equal(np.sort(I[:,m]), I[:,m]))):
                ll = [terms[j] for j in I[:,m]]
                v = ll[0]
                for ii in range(len(ll)-1):
                    v *= ll[ii+1]
                values[tuple(I[:,m])] = v

    key = list(iterkeys(values))[0]
    value = values[key]
    del values[key]

    for v in itervalues(values):
        value += v
    return value
Example #24
    def __init__(self, sys, indep_endog=None, instruments=None):
        if len(sys) % 2 != 0:
            raise ValueError("sys must be a list of pairs of endogenous and \
exogenous variables.  Got length %s" % len(sys))
        M = len(sys[1::2])
        self._M = M
        # The lists are probably a bad idea
        self.endog = sys[::2]  # these are just list containers
        self.exog = sys[1::2]
        self._K = [np.linalg.matrix_rank(_) for _ in sys[1::2]]
        #        fullexog = np.column_stack((_ for _ in self.exog))

        self.instruments = instruments

        # Keep the Y_j's in a container to get IVs
        instr_endog = {}
        [instr_endog.setdefault(_, []) for _ in iterkeys(indep_endog)]

        for eq_key in indep_endog:
            for varcol in indep_endog[eq_key]:
                instr_endog[eq_key].append(self.exog[eq_key][:, varcol])
                # ^ copy needed?
#        self._instr_endog = instr_endog

        self._indep_endog = indep_endog
        _col_map = np.cumsum(np.hstack((0, self._K)))  # starting col no.s
        # move this check to whiten since we're not going to build a full exog?
        for eq_key in indep_endog:
            try:
                iter(indep_endog[eq_key])
            except TypeError:  # iter() raises TypeError for non-iterables
                #                eq_key = [eq_key]
                raise TypeError("The values of the indep_endog dict must be "
                                "iterable. Got type %s for converter %s" %
                                (type(indep_endog[eq_key]), eq_key))


#            for del_col in indep_endog[eq_key]:
#                fullexog = np.delete(fullexog,  _col_map[eq_key]+del_col, 1)
#                _col_map[eq_key+1:] -= 1

# Josef's example for deleting recurring "rows"
#        fullexog = np.unique(fullexog.T.view([('',fullexog.dtype)]*\
#                fullexog.shape[0])).view(fullexog.dtype).reshape(\
#                fullexog.shape[0],-1)
# From http://article.gmane.org/gmane.comp.python.numeric.general/32276/
# Or Jouni's suggestion of taking a hash:
# http://www.mail-archive.com/[email protected]/msg04209.html
# not clear to me how this would work though, only if they are the *same*
# elements?
#        self.fullexog = fullexog
        self.wexog = self.whiten(instr_endog)
Example #25
    def observed_crude_oddsratio(self):
        """The crude odds ratio is obtained by pooling all data
        corresponding to a given pair of cut points (c,c'), then
        forming the inverse variance weighted average of these odds
        ratios to obtain a single OR.  Since the covariate effects are
        ignored, this OR will generally be greater than the stratified
        OR.
        """

        cpp = self.cpp
        endog = self.model.endog_li

        # Storage for the contingency tables for each (c,c')
        tables = {}
        for ii in iterkeys(cpp[0]):
            tables[ii] = np.zeros((2, 2), dtype=np.float64)

        # Get the observed crude OR
        for i in range(len(endog)):

            if len(endog[i]) == 0:
                continue

            # The observed joint values for the current cluster
            yvec = endog[i]
            endog_11 = np.outer(yvec, yvec)
            endog_10 = np.outer(yvec, 1 - yvec)
            endog_01 = np.outer(1 - yvec, yvec)
            endog_00 = np.outer(1 - yvec, 1 - yvec)

            cpp1 = cpp[i]
            for ky in iterkeys(cpp1):
                ix = cpp1[ky]
                tables[ky][1, 1] += endog_11[ix[:, 0], ix[:, 1]].sum()
                tables[ky][1, 0] += endog_10[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 1] += endog_01[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 0] += endog_00[ix[:, 0], ix[:, 1]].sum()

        return self.pooled_odds_ratio(list(itervalues(tables)))
Example #26
 def getglm(self):
     self.deviance = self.rsum['deviance']
     self.resid = [self.results['residuals'][str(k)] \
             for k in range(1, 1+self.nobs)]
     if isinstance(self.resid, dict):
         tmp = np.zeros(len(self.resid))
         for i in iterkeys(self.resid):
             tmp[int(i)-1] = self.resid[i]
         self.resid = tmp
     self.predict = [self.results['linear.predictors'][str(k)] \
             for k in range(1, 1+self.nobs)]
     self.fittedvalues = [self.results['fitted.values'][str(k)] \
             for k in range(1, 1+self.nobs)]
     self.weights = [self.results['weights'][str(k)] \
             for k in range(1, 1+self.nobs)]
     self.resid_deviance = self.rsum['deviance.resid']
     if isinstance(self.resid_deviance, dict):
         tmp = np.zeros(len(self.resid_deviance))
         for i in iterkeys(self.resid_deviance):
             tmp[int(i)-1] = self.resid_deviance[i]
         self.resid_deviance = tmp
     self.null_deviance = self.rsum['null.deviance']
Example #27
    def initialize(self, subjects):
        print('called initialize')
        self.failures = {}
        for i in range(len(subjects)):
            s = subjects[i]
            if s.delta:
                if s.time not in self.failures:
                    self.failures[s.time] = [i]
                else:
                    self.failures[s.time].append(i)

        self.failure_times = list(iterkeys(self.failures))
        self.failure_times.sort()
Example #28
    def initialize(self, subjects):
        print('called initialize')
        self.failures = {}
        for i in range(len(subjects)):
            s = subjects[i]
            if s.delta:
                if s.time not in self.failures:
                    self.failures[s.time] = [i]
                else:
                    self.failures[s.time].append(i)

        self.failure_times = list(iterkeys(self.failures))
        self.failure_times.sort()
Example #29
 def getglm(self):
     self.deviance = self.rsum['deviance']
     self.resid = [self.results['residuals'][str(k)] \
             for k in range(1, 1+self.nobs)]
     if isinstance(self.resid, dict):
         tmp = np.zeros(len(self.resid))
         for i in iterkeys(self.resid):
             tmp[int(i) - 1] = self.resid[i]
         self.resid = tmp
     self.predict = [self.results['linear.predictors'][str(k)] \
             for k in range(1, 1+self.nobs)]
     self.fittedvalues = [self.results['fitted.values'][str(k)] \
             for k in range(1, 1+self.nobs)]
     self.weights = [self.results['weights'][str(k)] \
             for k in range(1, 1+self.nobs)]
     self.resid_deviance = self.rsum['deviance.resid']
     if isinstance(self.resid_deviance, dict):
         tmp = np.zeros(len(self.resid_deviance))
         for i in iterkeys(self.resid_deviance):
             tmp[int(i) - 1] = self.resid_deviance[i]
         self.resid_deviance = tmp
     self.null_deviance = self.rsum['null.deviance']
Example #30
    def observed_crude_oddsratio(self):
        """The crude odds ratio is obtained by pooling all data
        corresponding to a given pair of cut points (c,c'), then
        forming the inverse variance weighted average of these odds
        ratios to obtain a single OR.  Since the covariate effects are
        ignored, this OR will generally be greater than the stratified
        OR.
        """

        cpp = self.cpp
        endog = self.model.endog_li

        # Storage for the contingency tables for each (c,c')
        tables = {}
        for ii in iterkeys(cpp[0]):
            tables[ii] = np.zeros((2, 2), dtype=np.float64)

        # Get the observed crude OR
        for i in range(len(endog)):

            if len(endog[i]) == 0:
                continue

            # The observed joint values for the current cluster
            yvec = endog[i]
            endog_11 = np.outer(yvec, yvec)
            endog_10 = np.outer(yvec, 1 - yvec)
            endog_01 = np.outer(1 - yvec, yvec)
            endog_00 = np.outer(1 - yvec, 1 - yvec)

            cpp1 = cpp[i]
            for ky in iterkeys(cpp1):
                ix = cpp1[ky]
                tables[ky][1, 1] += endog_11[ix[:, 0], ix[:, 1]].sum()
                tables[ky][1, 0] += endog_10[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 1] += endog_01[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 0] += endog_00[ix[:, 0], ix[:, 1]].sum()

        return self.pooled_odds_ratio(list(itervalues(tables)))
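
Each tables[ky] is a 2x2 contingency table accumulated from the outer products above. A self-contained sketch with one hypothetical cluster and its pair indices; the crude OR of a single table is n11*n00 / (n10*n01), and pooled_odds_ratio combines the tables by inverse-variance weighting:

import numpy as np

yvec = np.array([1, 0, 1])               # binary outcomes in one cluster
endog_11 = np.outer(yvec, yvec)          # both members of the pair are 1
endog_10 = np.outer(yvec, 1 - yvec)      # first is 1, second is 0
endog_01 = np.outer(1 - yvec, yvec)
endog_00 = np.outer(1 - yvec, 1 - yvec)

ix = np.array([[0, 1], [0, 2], [1, 2]])  # hypothetical within-cluster pairs
table = np.zeros((2, 2))
table[1, 1] = endog_11[ix[:, 0], ix[:, 1]].sum()
table[1, 0] = endog_10[ix[:, 0], ix[:, 1]].sum()
table[0, 1] = endog_01[ix[:, 0], ix[:, 1]].sum()
table[0, 0] = endog_00[ix[:, 0], ix[:, 1]].sum()
print(table)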
Example #31
    def verify(self):
        '''load the saved module and verify the data

        This tries several ways of comparing the saved and the attached data,
        but might not work for all possible data structures.

        Returns
        -------
        all_correct : bool
            True if no differences are found; for floating point numbers,
            rtol=1e-16 and atol=1e-16 are used to determine equality (allclose)
        correctli : list
            list of attribute names that compare as equal
        incorrectli : list
            list of attribute names that did not compare as equal, either
            because they differ or because the comparison does not handle the
            data structure correctly

        '''
        module = __import__(self._filename.replace('.py',''))
        if not self._useinstance:
            raise NotImplementedError('currently only implemented when '
                                      'useinstance is true')
        data = getattr(module, self.name)
        correctli = []
        incorrectli = []

        for d in self._what:
            self_item = getattr(data, d)
            saved_item = getattr(data, d)  # note: reads the same attribute as self_item
            #print(d)
            #try simple equality
            correct = np.all(self_item == saved_item)  # was self.item: undefined attribute
            #try allclose
            if not correct and not self_item.dtype == np.dtype('object'):
                correct = np.allclose(self_item, saved_item,
                                      rtol=1e-16, atol=1e-16)
                if not correct:
                    import warnings
                    warnings.warn("inexact precision in "+d, RuntimeWarning)
            #try iterating, if object array
            if not correct:
                correlem = [np.all(data[d].item()[k] ==
                                   getattr(testsave.var_results, d).item()[k])
                            for k in iterkeys(data[d].item())]
                if not all(correlem):  # was: if not correlem (true only for an empty list)
                    #print(d, "wrong")
                    incorrectli.append(d)
            correctli.append(d)

        return len(incorrectli) == 0, correctli, incorrectli
Example #32
    def test_all_to_tbl(self):
        from statsmodels.stats.libqsturng.make_tbls import T,R
        ps, rs, vs, qs = [], [], [], []
        for p in T:
            for v in T[p]:
                for r in iterkeys(R):
                    ps.append(p)
                    vs.append(v)
                    rs.append(r)
                    qs.append(T[p][v][R[r]])

        qs = np.array(qs)
        errors = np.abs(qs-qsturng(ps,rs,vs))/qs
        assert_equal(np.array([]), np.where(errors > .03)[0])
Example #33
    def __init__(self, sys, indep_endog=None, instruments=None):
        if len(sys) % 2 != 0:
            raise ValueError("sys must be a list of pairs of endogenous and \
exogenous variables.  Got length %s" % len(sys))
        M = len(sys[1::2])
        self._M = M
# The lists are probably a bad idea
        self.endog = sys[::2]   # these are just list containers
        self.exog = sys[1::2]
        self._K = [np.linalg.matrix_rank(_) for _ in sys[1::2]]
#        fullexog = np.column_stack((_ for _ in self.exog))

        self.instruments = instruments

        # Keep the Y_j's in a container to get IVs
        instr_endog = {}
        [instr_endog.setdefault(_,[]) for _ in iterkeys(indep_endog)]

        for eq_key in indep_endog:
            for varcol in indep_endog[eq_key]:
                instr_endog[eq_key].append(self.exog[eq_key][:,varcol])
                # ^ copy needed?
#        self._instr_endog = instr_endog

        self._indep_endog = indep_endog
        _col_map = np.cumsum(np.hstack((0,self._K))) # starting col no.s
# move this check to whiten since we're not going to build a full exog?
        for eq_key in indep_endog:
            try:
                iter(indep_endog[eq_key])
            except TypeError:  # iter() raises TypeError for non-iterables
#                eq_key = [eq_key]
                raise TypeError("The values of the indep_endog dict must be "
                                "iterable. Got type %s for converter %s"
                                % (type(indep_endog[eq_key]), eq_key))
#            for del_col in indep_endog[eq_key]:
#                fullexog = np.delete(fullexog,  _col_map[eq_key]+del_col, 1)
#                _col_map[eq_key+1:] -= 1

# Josef's example for deleting recurring "rows"
#        fullexog = np.unique(fullexog.T.view([('',fullexog.dtype)]*\
#                fullexog.shape[0])).view(fullexog.dtype).reshape(\
#                fullexog.shape[0],-1)
# From http://article.gmane.org/gmane.comp.python.numeric.general/32276/
# Or Jouni's suggestion of taking a hash:
# http://www.mail-archive.com/[email protected]/msg04209.html
# not clear to me how this would work though, only if they are the *same*
# elements?
#        self.fullexog = fullexog
        self.wexog = self.whiten(instr_endog)
Example #34
    def t_est_all_to_tbl(self):  # the t_est_ prefix keeps the runner from collecting this test
        from statsmodels.stats.libqsturng.make_tbls import T,R
        ps, rs, vs, qs = [], [], [], []
        for p in T:
            for v in T[p]:
                for r in iterkeys(R):
                    ps.append(p)
                    vs.append(v)
                    rs.append(r)
                    qs.append(T[p][v][R[r]])

        qs = np.array(qs)
        errors = np.abs(qs-qsturng(ps,rs,vs))/qs
        assert_equal(np.array([]), np.where(errors > .03)[0])
Example #35
def test_mosaic_very_complex():
    # make a scattermatrix of mosaic plots to show the correlations between
    # each pair of variable in a dataset. Could be easily converted into a
    # new function that does this automatically based on the type of data
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old', 'young'],
                ['healthy', 'ill'], ['work', 'unemployed'])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {}
    props[('male', 'old')] = {'color': 'r'}
    props[('female', )] = {'color': 'pink'}
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i in range(L):
        for j in range(L):
            m = set(range(L)).difference(set((i, j)))
            if i == j:
                axes[i, i].text(0.5,
                                0.5,
                                key_name[i],
                                ha='center',
                                va='center')
                axes[i, i].set_xticks([])
                axes[i, i].set_xticklabels([])
                axes[i, i].set_yticks([])
                axes[i, i].set_yticklabels([])
            else:
                ji = max(i, j)
                ij = min(i, j)
                temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r]
                                                                 for r in m),
                                          v) for k, v in iteritems(data)])

                keys = list(iterkeys(temp_data))
                for k in keys:
                    value = _reduce_dict(temp_data, k[:2])
                    temp_data[k[:2]] = value
                    del temp_data[k]
                mosaic(temp_data,
                       ax=axes[i, j],
                       axes_label=False,
                       properties=props,
                       gap=0.05,
                       horizontal=i > j)
    pylab.suptitle('old males should look bright red,  (plot 4 of 4)')
    #pylab.show()
    pylab.close('all')
Example #36
def _normalize_data(data, index):
    """normalize the data to a dict with tuples of strings as keys
    right now it works with:

        0 - dictionary (or equivalent mappable)
        1 - pandas.Series with simple or hierarchical indexes
        2 - numpy.ndarrays
        3 - everything that can be converted to a numpy array
        4 - pandas.DataFrame (via the _normalize_dataframe function)
    """
    # if data is a dataframe we need to take a completely new road
    # before coming back here. Use the hasattr to avoid importing
    # pandas explicitly
    if hasattr(data, 'pivot') and hasattr(data, 'groupby'):
        data = _normalize_dataframe(data, index)
        index = None
    # can it be used as a dictionary?
    try:
        items = list(iteritems(data))
    except AttributeError:
        # ok, I cannot use the data as a dictionary
        # Try to convert it to a numpy array, or die trying
        data = np.asarray(data)
        temp = OrderedDict()
        for idx in np.ndindex(data.shape):
            name = tuple(i for i in idx)
            temp[name] = data[idx]
        data = temp
        items = list(iteritems(data))
    # make all the keys a tuple, even if simple numbers
    data = OrderedDict([_tuplify(k), v] for k, v in items)
    categories_levels = _categories_level(list(iterkeys(data)))
    # fill the void in the counting dictionary
    indexes = product(*categories_levels)
    contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes])
    data = contingency
    # reorder the keys according to the order specified by the user,
    # or, if the index is None, convert it into a simple list;
    # right now it does no checking, but that can be modified in the future
    index = lrange(len(categories_levels)) if index is None else index
    contingency = OrderedDict()
    for key, value in iteritems(data):
        new_key = tuple(key[i] for i in index)
        contingency[new_key] = value
    data = contingency
    return data
Example #37
def _normalize_data(data, index):
    """normalize the data to a dict with tuples of strings as keys
    right now it works with:

        0 - dictionary (or equivalent mappable)
        1 - pandas.Series with simple or hierarchical indexes
        2 - numpy.ndarrays
        3 - everything that can be converted to a numpy array
        4 - pandas.DataFrame (via the _normalize_dataframe function)
    """
    # if data is a dataframe we need to take a completely new road
    # before coming back here. Use the hasattr to avoid importing
    # pandas explicitly
    if hasattr(data, 'pivot') and hasattr(data, 'groupby'):
        data = _normalize_dataframe(data, index)
        index = None
    # can it be used as a dictionary?
    try:
        items = list(iteritems(data))
    except AttributeError:
        # ok, I cannot use the data as a dictionary
        # Try to convert it to a numpy array, or die trying
        data = np.asarray(data)
        temp = OrderedDict()
        for idx in np.ndindex(data.shape):
            name = tuple(i for i in idx)
            temp[name] = data[idx]
        data = temp
        items = list(iteritems(data))
    # make all the keys a tuple, even if simple numbers
    data = OrderedDict([_tuplify(k), v] for k, v in items)
    categories_levels = _categories_level(list(iterkeys(data)))
    # fill the void in the counting dictionary
    indexes = product(*categories_levels)
    contingency = OrderedDict([(k, data.get(k, 0)) for k in indexes])
    data = contingency
    # reorder the keys according to the order specified by the user,
    # or, if the index is None, convert it into a simple list;
    # right now it does no checking, but that can be modified in the future
    index = lrange(len(categories_levels)) if index is None else index
    contingency = OrderedDict()
    for key, value in iteritems(data):
        new_key = tuple(key[i] for i in index)
        contingency[new_key] = value
    data = contingency
    return data
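
The array branch above gives every ndarray cell a tuple key. A minimal standalone sketch of that normalization step:

from collections import OrderedDict
import numpy as np

data = np.array([[1, 2], [3, 4]])
temp = OrderedDict()
for idx in np.ndindex(data.shape):  # idx is already a tuple, e.g. (0, 1)
    temp[idx] = data[idx]
print(list(temp.items()))  # tuple keys (0, 0)...(1, 1) mapped to the cell values 1..4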
Example #38
def _recode(x, levels):
    """ Recode categorial data to int factor.

    Parameters
    ----------
    x : array-like
        array-like object, supporting numpy array methods, of categorically
        coded data.
    levels : dict
        mapping of labels to integer-codings

    Returns
    -------
    out : instance numpy.ndarray

    """
    from pandas import Series
    name = None
    index = None

    if isinstance(x, Series):
        name = x.name
        index = x.index
        x = x.values

    if x.dtype.type not in [np.str_, np.object_]:
        raise ValueError('This is not a categorical factor.'
                         ' Array of str type required.')

    elif not isinstance(levels, dict):
        raise ValueError('This is not a valid value for levels.'
                         ' Dict required.')

    elif not (np.unique(x) == np.unique(list(iterkeys(levels)))).all():
        raise ValueError('The levels do not match the array values.')

    else:
        out = np.empty(x.shape[0], dtype=int)  # np.int was removed in modern numpy
        for level, coding in iteritems(levels):
            out[x == level] = coding

        if name:
            out = Series(out, name=name, index=index)

        return out
Example #39
def _create_default_properties(data):
    """"Create the default properties of the mosaic given the data
    first it will varies the color hue (first category) then the color
    saturation (second category) and then the color value
    (third category).  If a fourth category is found, it will put
    decoration on the rectangle.  Doesn't manage more than four
    level of categories
    """
    categories_levels = _categories_level(list(iterkeys(data)))
    Nlevels = len(categories_levels)
    # first level, the hue
    L = len(categories_levels[0])
    # hue = np.linspace(1.0, 0.0, L+1)[:-1]
    hue = np.linspace(0.0, 1.0, L + 2)[:-2]
    # second level, the saturation
    L = len(categories_levels[1]) if Nlevels > 1 else 1
    saturation = np.linspace(0.5, 1.0, L + 1)[:-1]
    # third level, the value
    L = len(categories_levels[2]) if Nlevels > 2 else 1
    value = np.linspace(0.5, 1.0, L + 1)[:-1]
    # fourth level, the hatch
    L = len(categories_levels[3]) if Nlevels > 3 else 1
    hatch = ['', '/', '-', '|', '+'][:L + 1]
    # convert in list and merge with the levels
    hue = lzip(list(hue), categories_levels[0])
    saturation = lzip(list(saturation),
                      categories_levels[1] if Nlevels > 1 else [''])
    value = lzip(list(value),
                 categories_levels[2] if Nlevels > 2 else [''])
    hatch = lzip(list(hatch),
                 categories_levels[3] if Nlevels > 3 else [''])
    # create the properties dictionary
    properties = {}
    for h, s, v, t in product(hue, saturation, value, hatch):
        hv, hn = h
        sv, sn = s
        vv, vn = v
        tv, tn = t
        level = (hn,) + ((sn,) if sn else tuple())
        level = level + ((vn,) if vn else tuple())
        level = level + ((tn,) if tn else tuple())
        hsv = array([hv, sv, vv])
        prop = {'color': _single_hsv_to_rgb(hsv), 'hatch': tv, 'lw': 0}
        properties[level] = prop
    return properties
Example #40
def _create_default_properties(data):
    """"Create the default properties of the mosaic given the data
    first it will varies the color hue (first category) then the color
    saturation (second category) and then the color value
    (third category).  If a fourth category is found, it will put
    decoration on the rectangle.  Doesn't manage more than four
    level of categories
    """
    categories_levels = _categories_level(list(iterkeys(data)))
    Nlevels = len(categories_levels)
    # first level, the hue
    L = len(categories_levels[0])
    # hue = np.linspace(1.0, 0.0, L+1)[:-1]
    hue = np.linspace(0.0, 1.0, L + 2)[:-2]
    # second level, the saturation
    L = len(categories_levels[1]) if Nlevels > 1 else 1
    saturation = np.linspace(0.5, 1.0, L + 1)[:-1]
    # third level, the value
    L = len(categories_levels[2]) if Nlevels > 2 else 1
    value = np.linspace(0.5, 1.0, L + 1)[:-1]
    # fourth level, the hatch
    L = len(categories_levels[3]) if Nlevels > 3 else 1
    hatch = ['', '/', '-', '|', '+'][:L + 1]
    # convert in list and merge with the levels
    hue = lzip(list(hue), categories_levels[0])
    saturation = lzip(list(saturation),
                      categories_levels[1] if Nlevels > 1 else [''])
    value = lzip(list(value),
                 categories_levels[2] if Nlevels > 2 else [''])
    hatch = lzip(list(hatch),
                 categories_levels[3] if Nlevels > 3 else [''])
    # create the properties dictionary
    properties = {}
    for h, s, v, t in product(hue, saturation, value, hatch):
        hv, hn = h
        sv, sn = s
        vv, vn = v
        tv, tn = t
        level = (hn,) + ((sn,) if sn else tuple())
        level = level + ((vn,) if vn else tuple())
        level = level + ((tn,) if tn else tuple())
        hsv = array([hv, sv, vv])
        prop = {'color': _single_hsv_to_rgb(hsv), 'hatch': tv, 'lw': 0}
        properties[level] = prop
    return properties
Example #41
def _recode(x, levels):
    """ Recode categorial data to int factor.

    Parameters
    ----------
    x : array-like
        array-like object, supporting numpy array methods, of categorically
        coded data.
    levels : dict
        mapping of labels to integer-codings

    Returns
    -------
    out : instance numpy.ndarray

    """
    from pandas import Series
    name = None
    index = None

    if isinstance(x, Series):
        name = x.name
        index = x.index
        x = x.values

    if x.dtype.type not in [np.str_, np.object_]:
        raise ValueError('This is not a categorical factor.'
                         ' Array of str type required.')

    elif not isinstance(levels, dict):
        raise ValueError('This is not a valid value for levels.'
                         ' Dict required.')

    elif not (np.unique(x) == np.unique(list(iterkeys(levels)))).all():
        raise ValueError('The levels do not match the array values.')

    else:
        out = np.empty(x.shape[0], dtype=int)  # np.int was removed in modern numpy
        for level, coding in iteritems(levels):
            out[x == level] = coding

        if name:
            out = Series(out, name=name, index=index)

        return out
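
The core recoding step in _recode, standalone, with hypothetical labels:

import numpy as np

x = np.array(['low', 'high', 'low'], dtype=object)
levels = {'low': 0, 'high': 1}
out = np.empty(x.shape[0], dtype=int)
for level, coding in levels.items():
    out[x == level] = coding  # boolean mask per label
print(out)  # [0 1 0]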
Example #42
    def update(self, params):
        """
        Update the global odds ratio based on the current value of
        params.
        """

        endog = self.model.endog_li
        cpp = self.cpp
        cached_means = self.model.cached_means

        # This will happen if all the clusters have only
        # one observation
        if len(cpp[0]) == 0:
            return

        tables = {}
        for ii in cpp[0]:
            tables[ii] = np.zeros((2, 2), dtype=np.float64)

        for i in range(self.model.num_group):

            endog_expval, _ = cached_means[i]

            emat_11 = self.get_eyy(endog_expval, i)
            emat_10 = endog_expval[:, None] - emat_11
            emat_01 = -emat_11 + endog_expval
            emat_00 = 1. - (emat_11 + emat_10 + emat_01)

            cpp1 = cpp[i]
            for ky in iterkeys(cpp1):
                ix = cpp1[ky]
                tables[ky][1, 1] += emat_11[ix[:, 0], ix[:, 1]].sum()
                tables[ky][1, 0] += emat_10[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 1] += emat_01[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 0] += emat_00[ix[:, 0], ix[:, 1]].sum()

        cor_expval = self.pooled_odds_ratio(list(itervalues(tables)))

        self.dep_params *= self.crude_or / cor_expval
        if not np.isfinite(self.dep_params):
            self.dep_params = 1.
            warnings.warn("dep_params became inf, resetting to 1",
                          ConvergenceWarning)
Example #43
    def update(self, params):
        """
        Update the global odds ratio based on the current value of
        params.
        """

        endog = self.model.endog_li
        cpp = self.cpp
        cached_means = self.model.cached_means

        # This will happen if all the clusters have only
        # one observation
        if len(cpp[0]) == 0:
            return

        tables = {}
        for ii in cpp[0]:
            tables[ii] = np.zeros((2, 2), dtype=np.float64)

        for i in range(self.model.num_group):

            endog_expval, _ = cached_means[i]

            emat_11 = self.get_eyy(endog_expval, i)
            emat_10 = endog_expval[:, None] - emat_11
            emat_01 = -emat_11 + endog_expval
            emat_00 = 1. - (emat_11 + emat_10 + emat_01)

            cpp1 = cpp[i]
            for ky in iterkeys(cpp1):
                ix = cpp1[ky]
                tables[ky][1, 1] += emat_11[ix[:, 0], ix[:, 1]].sum()
                tables[ky][1, 0] += emat_10[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 1] += emat_01[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 0] += emat_00[ix[:, 0], ix[:, 1]].sum()

        cor_expval = self.pooled_odds_ratio(list(itervalues(tables)))

        self.dep_params *= self.crude_or / cor_expval
        if not np.isfinite(self.dep_params):
            self.dep_params = 1.
            warnings.warn("dep_params became inf, resetting to 1",
                          ConvergenceWarning)
Example #44
def _statistical_coloring(data):
    """evaluate colors from the indipendence properties of the matrix
    It will encounter problem if one category has all zeros
    """
    data = _normalize_data(data, None)
    categories_levels = _categories_level(list(iterkeys(data)))
    Nlevels = len(categories_levels)
    total = 1.0 * sum(v for v in itervalues(data))
    # count the proportion of observations
    # that carry each given name at each level
    levels_count = []
    for level_idx in range(Nlevels):
        proportion = {}
        for level in categories_levels[level_idx]:
            proportion[level] = 0.0
            for key, value in iteritems(data):
                if level == key[level_idx]:
                    proportion[level] += value
            proportion[level] /= total
        levels_count.append(proportion)
    # for each key I obtain the expected value
    # and its standard deviation from a binomial distribution
    # under the hypothesis of independence
    expected = {}
    for key, value in iteritems(data):
        base = 1.0
        for i, k in enumerate(key):
            base *= levels_count[i][k]
        expected[key] = base * total, np.sqrt(total * base * (1.0 - base))
    # now we have the standard deviation of distance from the
    # expected value for each tile. We create the colors from this
    sigmas = dict((k, (data[k] - m) / s) for k, (m, s) in iteritems(expected))
    props = {}
    for key, dev in iteritems(sigmas):
        red = 0.0 if dev < 0 else (dev / (1 + dev))
        blue = 0.0 if dev > 0 else (dev / (-1 + dev))
        green = (1.0 - red - blue) / 2.0
        hatch = 'x' if dev > 2 else 'o' if dev < -2 else ''
        props[key] = {'color': [red, green, blue], 'hatch': hatch}
    return props
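
A quick standalone check of the deviation-to-color mapping defined above (extracted for illustration, not a new API): positive deviations push the tile toward red, negative toward blue.

def dev_to_color(dev):
    red = 0.0 if dev < 0 else dev / (1 + dev)
    blue = 0.0 if dev > 0 else dev / (-1 + dev)
    green = (1.0 - red - blue) / 2.0
    return [red, green, blue]

print(dev_to_color(3.0))   # [0.75, 0.125, 0.0] -> reddish tile
print(dev_to_color(-3.0))  # [0.0, 0.125, 0.75] -> bluish tile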
Example #45
def _statistical_coloring(data):
    """evaluate colors from the indipendence properties of the matrix
    It will encounter problem if one category has all zeros
    """
    data = _normalize_data(data, None)
    categories_levels = _categories_level(list(iterkeys(data)))
    Nlevels = len(categories_levels)
    total = 1.0 * sum(v for v in itervalues(data))
    # count the proportion of observations
    # that carry each given name at each level
    levels_count = []
    for level_idx in range(Nlevels):
        proportion = {}
        for level in categories_levels[level_idx]:
            proportion[level] = 0.0
            for key, value in iteritems(data):
                if level == key[level_idx]:
                    proportion[level] += value
            proportion[level] /= total
        levels_count.append(proportion)
    # for each key I obtain the expected value
    # and its standard deviation from a binomial distribution
    # under the hypothesis of independence
    expected = {}
    for key, value in iteritems(data):
        base = 1.0
        for i, k in enumerate(key):
            base *= levels_count[i][k]
        expected[key] = base * total, np.sqrt(total * base * (1.0 - base))
    # now we have the standard deviation of distance from the
    # expected value for each tile. We create the colors from this
    sigmas = dict((k, (data[k] - m) / s) for k, (m, s) in iteritems(expected))
    props = {}
    for key, dev in iteritems(sigmas):
        red = 0.0 if dev < 0 else (dev / (1 + dev))
        blue = 0.0 if dev > 0 else (dev / (-1 + dev))
        green = (1.0 - red - blue) / 2.0
        hatch = 'x' if dev > 2 else 'o' if dev < -2 else ''
        props[key] = {'color': [red, green, blue], 'hatch': hatch}
    return props
Example #46
def test_mosaic_very_complex():
    # make a scatter-matrix of mosaic plots to show the correlations between
    # each pair of variables in a dataset. Could be easily converted into a
    # new function that does this automatically based on the type of data
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old', 'young'],
                    ['healty', 'ill'], ['work', 'unemployed'])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {}
    props[('male', 'old')] = {'color': 'r'}
    props[('female',)] = {'color': 'pink'}
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i in range(L):
        for j in range(L):
            m = set(range(L)).difference(set((i, j)))
            if i == j:
                axes[i, i].text(0.5, 0.5, key_name[i],
                                ha='center', va='center')
                axes[i, i].set_xticks([])
                axes[i, i].set_xticklabels([])
                axes[i, i].set_yticks([])
                axes[i, i].set_yticklabels([])
            else:
                ji = max(i, j)
                ij = min(i, j)
                temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r] for r in m), v)
                                            for k, v in iteritems(data)])

                keys = list(iterkeys(temp_data))
                for k in keys:
                    value = _reduce_dict(temp_data, k[:2])
                    temp_data[k[:2]] = value
                    del temp_data[k]
                mosaic(temp_data, ax=axes[i, j], axes_label=False,
                       properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle('old males should look bright red,  (plot 4 of 4)')
    #pylab.show()
    pylab.close('all')
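
For a minimal standalone call of the same plotting function (assuming matplotlib and statsmodels are installed; the data here are made up), something like the following should work:

import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic

# same tuple-keyed contingency convention as the test above
data = {('male', 'old'): 10, ('male', 'young'): 20,
        ('female', 'old'): 15, ('female', 'young'): 5}
mosaic(data, gap=0.05, title='toy mosaic')
plt.close('all')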
Example #47
def handle_formula_data(Y, X, formula, depth=0):
    """
    Returns endog, exog, and the model specification from arrays and formula

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by importing formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, and the
        formula object

    Returns
    -------
    endog : array-like
        Should preserve the input type of Y, X
    exog : array-like
        Should preserve the input type of Y, X. Could be None.
    """
    # rough attempt to handle other formula objects
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    if X is not None:
        return dmatrices(formula, (Y, X), 2, return_type='dataframe')
    else:
        return dmatrices(formula, Y, 2, return_type='dataframe')
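
Both branches ultimately delegate to patsy's dmatrices. A minimal sketch of that call with hypothetical data:

import pandas as pd
from patsy import dmatrices

df = pd.DataFrame({'y': [1., 2., 3., 4.], 'x': [0., 1., 4., 9.]})
endog, exog = dmatrices('y ~ x', df, return_type='dataframe')
print(exog.columns.tolist())  # ['Intercept', 'x']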
Example #48
def _hierarchical_split(count_dict, horizontal=True, gap=0.05):
    """
    Split a square in a hierarchical way given a contingency table.

    Hierarchically split the unit square in alternate directions
    in proportion to the subdivision contained in the contingency table
    count_dict.  This is the function that actually performs the tiling
    for the creation of the mosaic plot.  If the gap array has been specified
    it will insert a corresponding amount of space (proportional to the
    unit length), while retaining the proportionality of the tiles.

    Parameters
    ----------
    count_dict : dict
        Dictionary containing the contingency table.
        Each category should contain a non-negative number
        with a tuple as index.  It expects all the combinations
        of keys to be represented; if that is not true, the missing
        values are automatically considered as 0
    horizontal : bool
        The starting direction of the split (by default along
        the horizontal axis)
    gap : float or array of floats
        The list of gaps to be applied on each subdivision.
        If the length of the given array is less than the number
        of subcategories (or if it's a single number) it will be
        extended with exponentially decreasing gaps

    Returns
    -------
    base_rect : dict
        A dictionary containing the result of the split.
        To each key is associated a 4-tuple of coordinates
        that are required to create the corresponding rectangle:

            0 - x position of the lower left corner
            1 - y position of the lower left corner
            2 - width of the rectangle
            3 - height of the rectangle
    """
    # this is the unit square that we are going to divide
    base_rect = OrderedDict([(tuple(), (0, 0, 1, 1))])
    # get the list of each possible value for each level
    categories_levels = _categories_level(list(iterkeys(count_dict)))
    L = len(categories_levels)

    # recreate the gaps vector starting from an int
    if not np.iterable(gap):
        gap = [gap / 1.5**idx for idx in range(L)]
    # extend if it's too short
    if len(gap) < L:
        last = gap[-1]
        gap = list(gap) + [last / 1.5**idx for idx in range(L)]
    # trim if it's too long
    gap = gap[:L]
    # put the count dictionary in order for the keys
    # this will allow some code simplification
    count_ordered = OrderedDict([(k, count_dict[k])
                                 for k in list(product(*categories_levels))])
    for cat_idx, cat_enum in enumerate(categories_levels):
        # get the partial key up to the actual level
        base_keys = list(product(*categories_levels[:cat_idx]))
        for key in base_keys:
            # for each partial key and each value calculate how many
            # observations we have in the counting dictionary
            part_count = [
                _reduce_dict(count_ordered, key + (partial, ))
                for partial in cat_enum
            ]
            # reduce the gap for subsequent levels
            new_gap = gap[cat_idx]
            # split the given subkeys in the rectangle dictionary
            base_rect = _key_splitting(base_rect, cat_enum, part_count, key,
                                       horizontal, new_gap)
        horizontal = not horizontal
    return base_rect
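
The gap handling above is easy to verify in isolation: a scalar gap is expanded into an exponentially decreasing vector, one entry per category level.

L = 4
gap = 0.05
gaps = [gap / 1.5 ** idx for idx in range(L)]
print(gaps)  # [0.05, 0.0333..., 0.0222..., 0.0148...]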
Example #49
    def __init__(self, y, design, model_type=r.lm, **kwds):
        ''' Set up and estimate R model with data and design '''
        r.library('MASS') # still needs to be in test, but also here for
                          # logical tests at the end not to show an error
        self.y = np.array(y)
        self.design = np.array(design)
        self.model_type = model_type
        self._design_cols = ['x.%d' % (i+1) for i in range(
            self.design.shape[1])]
        # Note the '-1' for no intercept - this is included in the design
        self.formula = r('y ~ %s-1' % '+'.join(self._design_cols))
        self.frame = r.data_frame(y=y, x=self.design)
        rpy.set_default_mode(rpy.NO_CONVERSION)
        results = self.model_type(self.formula,
                                  data=self.frame, **kwds)
        self.robj = results # keep the Robj model so it can be
                            # used in the tests
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        rsum = r.summary(results)
        self.rsum = rsum
        # Provide compatible interface with scipy models
        self.results = results.as_py()

#        coeffs = self.results['coefficients']
#        self.beta0 = np.array([coeffs[c] for c in self._design_cols])
        self.nobs = len(self.results['residuals'])
        if isinstance(self.results['residuals'], dict):
            self.resid = np.zeros(len(list(iterkeys(self.results['residuals']))))
            for i in iterkeys(self.results['residuals']):
                self.resid[int(i)-1] = self.results['residuals'][i]
        else:
            self.resid = self.results['residuals']
        self.fittedvalues = self.results['fitted.values']
        self.df_resid = self.results['df.residual']
        self.params = rsum['coefficients'][:, 0]
        self.bse = rsum['coefficients'][:, 1]
        self.bt = rsum['coefficients'][:, 2]
        try:
            self.pvalues = rsum['coefficients'][:, 3]
        except Exception:
            pass
        self.rsquared = rsum.setdefault('r.squared', None)
        self.rsquared_adj = rsum.setdefault('adj.r.squared', None)
        self.aic_R = rsum.setdefault('aic', None)
        self.fvalue = rsum.setdefault('fstatistic', None)
        if self.fvalue and isinstance(self.fvalue, dict):
            self.fvalue = self.fvalue.setdefault('value', None) # for wls
        df = rsum.setdefault('df', None)
        if df:  # for RLM, works for other models?
            self.df_model = df[0]-1 # R counts intercept
            self.df_resid = df[1]
        self.bcov_unscaled = rsum.setdefault('cov.unscaled', None)
        self.bcov = rsum.setdefault('cov.scaled', None)
        if 'sigma' in rsum:
            self.scale = rsum['sigma']
        elif 'dispersion' in rsum:
            self.scale = rsum['dispersion']
        else:
            self.scale = None
        self.llf = r.logLik(results)

        if model_type == r.glm:
            self.getglm()
        if model_type == r.rlm:
            self.getrlm()
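
For comparison, a pure-statsmodels fit of the OLS case needs no rpy at all (sm.OLS standing in for R's lm; the data here are hypothetical):

import numpy as np
import statsmodels.api as sm

y = np.array([1., 2., 3., 5., 4.])
design = sm.add_constant(np.arange(5, dtype=float), prepend=False)
res = sm.OLS(y, design).fit()
print(res.params, res.bse, res.df_resid)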
Example #50
    form.termcolumns('C')
  File "C:\Josef\eclipsegworkspace\statsmodels-josef-experimental\scikits\statsmodels\sandbox\formula.py", line 494, in termcolumns
    raise ValueError('term not in formula')
ValueError: term not in formula


'''
print(form.hasterm('C'))
print(form.termcolumns(formula.Term('C')))  #doesn't work with string argument

#Example: use two columns and get contrast

f2 = (form['A'] + form['B'])
print(f2)
print(repr(f2))
list(iterkeys(f2.namespace))  #namespace is still empty
f2.namespace = namespace  #associate data
iterkeys(f2.namespace)
f2.design().shape
contrast.Contrast(formula.Term('A'), f2).matrix
'''
>>> f2 = (form['A']+form['B'])
>>> print f2
<formula: A + B>
>>> print repr(f2)
<statsmodels.sandbox.formula.Formula object at 0x036BAE70>
>>> f2.namespace.keys()   #namespace is still empty
[]
>>> f2.namespace = namespace  #associate data
>>> f2.namespace.keys()
['A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'J']
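
The sandbox formula shown in this session roughly corresponds to building a no-intercept design matrix; a hedged modern equivalent using patsy (hypothetical data):

import numpy as np
from patsy import dmatrix

data = {'A': np.random.rand(10), 'B': np.random.rand(10)}
design = dmatrix('A + B - 1', data)
print(np.asarray(design).shape)  # (10, 2)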
Example #51
    return len(incorrectli) == 0, correctli, incorrectli



if __name__ == '__main__':
    data = np.load(r"E:\Josef\eclipsegworkspace\statsmodels-josef-experimental-030\dist\statsmodels-0.3.0dev_with_Winhelp_a2\statsmodels-0.3.0dev\scikits\statsmodels\tsa\vector_ar\tests\results\vars_results.npz")
    res_var =  HoldIt('var_results')
    for d in data:
        setattr(res_var, d, data[d])
    np.set_printoptions(precision=120, linewidth=100)
    res_var.save(filename='testsave.py', header=True,
                  comment='VAR test data converted from vars_results.npz')

    import testsave

    for d in data:
        print(d)
        correct = np.all(data[d] == getattr(testsave.var_results, d))
        if not correct and not data[d].dtype == np.dtype('object'):
            correct = np.allclose(data[d], getattr(testsave.var_results, d),
                              rtol=1e-16, atol=1e-16)
            if not correct: print("inexact precision")
        if not correct:
            correlem = [np.all(data[d].item()[k] ==
                               getattr(testsave.var_results, d).item()[k])
                        for k in iterkeys(data[d].item())]
            if not all(correlem):
                print(d, "wrong")

    print(res_var.verify())
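
The comparison loop above boils down to a save/load roundtrip check. A self-contained sketch with made-up arrays and a hypothetical file name:

import numpy as np

arrays = {'a': np.arange(3.0), 'b': np.eye(2)}
np.savez('_roundtrip.npz', **arrays)
data = np.load('_roundtrip.npz')
for d in data:
    assert np.allclose(data[d], arrays[d], rtol=1e-16, atol=1e-16)
print('roundtrip ok')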
Example #52
    def summary(self, stats='basic', columns='all', orientation='auto'):
        """
        Return a summary of descriptive statistics.

        Parameters
        ----------
        stats : list or str
            The desired statistics. Accepts 'basic', 'all', or a list.
               'basic' = ('obs', 'mean', 'std', 'min', 'max')
               'all' = ('obs', 'mean', 'std', 'min', 'max', 'ptp', 'var',
                        'mode', 'median', 'skew', 'uss', 'kurtosis',
                        'percentiles')
        columns : list or str
            The columns/variables to report the statistics for; the default
            is 'all'. If an object with named columns is given, you may
            specify the column names.
        """
        #NOTE
        # standard array: Specify column numbers (NEED TO TEST)
        # percentiles currently broken
        # mode requires mode_val and mode_bin separately
        if self._arraytype is None:
            self._array_typer()

        if stats == 'basic':
            stats = ('obs', 'mean', 'std', 'min', 'max')
        elif stats == 'all':
            #stats = self.univariate.keys()
            #dict doesn't keep an order, use full list instead
            stats = ['obs', 'mean', 'std', 'min', 'max', 'ptp', 'var',
                     'mode_val', 'mode_bin', 'median', 'uss', 'skew',
                     'kurtosis', 'percentiles']
        else:
            for astat in stats:
                pass
                #assert astat in self.univariate

        #hack around percentiles multiple output

        #bad naming
        import scipy.stats
        #BUG: the following binds every lambda to the same per=99
        ##perdict = dict(('perc_%2d'%per, [lambda x:
        #       scipy.stats.scoreatpercentile(x, per), None, None])
        ##          for per in (1,5,10,25,50,75,90,95,99))

        def _fun(per):
            return lambda x: scipy.stats.scoreatpercentile(x, per)

        perdict = dict(('perc_%02d' % per, [_fun(per), None, None])
                       for per in (1,5,10,25,50,75,90,95,99))

        if 'percentiles' in stats:
            self.univariate.update(perdict)
            idx = stats.index('percentiles')
            stats[idx:idx+1] = sorted(iterkeys(perdict))



        #JP: this doesn't allow a change in sequence, sequence in stats is
        #ignored
        #this is just an if condition
        if any([aitem[1] for aitem in iteritems(self.univariate) if aitem[0] in
                stats]):
            if columns == 'all':
                self._columns_list = []
                if self._arraytype == 'sctruct':
                    self._columns_list = self.dataset.dtype.names
                    #self._columns_list = [col for col in
                    #                      self.dataset.dtype.names if
                    #        (self._is_dtype_like(col)=='number')]
                else:
                    self._columns_list = lrange(self.dataset.shape[1])
            else:
                self._columns_list = columns
                if self._arraytype == 'sctruct':
                    for col in self._columns_list:
                        assert (col in self.dataset.dtype.names)
                else:
                    assert self._is_dtype_like(self.dataset) == 'number'

            columstypes = self.dataset.dtype
            #TODO: do we need to make sure they dtype is float64 ?
            for astat in stats:
                calc = self.univariate[astat]
                if self._arraytype == 'sctruct':
                    calc[1] = self._columns_list
                    calc[2] = [calc[0](self.dataset[col]) for col in
                            self._columns_list if (self._is_dtype_like(col) ==
                                                      'number')]
                    #calc[2].append([len(np.unique(self.dataset[col])) for col
                    #                in self._columns_list if
                    #                self._is_dtype_like(col)=='string']
                else:
                    calc[1] = ['Col '+str(col) for col in self._columns_list]
                    calc[2] = [calc[0](self.dataset[:,col]) for col in
                               self._columns_list]
            return self.print_summary(stats, orientation=orientation)
        else:
            return self.print_summary(stats, orientation=orientation)
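
The BUG comment above refers to the classic late-binding closure pitfall, which the _fun factory avoids; a standalone demonstration:

funcs_buggy = [lambda x: x + per for per in (1, 5, 10)]
print([f(0) for f in funcs_buggy])  # [10, 10, 10]: every lambda sees the last per

def make(per):
    return lambda x: x + per

funcs_fixed = [make(per) for per in (1, 5, 10)]
print([f(0) for f in funcs_fixed])  # [1, 5, 10]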
Example #53
    )
    res_var = HoldIt('var_results')
    for d in data:
        setattr(res_var, d, data[d])
    np.set_printoptions(precision=120, linewidth=100)
    res_var.save(filename='testsave.py',
                 header=True,
                 comment='VAR test data converted from vars_results.npz')

    import testsave

    for d in data:
        print(d)
        correct = np.all(data[d] == getattr(testsave.var_results, d))
        if not correct and not data[d].dtype == np.dtype('object'):
            correct = np.allclose(data[d],
                                  getattr(testsave.var_results, d),
                                  rtol=1e-16,
                                  atol=1e-16)
            if not correct: print("inexact precision")
        if not correct:
            correlem = [
                np.all(data[d].item()[k] == getattr(testsave.var_results,
                                                    d).item()[k])
                for k in iterkeys(data[d].item())
            ]
            if not all(correlem):
                print(d, "wrong")

    print(res_var.verify())
Example #54
def _create_labels(rects, horizontal, ax, rotation):
    """find the position of the label for each value of each category

    right now it supports only up to the four categories

    ax: the axis on which the label should be applied
    rotation: the rotation list for each side
    """
    categories = _categories_level(list(iterkeys(rects)))
    if len(categories) > 4:
        msg = ("maximum of 4 level supported for axes labeling... and 4"
               "is already a lot of levels, are you sure you need them all?")
        raise ValueError(msg)
    labels = {}
    #keep it fixed as it will be used many times
    items = list(iteritems(rects))
    vertical = not horizontal

    #get the axis ticks and labels locator to put the correct values!
    ax2 = ax.twinx()
    ax3 = ax.twiny()
    #this is the order of execution for horizontal disposition
    ticks_pos = [ax.set_xticks, ax.set_yticks, ax3.set_xticks, ax2.set_yticks]
    ticks_lab = [
        ax.set_xticklabels, ax.set_yticklabels, ax3.set_xticklabels,
        ax2.set_yticklabels
    ]
    #for the vertical one, rotate it by one
    if vertical:
        ticks_pos = ticks_pos[1:] + ticks_pos[:1]
        ticks_lab = ticks_lab[1:] + ticks_lab[:1]
    #clean them
    for pos, lab in zip(ticks_pos, ticks_lab):
        pos([])
        lab([])
    #for each level, for each value in the level, take the mean of all
    #the sublevels that correspond to that partial key
    for level_idx, level in enumerate(categories):
        #this dictionary keeps the labels only for this level
        level_ticks = dict()
        for value in level:
            #which level should it refer to, to get the preceding
            #values of labels? it's rather a tricky question...
            #this is dependent on the side. It's a very crude management
            #but I couldn't think of a more general way...
            if horizontal:
                if level_idx == 3:
                    index_select = [-1, -1, -1]
                else:
                    index_select = [+0, -1, -1]
            else:
                if level_idx == 3:
                    index_select = [+0, -1, +0]
                else:
                    index_select = [-1, -1, -1]
            #now I create the base key name and append the current value
            #It will search on all the rects to find the corresponding one
            #and use them to evaluate the mean position
            basekey = tuple(categories[i][index_select[i]]
                            for i in range(level_idx))
            basekey = basekey + (value, )
            subset = dict(
                (k, v) for k, v in items if basekey == k[:level_idx + 1])
            #now I extract the center of all the tiles and make a weighted
            #mean of all these centers, weighted by the area of the tile
            #this should give me the (more or less) correct position
            #of the center of the category

            vals = list(itervalues(subset))
            W = sum(w * h for (x, y, w, h) in vals)
            x_lab = sum(_get_position(x, w, h, W) for (x, y, w, h) in vals)
            y_lab = sum(_get_position(y, h, w, W) for (x, y, w, h) in vals)
            #now, based on the ordering, select which position to keep
            #needs to be written in a more general form; are 4 levels enough?
            #should give also the horizontal and vertical alignment
            side = (level_idx + vertical) % 4
            level_ticks[value] = y_lab if side % 2 else x_lab
        #now we add the labels of this level to the correct axis

        ticks_pos[level_idx](list(itervalues(level_ticks)))
        ticks_lab[level_idx](list(iterkeys(level_ticks)),
                             rotation=rotation[level_idx])
    return labels
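
The label position is an area-weighted mean of tile centers. Assuming _get_position returns a tile's center coordinate weighted by its share of the total area, the computation reduces to:

rects = [(0.0, 0.0, 0.5, 1.0), (0.5, 0.0, 0.5, 0.5)]  # (x, y, w, h) tiles
W = sum(w * h for (x, y, w, h) in rects)
x_lab = sum((x + w / 2.0) * w * h / W for (x, y, w, h) in rects)
print(x_lab)  # ~0.417, pulled toward the larger left tile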
Example #55
def _hierarchical_split(count_dict, horizontal=True, gap=0.05):
    """
    Split a square in a hierarchical way given a contingency table.

    Hierarchically split the unit square in alternate directions
    in proportion to the subdivision contained in the contingency table
    count_dict.  This is the function that actually performs the tiling
    for the creation of the mosaic plot.  If the gap array has been specified
    it will insert a corresponding amount of space (proportional to the
    unit length), while retaining the proportionality of the tiles.

    Parameters
    ----------
    count_dict : dict
        Dictionary containing the contingency table.
        Each category should contain a non-negative number
        with a tuple as index.  It expects all the combinations
        of keys to be represented; if that is not true, the missing
        values are automatically considered as 0
    horizontal : bool
        The starting direction of the split (by default along
        the horizontal axis)
    gap : float or array of floats
        The list of gaps to be applied on each subdivision.
        If the length of the given array is less than the number
        of subcategories (or if it's a single number) it will be
        extended with exponentially decreasing gaps

    Returns
    -------
    base_rect : dict
        A dictionary containing the result of the split.
        To each key is associated a 4-tuple of coordinates
        that are required to create the corresponding rectangle:

            0 - x position of the lower left corner
            1 - y position of the lower left corner
            2 - width of the rectangle
            3 - height of the rectangle
    """
    # this is the unit square that we are going to divide
    base_rect = OrderedDict([(tuple(), (0, 0, 1, 1))])
    # get the list of each possible value for each level
    categories_levels = _categories_level(list(iterkeys(count_dict)))
    L = len(categories_levels)

    # recreate the gaps vector starting from an int
    if not np.iterable(gap):
        gap = [gap / 1.5 ** idx for idx in range(L)]
    # extend if it's too short
    if len(gap) < L:
        last = gap[-1]
        gap = list(gap) + [last / 1.5 ** idx for idx in range(L)]
    # trim if it's too long
    gap = gap[:L]
    # put the count dictionary in order for the keys
    # this will allow some code simplification
    count_ordered = OrderedDict([(k, count_dict[k])
                        for k in list(product(*categories_levels))])
    for cat_idx, cat_enum in enumerate(categories_levels):
        # get the partial key up to the actual level
        base_keys = list(product(*categories_levels[:cat_idx]))
        for key in base_keys:
            # for each partial key and each value calculate how many
            # observations we have in the counting dictionary
            part_count = [_reduce_dict(count_ordered, key + (partial,))
                            for partial in cat_enum]
            # reduce the gap for subsequent levels
            new_gap = gap[cat_idx]
            # split the given subkeys in the rectangle dictionary
            base_rect = _key_splitting(base_rect, cat_enum, part_count, key,
                                       horizontal, new_gap)
        horizontal = not horizontal
    return base_rect
Example #56
    form.termcolumns('C')
  File "C:\Josef\eclipsegworkspace\statsmodels-josef-experimental\scikits\statsmodels\sandbox\formula.py", line 494, in termcolumns
    raise ValueError('term not in formula')
ValueError: term not in formula


'''
print(form.hasterm('C'))
print(form.termcolumns(formula.Term('C')))  #doesn't work with string argument

#Example: use two columns and get contrast

f2 = (form['A']+form['B'])
print(f2)
print(repr(f2))
list(iterkeys(f2.namespace))   #namespace is still empty
f2.namespace = namespace  #associate data
iterkeys(f2.namespace)
f2.design().shape
contrast.Contrast(formula.Term('A'), f2).matrix

'''
>>> f2 = (form['A']+form['B'])
>>> print f2
<formula: A + B>
>>> print repr(f2)
<statsmodels.sandbox.formula.Formula object at 0x036BAE70>
>>> f2.namespace.keys()   #namespace is still empty
[]
>>> f2.namespace = namespace  #associate data
>>> f2.namespace.keys()
Example #57
from __future__ import print_function
from statsmodels.compat.python import iterkeys
from rpy import r
import numpy as np
import statsmodels.api as sm

examples = [1, 2]

if 1 in examples:
    data = sm.datasets.longley.load(as_pandas=False)
    y, x = data.endog, sm.add_constant(data.exog, prepend=False)
    des_cols = ['x.%d' % (i + 1) for i in range(x.shape[1])]
    formula = r('y~%s-1' % '+'.join(des_cols))
    frame = r.data_frame(y=y, x=x)
    results = r.lm(formula, data=frame)
    print(list(iterkeys(results)))
    print(results['coefficients'])

if 2 in examples:
    data2 = sm.datasets.star98.load(as_pandas=False)
    y2, x2 = data2.endog, sm.add_constant(data2.exog, prepend=False)
    import rpy
    y2 = y2[:, 0] / y2.sum(axis=1)
    des_cols2 = ['x.%d' % (i + 1) for i in range(x2.shape[1])]
    formula2 = r('y~%s-1' % '+'.join(des_cols2))
    frame2 = r.data_frame(y=y2, x=x2)
    results2 = r.glm(formula2, data=frame2, family='binomial')
    params_est = [
        results2['coefficients'][k] for k in sorted(results2['coefficients'])
    ]
    print(params_est)
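
The first rpy example maps directly onto statsmodels' own OLS; a no-rpy equivalent for comparison:

import statsmodels.api as sm

data = sm.datasets.longley.load_pandas()
y, x = data.endog, sm.add_constant(data.exog, prepend=False)
res = sm.OLS(y, x).fit()
print(res.params)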
Example #58
    def summary(self, stats='basic', columns='all', orientation='auto'):
        """
        Return a summary of descriptive statistics.

        Parameters
        ----------
        stats : list or str
            The desired statistics. Accepts 'basic', 'all', or a list.
               'basic' = ('obs', 'mean', 'std', 'min', 'max')
               'all' = ('obs', 'mean', 'std', 'min', 'max', 'ptp', 'var',
                        'mode', 'median', 'skew', 'uss', 'kurtosis',
                        'percentiles')
        columns : list or str
            The columns/variables to report the statistics for; the default
            is 'all'. If an object with named columns is given, you may
            specify the column names.
        """
        #NOTE
        # standard array: Specify column numbers (NEED TO TEST)
        # percentiles currently broken
        # mode requires mode_val and mode_bin separately
        if self._arraytype is None:
            self._array_typer()

        if stats == 'basic':
            stats = ('obs', 'mean', 'std', 'min', 'max')
        elif stats == 'all':
            #stats = self.univariate.keys()
            #dict doesn't keep an order, use full list instead
            stats = [
                'obs', 'mean', 'std', 'min', 'max', 'ptp', 'var', 'mode_val',
                'mode_bin', 'median', 'uss', 'skew', 'kurtosis', 'percentiles'
            ]
        else:
            for astat in stats:
                pass
                #assert astat in self.univariate

        #hack around percentiles multiple output

        #bad naming
        import scipy.stats

        #BUG: the following binds every lambda to the same per=99
        ##perdict = dict(('perc_%2d'%per, [lambda x:
        #      scipy.stats.scoreatpercentile(x, per), None, None])
        ##          for per in (1,5,10,25,50,75,90,95,99))

        def _fun(per):
            return lambda x: scipy.stats.scoreatpercentile(x, per)

        perdict = dict(('perc_%02d' % per, [_fun(per), None, None])
                       for per in (1, 5, 10, 25, 50, 75, 90, 95, 99))

        if 'percentiles' in stats:
            self.univariate.update(perdict)
            idx = stats.index('percentiles')
            stats[idx:idx + 1] = sorted(iterkeys(perdict))

        #JP: this doesn't allow a change in sequence, sequence in stats is
        #ignored
        #this is just an if condition
        if any([
                aitem[1] for aitem in iteritems(self.univariate)
                if aitem[0] in stats
        ]):
            if columns == 'all':
                self._columns_list = []
                if self._arraytype == 'sctruct':
                    self._columns_list = self.dataset.dtype.names
                    #self._columns_list = [col for col in
                    #                      self.dataset.dtype.names if
                    #(self._is_dtype_like(col)=='number')]
                else:
                    self._columns_list = lrange(self.dataset.shape[1])
            else:
                self._columns_list = columns
                if self._arraytype == 'sctruct':
                    for col in self._columns_list:
                        assert (col in self.dataset.dtype.names)
                else:
                    assert self._is_dtype_like(self.dataset) == 'number'

            columstypes = self.dataset.dtype
            #TODO: do we need to make sure they dtype is float64 ?
            for astat in stats:
                calc = self.univariate[astat]
                if self._arraytype == 'sctruct':
                    calc[1] = self._columns_list
                    calc[2] = [
                        calc[0](self.dataset[col])
                        for col in self._columns_list
                        if (self._is_dtype_like(col) == 'number')
                    ]
                    #calc[2].append([len(np.unique(self.dataset[col])) for col
                    #in self._columns_list if
                    #self._is_dtype_like(col)=='string']
                else:
                    calc[1] = ['Col ' + str(col) for col in self._columns_list]
                    calc[2] = [
                        calc[0](self.dataset[:, col])
                        for col in self._columns_list
                    ]
            return self.print_summary(stats, orientation=orientation)
        else:
            return self.print_summary(stats, orientation=orientation)
Example #59
def _create_labels(rects, horizontal, ax, rotation):
    """find the position of the label for each value of each category

    right now it supports only up to the four categories

    ax: the axis on which the label should be applied
    rotation: the rotation list for each side
    """
    categories = _categories_level(list(iterkeys(rects)))
    if len(categories) > 4:
        msg = ("maximum of 4 level supported for axes labeling..and 4"
               "is alreay a lot of level, are you sure you need them all?")
        raise NotImplementedError(msg)
    labels = {}
    #keep it fixed as it will be used many times
    items = list(iteritems(rects))
    vertical = not horizontal

    #get the axis ticks and labels locator to put the correct values!
    ax2 = ax.twinx()
    ax3 = ax.twiny()
    #this is the order of execution for horizontal disposition
    ticks_pos = [ax.set_xticks, ax.set_yticks, ax3.set_xticks, ax2.set_yticks]
    ticks_lab = [ax.set_xticklabels, ax.set_yticklabels,
                 ax3.set_xticklabels, ax2.set_yticklabels]
    #for the vertical one, rotate it by one
    if vertical:
        ticks_pos = ticks_pos[1:] + ticks_pos[:1]
        ticks_lab = ticks_lab[1:] + ticks_lab[:1]
    #clean them
    for pos, lab in zip(ticks_pos, ticks_lab):
        pos([])
        lab([])
    #for each level, for each value in the level, take the mean of all
    #the sublevels that correspond to that partial key
    for level_idx, level in enumerate(categories):
        #this dictionary keeps the labels only for this level
        level_ticks = dict()
        for value in level:
            #which level should it refer to, to get the preceding
            #values of labels? it's rather a tricky question...
            #this is dependent on the side. It's a very crude management
            #but I couldn't think of a more general way...
            if horizontal:
                if level_idx == 3:
                    index_select = [-1, -1, -1]
                else:
                    index_select = [+0, -1, -1]
            else:
                if level_idx == 3:
                    index_select = [+0, -1, +0]
                else:
                    index_select = [-1, -1, -1]
            #now I create the base key name and append the current value
            #It will search on all the rects to find the corresponding one
            #and use them to evaluate the mean position
            basekey = tuple(categories[i][index_select[i]]
                            for i in range(level_idx))
            basekey = basekey + (value,)
            subset = dict((k, v) for k, v in items
                          if basekey == k[:level_idx + 1])
            #now I extract the center of all the tiles and make a weighted
            #mean of all these centers, weighted by the area of the tile
            #this should give me the (more or less) correct position
            #of the center of the category

            vals = list(itervalues(subset))
            W = sum(w * h for (x, y, w, h) in vals)
            x_lab = sum(_get_position(x, w, h, W) for (x, y, w, h) in vals)
            y_lab = sum(_get_position(y, h, w, W) for (x, y, w, h) in vals)
            #now, based on the ordering, select which position to keep
            #needs to be written in a more general form; are 4 levels enough?
            #should give also the horizontal and vertical alignment
            side = (level_idx + vertical) % 4
            level_ticks[value] = y_lab if side % 2 else x_lab
        #now we add the labels of this level to the correct axis

        ticks_pos[level_idx](list(itervalues(level_ticks)))
        ticks_lab[level_idx](list(iterkeys(level_ticks)),
                             rotation=rotation[level_idx])
    return labels