示例#1
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env):
    """Try to turn ``formula_like`` into design matrix builders.

    :arg formula_like: One of the following:

      * a ``DesignMatrixBuilder``: it is used as the second element of the
        result, paired with a builder made from an empty termlist;
      * a 2-tuple of ``DesignMatrixBuilder`` objects: returned unchanged;
      * an object with a ``__patsy_get_model_desc__`` method: the method is
        called with ``eval_env`` and must return a ``ModelDesc``;
      * a formula string: parsed with ``ModelDesc.from_formula``;
      * a ``ModelDesc``: its two termlists are handed to
        ``design_matrix_builders``.
    :arg data_iter_maker: Passed through to ``design_matrix_builders``.
    :arg eval_env: Passed to ``__patsy_get_model_desc__``; for formula
      strings it is first normalized with ``_get_env``.
    :returns: The builders for the left- and right-hand sides, or ``None``
      when ``formula_like`` is none of the recognized kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignMatrixBuilder):
        return (design_matrix_builders([[]], data_iter_maker)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
        and len(formula_like) == 2
        and isinstance(formula_like[0], DesignMatrixBuilder)
        and isinstance(formula_like[1], DesignMatrixBuilder)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Remember the object that supplied the description: formula_like is
        # rebound below, and the error message must name the supplier rather
        # than the bad value it returned.
        desc_provider = formula_like
        formula_like = desc_provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (desc_provider,))
        # fallthrough
    if isinstance(formula_like, basestring):
        eval_env = _get_env(eval_env)
        formula_like = ModelDesc.from_formula(formula_like, eval_env)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker)
    else:
        return None
示例#2
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env,
                       NA_action):
    """Try to turn ``formula_like`` into a pair of design infos.

    :arg formula_like: One of the following:

      * a ``DesignInfo``: it is used as the second element of the result,
        paired with one built from an empty termlist;
      * a 2-tuple of ``DesignInfo`` objects: returned unchanged;
      * an object with a ``__patsy_get_model_desc__`` method: the method is
        called with ``eval_env`` and must return a ``ModelDesc``;
      * a ``str`` formula: parsed with ``ModelDesc.from_formula``;
      * a ``ModelDesc``: its two termlists are handed to
        ``design_matrix_builders``.
    :arg data_iter_maker: Passed through to ``design_matrix_builders``.
    :arg eval_env: Passed to ``__patsy_get_model_desc__`` and to
      ``design_matrix_builders``. It is asserted to be an
      ``EvalEnvironment`` on the ``ModelDesc`` path.
    :arg NA_action: Passed through to ``design_matrix_builders``.
    :returns: The result for the left- and right-hand sides, or ``None``
      when ``formula_like`` is none of the recognized kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignInfo):
        return (design_matrix_builders([[]], data_iter_maker, eval_env, NA_action)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
        and len(formula_like) == 2
        and isinstance(formula_like[0], DesignInfo)
        and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Remember the object that supplied the description: formula_like is
        # rebound below, and the error message must name the supplier rather
        # than the bad value it returned.
        desc_provider = formula_like
        formula_like = desc_provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (desc_provider,))
        # fallthrough
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker,
                                      eval_env,
                                      NA_action)
    else:
        return None
示例#3
0
文件: mixed.py 项目: turbach/mixed
def eval_bar(evaluator, tree):
    """Evaluate a bar-operator AST node into a random-effects term.

    ``tree`` must have exactly two arguments: the expression to the left of
    the bar and the grouping factor to its right. Each side is converted to
    its own model description, the intercept is removed from the grouping
    side, and both are combined into a single ``RandomEffectsTerm`` that is
    returned wrapped in an ``IntermediateExpr``.

    ``evaluator`` is accepted for the evaluation-function signature but is
    not used here.
    """
    assert len(tree.args) == 2
    left_node, right_node = tree.args

    def _one_sided_desc(node):
        # Wrap the node in a unary "~" so it parses as a formula with only a
        # right-hand side, keeping the node's original origin.
        wrapped = ParseNode("~", None, [node], node.origin)
        return ModelDesc.from_formula(wrapped)

    # Left of the bar: the expression part of the random effect.
    effects_desc = _one_sided_desc(left_node)

    # Right of the bar: the grouping factor, with the intercept stripped.
    grouping_desc = _one_sided_desc(right_node)
    grouping_desc.rhs_termlist.remove(INTERCEPT)

    random_term = RandomEffectsTerm(expr=effects_desc, factor=grouping_desc)

    # Hand the term back as an intermediate expression.
    return IntermediateExpr(False, None, False, [random_term])
示例#4
0
def parse_formula(form):
    """Split a patsy formula into response, regressors and fixed effects.

    The formula is parsed with ``ModelDesc.from_formula``. Every term is run
    through ``parse_term``, and right-hand-side terms are labelled with
    ``classify_term``.

    Returns a 4-tuple ``(y, x, fe, intercept)``:

    * ``y``: the first left-hand-side term, squeezed;
    * ``x``: squeezed right-hand-side terms classified ``'continuous'``;
    * ``fe``: right-hand-side terms classified ``'categorical'``, passed
      through ``strip_cat`` and then squeezed;
    * ``intercept``: ``True`` if any right-hand-side term is classified
      ``'intercept'``.
    """
    # Let patsy do the actual formula parsing.
    model = ModelDesc.from_formula(form)

    # Convert both sides to this module's term representation.
    lhs_parsed = [parse_term(term) for term in model.lhs_termlist]
    rhs_parsed = [parse_term(term) for term in model.rhs_termlist]
    rhs_kinds = [classify_term(term) for term in rhs_parsed]
    labelled = list(zip(rhs_parsed, rhs_kinds))

    # Response: only the first left-hand-side term is used.
    y = squeeze_term(lhs_parsed[0])

    # Continuous regressors first, then categorical fixed effects.
    x = []
    for term, kind in labelled:
        if kind == 'continuous':
            x.append(squeeze_term(term))

    fe = []
    for term, kind in labelled:
        if kind == 'categorical':
            fe.append(squeeze_term(strip_cat(term)))

    intercept = any(kind == 'intercept' for kind in rhs_kinds)

    return y, x, fe, intercept
示例#5
0
def parse_formula(form):
    """Parse a patsy formula into this module's term objects.

    patsy is imported lazily so that it stays an optional dependency. If it
    cannot be imported, a hint is printed and ``None`` is returned.

    :arg form: The formula, passed to ``ModelDesc.from_formula``.
    :returns: ``(y_terms, x_terms)`` when the formula has a left-hand side,
      otherwise just ``x_terms``. ``x_terms`` is a ``Formula`` built from the
      parsed right-hand-side terms; ``y_terms`` is ``parse_factor`` applied
      to the first factor of the first left-hand-side term. ``None`` is
      returned when patsy is not installed.
    """
    try:
        from patsy.desc import ModelDesc
    except ImportError:
        # Only a missing patsy is expected here; any other failure should
        # surface instead of being reported as "not installed".
        print('Please install patsy for formula parsing')
        return

    # use patsy for formula parse
    desc = ModelDesc.from_formula(form)
    lhs, rhs = desc.lhs_termlist, desc.rhs_termlist

    # convert to string lists
    x_terms = Formula(*[parse_term(t) for t in rhs])
    if len(lhs) > 0:
        # Only the first factor of the first left-hand-side term is used.
        y_terms = parse_factor(lhs[0].factors[0])
        return y_terms, x_terms
    else:
        return x_terms
示例#6
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env,
                       NA_action):
    """Try to turn ``formula_like`` into a pair of design infos.

    :arg formula_like: One of the following:

      * a ``DesignInfo``: it is used as the second element of the result,
        paired with one built from an empty termlist;
      * a 2-tuple of ``DesignInfo`` objects: returned unchanged;
      * an object with a ``__patsy_get_model_desc__`` method: the method is
        called with ``eval_env`` and must return a ``ModelDesc``;
      * a formula string: on Python 2 a ``unicode`` formula is first encoded
        to ASCII, then parsed with ``ModelDesc.from_formula``;
      * a ``ModelDesc``: its two termlists are handed to
        ``design_matrix_builders``.
    :arg data_iter_maker: Passed through to ``design_matrix_builders``.
    :arg eval_env: Passed to ``__patsy_get_model_desc__`` and to
      ``design_matrix_builders``. It is asserted to be an
      ``EvalEnvironment`` on the ``ModelDesc`` path.
    :arg NA_action: Passed through to ``design_matrix_builders``.
    :returns: The result for the left- and right-hand sides, or ``None``
      when ``formula_like`` is none of the recognized kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``, or if a Python 2 ``unicode`` formula
      contains non-ASCII characters.
    """
    if isinstance(formula_like, DesignInfo):
        return (design_matrix_builders([[]], data_iter_maker, eval_env, NA_action)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
        and len(formula_like) == 2
        and isinstance(formula_like[0], DesignInfo)
        and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Remember the object that supplied the description: formula_like is
        # rebound below, and the error message must name the supplier rather
        # than the bad value it returned.
        desc_provider = formula_like
        formula_like = desc_provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (desc_provider,))
        # fallthrough
    if not six.PY3 and isinstance(formula_like, unicode):
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker,
                                      eval_env,
                                      NA_action)
    else:
        return None
示例#7
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to turn ``formula_like`` into design matrix builders.

    :arg formula_like: One of the following:

      * a ``DesignMatrixBuilder``: it is used as the second element of the
        result, paired with a builder made from an empty termlist;
      * a 2-tuple of ``DesignMatrixBuilder`` objects: returned unchanged;
      * an object with a ``__patsy_get_model_desc__`` method: the method is
        called with ``eval_env`` and must return a ``ModelDesc``;
      * a formula string: parsed with ``ModelDesc.from_formula`` using
        ``eval_env``, which is asserted to be an ``EvalEnvironment``;
      * a ``ModelDesc``: its two termlists are handed to
        ``design_matrix_builders``.
    :arg data_iter_maker: Passed through to ``design_matrix_builders``.
    :arg eval_env: Passed to ``__patsy_get_model_desc__`` and used when
      parsing formula strings.
    :arg NA_action: Passed through to ``design_matrix_builders``.
    :returns: The builders for the left- and right-hand sides, or ``None``
      when ``formula_like`` is none of the recognized kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignMatrixBuilder):
        return (design_matrix_builders([[]], data_iter_maker,
                                       NA_action)[0], formula_like)
    if (isinstance(formula_like, tuple) and len(formula_like) == 2
            and isinstance(formula_like[0], DesignMatrixBuilder)
            and isinstance(formula_like[1], DesignMatrixBuilder)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Remember the object that supplied the description: formula_like is
        # rebound below, and the error message must name the supplier rather
        # than the bad value it returned.
        desc_provider = formula_like
        formula_like = desc_provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__" %
                             (desc_provider, ))
        # fallthrough
    if isinstance(formula_like, basestring):
        assert isinstance(eval_env, EvalEnvironment)
        formula_like = ModelDesc.from_formula(formula_like, eval_env)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker, NA_action)
    else:
        return None
示例#8
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to turn ``formula_like`` into a pair of design infos.

    :arg formula_like: One of the following:

      * a ``DesignInfo``: it is used as the second element of the result,
        paired with one built from an empty termlist;
      * a 2-tuple of ``DesignInfo`` objects: returned unchanged;
      * an object with a ``__patsy_get_model_desc__`` method: the method is
        called with ``eval_env`` and must return a ``ModelDesc``;
      * a formula string: on Python 2 a ``unicode`` formula is first encoded
        to ASCII, then parsed with ``ModelDesc.from_formula``;
      * a ``ModelDesc``: its two termlists are handed to
        ``design_matrix_builders``.
    :arg data_iter_maker: Passed through to ``design_matrix_builders``.
    :arg eval_env: Passed to ``__patsy_get_model_desc__`` and to
      ``design_matrix_builders``. It is asserted to be an
      ``EvalEnvironment`` on the ``ModelDesc`` path.
    :arg NA_action: Passed through to ``design_matrix_builders``.
    :returns: The result for the left- and right-hand sides, or ``None``
      when ``formula_like`` is none of the recognized kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``, or if a Python 2 ``unicode`` formula
      contains non-ASCII characters.
    """
    if isinstance(formula_like, DesignInfo):
        return (design_matrix_builders([[]], data_iter_maker, eval_env,
                                       NA_action)[0], formula_like)
    if (isinstance(formula_like, tuple) and len(formula_like) == 2
            and isinstance(formula_like[0], DesignInfo)
            and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Remember the object that supplied the description: formula_like is
        # rebound below, and the error message must name the supplier rather
        # than the bad value it returned.
        desc_provider = formula_like
        formula_like = desc_provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__" %
                             (desc_provider, ))
        # fallthrough
    if not six.PY3 and isinstance(formula_like, unicode):
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker, eval_env, NA_action)
    else:
        return None
示例#9
0
    def subset(self, which_terms):
        """Return a new :class:`DesignInfo` restricted to some of this
        object's terms.

        Given a ``design_info`` with terms ``x``, ``y`` and ``z``::

          design_info2 = design_info.subset(["x", "z"])

        yields a DesignInfo that builds design matrices containing only the
        columns for ``x`` and ``z``. Assuming each term produces exactly one
        column, these two expressions then generally give the same result::

          build_design_matrix([design_info], data)[0][:, [0, 2]]
          build_design_matrix([design_info2], data)[0]

        The important difference is that the second form does not need
        ``data`` to provide any values for ``y``. That is handy when
        predicting from part of a model, a situation in which R usually
        makes you supply dummy values for ``y``.

        When the terms are given as a formula, keep in mind that, as with any
        formula, an intercept is included by default; write ``0`` or ``-1``
        in the formula to leave it out.

        The method can also reorder the terms of a design matrix, should
        that ever be wanted.

        The result is generally *not* the same as building a new model from
        scratch. Consider::

            design1 = dmatrix("1 + C(a)", data)
            design2 = design1.subset("0 + C(a)")
            design3 = dmatrix("0 + C(a)", data)

        Both ``design2`` and ``design3`` encode ``C(a)`` without an intercept
        term, but ``design3`` uses a full-rank encoding for ``C(a)`` while
        ``design2`` keeps the reduced-rank encoding of ``design1``.

        :arg which_terms: The terms to keep in the new :class:`DesignInfo`.
          A string is parsed as a formula and the names of the resulting
          terms are used. A list may mix term names (as strings) and
          :class:`Term` objects.

        .. versionadded:: 0.2.0
           New method on the class DesignMatrixBuilder.

        .. versionchanged:: 0.4.0
           Moved from DesignMatrixBuilder to DesignInfo, as part of the
           removal of DesignMatrixBuilder.

        """
        if isinstance(which_terms, str):
            parsed = ModelDesc.from_formula(which_terms)
            if parsed.lhs_termlist:
                raise PatsyError("right-hand-side-only formula required")
            which_terms = [t.name() for t in parsed.rhs_termlist]

        if self.term_codings is None:
            # Minimal DesignInfo: only column names and term-name slices are
            # available. An unknown name simply escapes as a KeyError.
            kept_columns = []
            for term_name in which_terms:
                kept_columns.extend(
                    self.column_names[self.term_name_slices[term_name]])
            return DesignInfo(kept_columns)

        # Full DesignInfo: map names back to Term objects so that callers may
        # pass either form.
        terms_by_name = dict((known.name(), known)
                             for known in self.term_codings)

        kept_columns = []
        kept_factor_infos = {}
        kept_term_codings = OrderedDict()
        for requested in which_terms:
            term = terms_by_name.get(requested, requested)
            # An unknown term simply escapes as a KeyError.
            column_slice = self.term_slices[term]
            kept_columns.extend(self.column_names[column_slice])
            for factor in term.factors:
                kept_factor_infos[factor] = self.factor_infos[factor]
            kept_term_codings[term] = self.term_codings[term]
        return DesignInfo(kept_columns,
                          factor_infos=kept_factor_infos,
                          term_codings=kept_term_codings)
示例#10
0
    def subset(self, which_terms):
        """Return a new :class:`DesignInfo` restricted to some of this
        object's terms.

        Given a ``design_info`` with terms ``x``, ``y`` and ``z``::

          design_info2 = design_info.subset(["x", "z"])

        yields a DesignInfo that builds design matrices containing only the
        columns for ``x`` and ``z``. Assuming each term produces exactly one
        column, these two expressions then generally give the same result::

          build_design_matrix([design_info], data)[0][:, [0, 2]]
          build_design_matrix([design_info2], data)[0]

        The important difference is that the second form does not need
        ``data`` to provide any values for ``y``. That is handy when
        predicting from part of a model, a situation in which R usually
        makes you supply dummy values for ``y``.

        When the terms are given as a formula, keep in mind that, as with any
        formula, an intercept is included by default; write ``0`` or ``-1``
        in the formula to leave it out.

        The method can also reorder the terms of a design matrix, should
        that ever be wanted.

        The result is generally *not* the same as building a new model from
        scratch. Consider::

            design1 = dmatrix("1 + C(a)", data)
            design2 = design1.subset("0 + C(a)")
            design3 = dmatrix("0 + C(a)", data)

        Both ``design2`` and ``design3`` encode ``C(a)`` without an intercept
        term, but ``design3`` uses a full-rank encoding for ``C(a)`` while
        ``design2`` keeps the reduced-rank encoding of ``design1``.

        :arg which_terms: The terms to keep in the new :class:`DesignInfo`.
          A string is parsed as a formula and the names of the resulting
          terms are used. A list may mix term names (as strings) and
          :class:`Term` objects.

        .. versionadded:: 0.2.0
           New method on the class DesignMatrixBuilder.

        .. versionchanged:: 0.4.0
           Moved from DesignMatrixBuilder to DesignInfo, as part of the
           removal of DesignMatrixBuilder.

        """
        if isinstance(which_terms, str):
            formula_desc = ModelDesc.from_formula(which_terms)
            if formula_desc.lhs_termlist:
                raise PatsyError("right-hand-side-only formula required")
            which_terms = [rhs_term.name()
                           for rhs_term in formula_desc.rhs_termlist]

        if self.term_codings is None:
            # Minimal DesignInfo: only column names and term-name slices are
            # available. An unknown name simply escapes as a KeyError.
            selected_names = []
            for wanted_name in which_terms:
                selected_names.extend(
                    self.column_names[self.term_name_slices[wanted_name]])
            return DesignInfo(selected_names)

        # Full DesignInfo: map names back to Term objects so that callers may
        # pass either form.
        name_to_term = dict((coded.name(), coded)
                            for coded in self.term_codings)

        selected_names = []
        selected_factor_infos = {}
        selected_codings = OrderedDict()
        for wanted in which_terms:
            term = name_to_term.get(wanted, wanted)
            # An unknown term simply escapes as a KeyError.
            span = self.term_slices[term]
            selected_names.extend(self.column_names[span])
            for factor in term.factors:
                selected_factor_infos[factor] = self.factor_infos[factor]
            selected_codings[term] = self.term_codings[term]
        return DesignInfo(selected_names,
                          factor_infos=selected_factor_infos,
                          term_codings=selected_codings)
示例#11
0
    def subset(self, which_terms):
        """Return a new :class:`DesignMatrixBuilder` restricted to some of
        this builder's terms.

        Given a `builder` with terms `x`, `y` and `z`::

          builder2 = builder.subset(["x", "z"])

        yields a builder whose design matrices contain only the columns for
        `x` and `z`. Assuming each term produces exactly one column, these
        two expressions then generally give the same result::

          build_design_matrix([builder], data)[0][:, [0, 2]]
          build_design_matrix([builder2], data)[0]

        The important difference is that the second form does not need
        `data` to provide any values for `y`. That is handy when predicting
        from part of a model, a situation in which R usually makes you supply
        dummy values for `y`.

        When the terms are given as a formula, keep in mind that, as with any
        formula, an intercept is included by default; write `0` or `-1` in
        the formula to leave it out.

        :arg which_terms: The terms to keep in the new
          :class:`DesignMatrixBuilder`. A string is parsed as a formula and
          the names of the resulting terms are used. A list may mix term
          names (as strings) and :class:`Term` objects.
        :raises PatsyError: If a formula with a left-hand side is given, or
          if a requested term is not part of this builder.

        .. versionadded:: 0.2.0
        """
        evaluator_for_factor = dict((ev.factor, ev)
                                    for ev in self._evaluators)
        info = self.design_info
        terms_by_name = dict(zip(info.term_names, info.terms))
        if isinstance(which_terms, str):
            # The EvalEnvironment below is only a placeholder needed to build
            # the ModelDesc. Matching is done on term *names*, which the
            # environment does not affect; comparing Term objects with ==
            # would compare factors and, through them, EvalEnvironments.
            placeholder_env = EvalEnvironment({})
            parsed = ModelDesc.from_formula(which_terms, placeholder_env)
            if parsed.lhs_termlist:
                raise PatsyError("right-hand-side-only formula required")
            which_terms = [t.name() for t in parsed.rhs_termlist]
        kept_terms = []
        kept_evaluators = set()
        kept_column_builders = {}
        for requested in which_terms:
            if not isinstance(requested, six.string_types):
                # Already a Term object.
                term = requested
            elif requested in terms_by_name:
                term = terms_by_name[requested]
            else:
                raise PatsyError("requested term %r not found in "
                                 "this DesignMatrixBuilder"
                                 % (requested,))
            if term not in self._termlist:
                raise PatsyError("requested term '%s' not found in this "
                                 "DesignMatrixBuilder" % (term,))
            kept_evaluators.update(evaluator_for_factor[factor]
                                   for factor in term.factors)
            kept_terms.append(term)
            kept_column_builders[term] = self._term_to_column_builders[term]
        return DesignMatrixBuilder(kept_terms,
                                   kept_evaluators,
                                   kept_column_builders)
示例#12
0
文件: build.py 项目: joaonatali/patsy
    def subset(self, which_terms):
        """Return a new :class:`DesignMatrixBuilder` restricted to some of
        this builder's terms.

        Given a `builder` with terms `x`, `y` and `z`::

          builder2 = builder.subset(["x", "z"])

        yields a builder whose design matrices contain only the columns for
        `x` and `z`. Assuming each term produces exactly one column, these
        two expressions then generally give the same result::

          build_design_matrix([builder], data)[0][:, [0, 2]]
          build_design_matrix([builder2], data)[0]

        The important difference is that the second form does not need
        `data` to provide any values for `y`. That is handy when predicting
        from part of a model, a situation in which R usually makes you supply
        dummy values for `y`.

        When the terms are given as a formula, keep in mind that, as with any
        formula, an intercept is included by default; write `0` or `-1` in
        the formula to leave it out.

        :arg which_terms: The terms to keep in the new
          :class:`DesignMatrixBuilder`. A string is parsed as a formula and
          the names of the resulting terms are used. A list may mix term
          names (as strings) and :class:`Term` objects.
        :raises PatsyError: If a formula with a left-hand side is given, or
          if a requested term is not part of this builder.

        .. versionadded:: 0.2.0
        """
        factor_lookup = dict((item.factor, item) for item in self._evaluators)
        info = self.design_info
        name_lookup = dict(zip(info.term_names, info.terms))
        if isinstance(which_terms, basestring):
            # The EvalEnvironment below is only a placeholder needed to build
            # the ModelDesc. Matching is done on term *names*, which the
            # environment does not affect; comparing Term objects with ==
            # would compare factors and, through them, EvalEnvironments.
            dummy_env = EvalEnvironment({})
            formula_desc = ModelDesc.from_formula(which_terms, dummy_env)
            if formula_desc.lhs_termlist:
                raise PatsyError("right-hand-side-only formula required")
            which_terms = [rhs_term.name()
                           for rhs_term in formula_desc.rhs_termlist]
        selected_terms = []
        selected_evaluators = set()
        selected_builders = {}
        for wanted in which_terms:
            if not isinstance(wanted, basestring):
                # Already a Term object.
                term = wanted
            elif wanted in name_lookup:
                term = name_lookup[wanted]
            else:
                raise PatsyError("requested term %r not found in "
                                 "this DesignMatrixBuilder" %
                                 (wanted, ))
            if term not in self._termlist:
                raise PatsyError("requested term '%s' not found in this "
                                 "DesignMatrixBuilder" % (term, ))
            selected_evaluators.update(factor_lookup[factor]
                                       for factor in term.factors)
            selected_terms.append(term)
            selected_builders[term] = self._term_to_column_builders[term]
        return DesignMatrixBuilder(selected_terms, selected_evaluators,
                                   selected_builders)