def _try_incr_builders(formula_like, data_iter_maker, eval_env):
    """Try to interpret ``formula_like`` as a pair of design matrix builders.

    The input is checked against the following kinds, in this order:

    * a single ``DesignMatrixBuilder`` -- returned as the second element of
      a pair whose first element is built from an empty term list;
    * a 2-tuple of ``DesignMatrixBuilder`` objects -- returned unchanged;
    * an object with a ``__patsy_get_model_desc__`` method -- the method is
      called with ``eval_env`` and must return a ``ModelDesc``;
    * a formula string -- parsed with ``ModelDesc.from_formula``;
    * a ``ModelDesc`` -- its left- and right-hand term lists are handed to
      ``design_matrix_builders``.

    :arg formula_like: The object to interpret.
    :arg data_iter_maker: Passed through to ``design_matrix_builders``.
    :arg eval_env: Passed to ``__patsy_get_model_desc__``. For formula
      strings it is first passed through ``_get_env`` and the result is
      given to ``ModelDesc.from_formula``.
    :returns: The builders as described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignMatrixBuilder):
        return (design_matrix_builders([[]], data_iter_maker)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignMatrixBuilder)
            and isinstance(formula_like[1], DesignMatrixBuilder)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the provider: the error message is meant to name
        # the object whose method misbehaved, not the value it returned.
        provider = formula_like
        formula_like = provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (provider,))
        # fallthrough
    if isinstance(formula_like, basestring):
        eval_env = _get_env(eval_env)
        formula_like = ModelDesc.from_formula(formula_like, eval_env)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker)
    else:
        return None
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to interpret ``formula_like`` as a pair of ``DesignInfo`` objects.

    The input is checked against the following kinds, in this order:

    * a single ``DesignInfo`` -- returned as the second element of a pair
      whose first element is built from an empty term list;
    * a 2-tuple of ``DesignInfo`` objects -- returned unchanged;
    * an object with a ``__patsy_get_model_desc__`` method -- the method is
      called with ``eval_env`` and must return a ``ModelDesc``;
    * a ``str`` formula -- parsed with ``ModelDesc.from_formula``;
    * a ``ModelDesc`` -- its left- and right-hand term lists are handed to
      ``design_matrix_builders``.

    :arg formula_like: The object to interpret.
    :arg data_iter_maker: Passed through to ``design_matrix_builders``.
    :arg eval_env: Passed to ``__patsy_get_model_desc__`` and to
      ``design_matrix_builders``. Asserted to be an ``EvalEnvironment``
      once a ``ModelDesc`` has been obtained.
    :arg NA_action: Passed through to ``design_matrix_builders``.
    :returns: The design infos as described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignInfo):
        return (design_matrix_builders([[]], data_iter_maker,
                                       eval_env, NA_action)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignInfo)
            and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the provider: the error message is meant to name
        # the object whose method misbehaved, not the value it returned.
        provider = formula_like
        formula_like = provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (provider,))
        # fallthrough
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker,
                                      eval_env,
                                      NA_action)
    else:
        return None
def eval_bar(evaluator, tree):
    """Evaluate a bar-operator AST node into a random effects term.

    ``tree`` must have exactly two arguments: the expression to the left of
    the bar and the grouping factor to its right.  Each side is wrapped in
    a one-sided ``~`` node and parsed into its own ``ModelDesc``; the
    intercept is removed from the grouping side.  The result is an
    ``IntermediateExpr`` holding a single ``RandomEffectsTerm``.
    ``evaluator`` is not used by this function.
    """

    def _one_sided_desc(node):
        # Wrap the node as the sole operand of "~" so it parses as a
        # right-hand-side-only formula.
        wrapped = ParseNode("~", None, [node], node.origin)
        return ModelDesc.from_formula(wrapped)

    assert len(tree.args) == 2
    left_of_bar, right_of_bar = tree.args

    # Model description for the expression on the left of the bar.
    expr_md = _one_sided_desc(left_of_bar)

    # Model description for the grouping factor on the right of the bar,
    # with the intercept dropped.
    factor_md = _one_sided_desc(right_of_bar)
    factor_md.rhs_termlist.remove(INTERCEPT)

    # Combine both sides into one random effects term and hand it back as
    # an intermediate expression.
    random_effect = RandomEffectsTerm(expr=expr_md, factor=factor_md)
    return IntermediateExpr(False, None, False, [random_effect])
def parse_formula(form):
    """Split a formula into response, regressors, categoricals and intercept.

    The formula is parsed with patsy's ``ModelDesc.from_formula``; every
    term is converted with ``parse_term`` and each right-hand term is
    labelled with ``classify_term``.

    Returns a 4-tuple ``(y, x, fe, intercept)``:

    * ``y`` -- the first left-hand term, passed through ``squeeze_term``;
    * ``x`` -- right-hand terms classified ``'continuous'``, squeezed;
    * ``fe`` -- right-hand terms classified ``'categorical'``, passed
      through ``strip_cat`` and then squeezed;
    * ``intercept`` -- ``True`` if any right-hand term is classified
      ``'intercept'``.
    """
    # Let patsy do the actual formula parsing.
    model = ModelDesc.from_formula(form)

    # Convert both sides to this module's term representation.
    lhs_parsed = [parse_term(term) for term in model.lhs_termlist]
    rhs_parsed = [parse_term(term) for term in model.rhs_termlist]
    rhs_kinds = [classify_term(term) for term in rhs_parsed]
    labelled = list(zip(rhs_parsed, rhs_kinds))

    # Separate into components.
    y = squeeze_term(lhs_parsed[0])
    x = [squeeze_term(term) for term, kind in labelled
         if kind == 'continuous']
    fe = [squeeze_term(strip_cat(term)) for term, kind in labelled
          if kind == 'categorical']
    intercept = any(kind == 'intercept' for kind in rhs_kinds)

    return y, x, fe, intercept
def parse_formula(form): try: from patsy.desc import ModelDesc except: print('Please install patsy for formula parsing') return # use patsy for formula parse desc = ModelDesc.from_formula(form) lhs, rhs = desc.lhs_termlist, desc.rhs_termlist # convert to string lists x_terms = Formula(*[parse_term(t) for t in rhs]) if len(lhs) > 0: y_terms = parse_factor(lhs[0].factors[0]) return y_terms, x_terms else: return x_terms
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to interpret ``formula_like`` as a pair of ``DesignInfo`` objects.

    The input is checked against the following kinds, in this order:

    * a single ``DesignInfo`` -- returned as the second element of a pair
      whose first element is built from an empty term list;
    * a 2-tuple of ``DesignInfo`` objects -- returned unchanged;
    * an object with a ``__patsy_get_model_desc__`` method -- the method is
      called with ``eval_env`` and must return a ``ModelDesc``;
    * on Python 2 only, a ``unicode`` formula -- encoded to ASCII first;
    * a ``str`` formula -- parsed with ``ModelDesc.from_formula``;
    * a ``ModelDesc`` -- its left- and right-hand term lists are handed to
      ``design_matrix_builders``.

    :arg formula_like: The object to interpret.
    :arg data_iter_maker: Passed through to ``design_matrix_builders``.
    :arg eval_env: Passed to ``__patsy_get_model_desc__`` and to
      ``design_matrix_builders``. Asserted to be an ``EvalEnvironment``
      once a ``ModelDesc`` has been obtained.
    :arg NA_action: Passed through to ``design_matrix_builders``.
    :returns: The design infos as described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``, or if a Python 2 ``unicode`` formula
      contains non-ASCII characters.
    """
    if isinstance(formula_like, DesignInfo):
        return (design_matrix_builders([[]], data_iter_maker,
                                       eval_env, NA_action)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignInfo)
            and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the provider: the error message is meant to name
        # the object whose method misbehaved, not the value it returned.
        provider = formula_like
        formula_like = provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (provider,))
        # fallthrough
    if not six.PY3 and isinstance(formula_like, unicode):
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker,
                                      eval_env,
                                      NA_action)
    else:
        return None
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to interpret ``formula_like`` as a pair of design matrix builders.

    The input is checked against the following kinds, in this order:

    * a single ``DesignMatrixBuilder`` -- returned as the second element of
      a pair whose first element is built from an empty term list;
    * a 2-tuple of ``DesignMatrixBuilder`` objects -- returned unchanged;
    * an object with a ``__patsy_get_model_desc__`` method -- the method is
      called with ``eval_env`` and must return a ``ModelDesc``;
    * a formula string -- parsed with ``ModelDesc.from_formula`` using
      ``eval_env``, which is asserted to be an ``EvalEnvironment``;
    * a ``ModelDesc`` -- its left- and right-hand term lists are handed to
      ``design_matrix_builders``.

    :arg formula_like: The object to interpret.
    :arg data_iter_maker: Passed through to ``design_matrix_builders``.
    :arg eval_env: Passed to ``__patsy_get_model_desc__`` and, for formula
      strings, to ``ModelDesc.from_formula``.
    :arg NA_action: Passed through to ``design_matrix_builders``.
    :returns: The builders as described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignMatrixBuilder):
        return (design_matrix_builders([[]], data_iter_maker, NA_action)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignMatrixBuilder)
            and isinstance(formula_like[1], DesignMatrixBuilder)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the provider: the error message is meant to name
        # the object whose method misbehaved, not the value it returned.
        provider = formula_like
        formula_like = provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (provider, ))
        # fallthrough
    if isinstance(formula_like, basestring):
        assert isinstance(eval_env, EvalEnvironment)
        formula_like = ModelDesc.from_formula(formula_like, eval_env)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker, NA_action)
    else:
        return None
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to interpret ``formula_like`` as a pair of ``DesignInfo`` objects.

    Recognised inputs, tested in order: a single ``DesignInfo`` (paired
    with one built from an empty term list), a 2-tuple of ``DesignInfo``
    objects (returned unchanged), an object exposing
    ``__patsy_get_model_desc__`` (called with ``eval_env``; must return a
    ``ModelDesc``), a formula string (Python 2 ``unicode`` is encoded to
    ASCII first, then ``str`` is parsed with ``ModelDesc.from_formula``),
    and finally a ``ModelDesc`` whose term lists are handed to
    ``design_matrix_builders``.

    :arg formula_like: The object to interpret.
    :arg data_iter_maker: Passed through to ``design_matrix_builders``.
    :arg eval_env: Passed to ``__patsy_get_model_desc__`` and to
      ``design_matrix_builders``. Asserted to be an ``EvalEnvironment``
      once a ``ModelDesc`` has been obtained.
    :arg NA_action: Passed through to ``design_matrix_builders``.
    :returns: The design infos as described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``, or if a Python 2 ``unicode`` formula
      contains non-ASCII characters.
    """
    if isinstance(formula_like, DesignInfo):
        return (design_matrix_builders([[]], data_iter_maker,
                                       eval_env, NA_action)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignInfo)
            and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Remember which object supplied the description so that a bad
        # return value is reported against that object; formatting the
        # rebound ``formula_like`` would show the bad value instead.
        desc_source = formula_like
        formula_like = desc_source.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (desc_source, ))
        # fallthrough
    if not six.PY3 and isinstance(formula_like, unicode):
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker, eval_env, NA_action)
    else:
        return None
def subset(self, which_terms):
    """Create a new :class:`DesignInfo` for design matrices that contain
    only a subset of the terms that this :class:`DesignInfo` does.

    For example, if ``design_info`` has terms ``x``, ``y``, and ``z``,
    then::

      design_info2 = design_info.subset(["x", "z"])

    returns a new DesignInfo that can be used to construct design matrices
    with only the columns corresponding to the terms ``x`` and ``z``.
    Afterwards these two expressions will in general return the same thing
    (assuming that ``x``, ``y``, and ``z`` each generate a single output
    column)::

      build_design_matrix([design_info], data)[0][:, [0, 2]]
      build_design_matrix([design_info2], data)[0]

    A critical difference is that in the second case ``data`` need not
    contain any values for ``y``.  This is very useful when doing
    prediction using a subset of a model, a situation in which R usually
    forces you to specify dummy values for ``y``.

    If a formula is used to specify the terms to include, remember that,
    like any formula, it includes the intercept term by default; use ``0``
    or ``-1`` in the formula to avoid this.

    This method can also be used to reorder the terms in a design matrix.

    Note that this method will generally *not* produce the same result as
    creating a new model directly.  Consider these DesignInfo objects::

        design1 = dmatrix("1 + C(a)", data)
        design2 = design1.subset("0 + C(a)")
        design3 = dmatrix("0 + C(a)", data)

    Here ``design2`` and ``design3`` both produce design matrices that
    contain an encoding of ``C(a)`` without any intercept term.  But
    ``design3`` uses a full-rank encoding for the categorical term
    ``C(a)``, while ``design2`` uses the same reduced-rank encoding as
    ``design1``.

    :arg which_terms: The terms which should be kept in the new
      :class:`DesignInfo`.  If this is a string, it is parsed as a
      formula and the names of the resulting terms are taken as the terms
      to keep.  If it is a list, it can contain a mixture of term names
      (as strings) and :class:`Term` objects.

    A :class:`PatsyError` is raised if a formula string has a left-hand
    side.  Unknown terms are not translated: the underlying ``KeyError``
    is allowed to escape.

    .. versionadded:: 0.2.0
       New method on the class DesignMatrixBuilder.

    .. versionchanged:: 0.4.0
       Moved from DesignMatrixBuilder to DesignInfo, as part of the
       removal of DesignMatrixBuilder.
    """
    # A formula string is reduced to the names of its right-hand terms.
    if isinstance(which_terms, str):
        parsed = ModelDesc.from_formula(which_terms)
        if parsed.lhs_termlist:
            raise PatsyError("right-hand-side-only formula required")
        which_terms = [rhs_term.name() for rhs_term in parsed.rhs_termlist]

    if self.term_codings is None:
        # Minimal DesignInfo: only column names and name -> slice mapping
        # are available.  An unknown name surfaces as a KeyError.
        kept_columns = []
        for term_name in which_terms:
            columns = self.term_name_slices[term_name]
            kept_columns.extend(self.column_names[columns])
        return DesignInfo(kept_columns)

    # Full DesignInfo: entries may be term names or Term objects, so map
    # names to terms and let anything else pass through as-is.
    terms_by_name = dict((known.name(), known)
                         for known in self.term_codings)

    kept_columns = []
    kept_factor_infos = {}
    kept_term_codings = OrderedDict()
    for requested in which_terms:
        term = terms_by_name.get(requested, requested)
        # An unknown term surfaces as a KeyError from this lookup.
        columns = self.term_slices[term]
        kept_columns.extend(self.column_names[columns])
        for factor in term.factors:
            kept_factor_infos[factor] = self.factor_infos[factor]
        kept_term_codings[term] = self.term_codings[term]

    return DesignInfo(kept_columns,
                      factor_infos=kept_factor_infos,
                      term_codings=kept_term_codings)
def subset(self, which_terms):
    """Create a new :class:`DesignMatrixBuilder` that includes only a
    subset of the terms that this object does.

    For example, if `builder` has terms `x`, `y`, and `z`, then::

      builder2 = builder.subset(["x", "z"])

    returns a new builder that produces design matrices with only the
    columns corresponding to the terms `x` and `z`.  Afterwards these two
    expressions will in general return the same thing (assuming that `x`,
    `y`, and `z` each generate a single output column)::

      build_design_matrix([builder], data)[0][:, [0, 2]]
      build_design_matrix([builder2], data)[0]

    A critical difference is that in the second case `data` need not
    contain any values for `y`.  This is very useful when doing prediction
    using a subset of a model, a situation in which R usually forces you
    to specify dummy values for `y`.

    If a formula is used to specify the terms to include, remember that,
    like any formula, it includes the intercept term by default; use `0`
    or `-1` in the formula to avoid this.

    :arg which_terms: The terms which should be kept in the new
      :class:`DesignMatrixBuilder`.  If this is a string, it is parsed as
      a formula and the names of the resulting terms are taken as the
      terms to keep.  If it is a list, it can contain a mixture of term
      names (as strings) and :class:`Term` objects.

    A :class:`PatsyError` is raised if a formula string has a left-hand
    side, or if a requested term is not part of this builder.

    .. versionadded:: 0.2.0
    """
    evaluator_for_factor = {
        ev.factor: ev for ev in self._evaluators
    }
    info = self.design_info
    term_for_name = dict(zip(info.term_names, info.terms))

    if isinstance(which_terms, str):
        # The EvalEnvironment here is only a placeholder that lets the
        # ModelDesc be created.  Terms cannot be matched with == because
        # that compares factors, which in turn compares
        # EvalEnvironments; so only the term *names* -- which the
        # environment does not affect -- are taken from the parse.
        placeholder_env = EvalEnvironment({})
        parsed = ModelDesc.from_formula(which_terms, placeholder_env)
        if parsed.lhs_termlist:
            raise PatsyError("right-hand-side-only formula required")
        which_terms = [rhs_term.name() for rhs_term in parsed.rhs_termlist]

    kept_terms = []
    kept_evaluators = set()
    kept_column_builders = {}
    for requested in which_terms:
        term = requested
        if isinstance(requested, six.string_types):
            # Names are resolved through the design info's term table.
            if requested not in term_for_name:
                raise PatsyError("requested term %r not found in "
                                 "this DesignMatrixBuilder"
                                 % (requested,))
            term = term_for_name[requested]
        if term not in self._termlist:
            raise PatsyError("requested term '%s' not found in this "
                             "DesignMatrixBuilder" % (term,))
        for factor in term.factors:
            kept_evaluators.add(evaluator_for_factor[factor])
        kept_terms.append(term)
        kept_column_builders[term] = self._term_to_column_builders[term]

    return DesignMatrixBuilder(kept_terms,
                               kept_evaluators,
                               kept_column_builders)
def subset(self, which_terms):
    """Create a new :class:`DesignMatrixBuilder` restricted to a subset of
    this builder's terms.

    For example, if `builder` has terms `x`, `y`, and `z`, then::

      builder2 = builder.subset(["x", "z"])

    gives a builder whose design matrices contain only the columns for the
    terms `x` and `z`.  In general the following two expressions then
    return the same thing (assuming `x`, `y`, and `z` each generate a
    single output column)::

      build_design_matrix([builder], data)[0][:, [0, 2]]
      build_design_matrix([builder2], data)[0]

    The critical difference is that in the second case `data` need not
    contain any values for `y`.  This is very useful when doing prediction
    using a subset of a model, a situation in which R usually forces you
    to specify dummy values for `y`.

    When a formula is used to specify the terms, remember that, like any
    formula, it includes the intercept term by default; use `0` or `-1`
    in the formula to avoid this.

    :arg which_terms: The terms which should be kept in the new
      :class:`DesignMatrixBuilder`.  If this is a string, it is parsed as
      a formula and the names of the resulting terms are taken as the
      terms to keep.  If it is a list, it can contain a mixture of term
      names (as strings) and :class:`Term` objects.

    A :class:`PatsyError` is raised if a formula string has a left-hand
    side, or if a requested term is not part of this builder.

    .. versionadded:: 0.2.0
    """
    factor_lookup = dict((ev.factor, ev) for ev in self._evaluators)
    info = self.design_info
    name_lookup = dict(zip(info.term_names, info.terms))

    if isinstance(which_terms, basestring):
        # This EvalEnvironment is never really used; it is a placeholder
        # that allows the ModelDesc to be created.  Matching terms with ==
        # is not possible because that compares factor objects, which in
        # turn compares EvalEnvironments, so only the term *names* --
        # which the environment does not affect -- are pulled out.
        dummy_env = EvalEnvironment({})
        formula_desc = ModelDesc.from_formula(which_terms, dummy_env)
        if formula_desc.lhs_termlist:
            raise PatsyError("right-hand-side-only formula required")
        which_terms = [t.name() for t in formula_desc.rhs_termlist]

    selected_terms = []
    selected_evaluators = set()
    selected_builders = {}
    for item in which_terms:
        if not isinstance(item, basestring):
            term = item
        elif item in name_lookup:
            term = name_lookup[item]
        else:
            raise PatsyError("requested term %r not found in "
                             "this DesignMatrixBuilder"
                             % (item, ))
        if term not in self._termlist:
            raise PatsyError("requested term '%s' not found in this "
                             "DesignMatrixBuilder" % (term, ))
        for factor in term.factors:
            selected_evaluators.add(factor_lookup[factor])
        selected_terms.append(term)
        selected_builders[term] = self._term_to_column_builders[term]

    return DesignMatrixBuilder(selected_terms,
                               selected_evaluators,
                               selected_builders)