def quadratic_form_test(
    params: ArrayLike,
    cov: ArrayLike,
    restriction: OptionalArrayLike = None,
    value: OptionalArrayLike = None,
    formula: Optional[Union[str, List[str]]] = None,
) -> WaldTestStatistic:
    """
    Test linear equality restrictions on parameters using a quadratic form.

    The hypothesis tested is ``restriction @ params = value``. The statistic
    is ``diff' inv(restriction @ cov @ restriction') diff`` where
    ``diff = restriction @ params - value``.

    Parameters
    ----------
    params : array_like
        Parameter vector. Must be a pandas Series when ``formula`` is used,
        since its index supplies the names the formula refers to.
    cov : array_like
        Covariance of ``params``.
    restriction : array_like, optional
        Restriction matrix with one row per restriction. Cannot be combined
        with ``formula``.
    value : array_like, optional
        Right-hand-side values of the restrictions. When omitted, a vector
        of zeros is used. Ignored (replaced by the formula's constants) when
        ``formula`` is given.
    formula : {str, list[str]}, optional
        Constraint description passed to ``DesignInfo.linear_constraint``.
        Cannot be combined with ``restriction``.

    Returns
    -------
    WaldTestStatistic
        Statistic with degrees of freedom equal to the number of rows of the
        restriction matrix.

    Raises
    ------
    ValueError
        If both ``restriction`` and ``formula`` are given, or if neither is.
    """
    if formula is not None and restriction is not None:
        raise ValueError("restriction and formula cannot be used simultaneously.")
    if formula is None and restriction is None:
        # Fail early with a clear message rather than an IndexError from the
        # shape lookup on a 0-d array further down.
        raise ValueError("One of restriction or formula must be provided.")
    if formula is not None:
        # Narrowing for type checkers; the formula path needs params.index.
        assert isinstance(params, Series)
        di = DesignInfo(list(params.index))
        lc = di.linear_constraint(formula)
        restriction, value = lc.coefs, lc.constants
    restriction = np.asarray(restriction)
    if value is None:
        value = np.zeros(restriction.shape[0])
    # Column vectors so the quadratic form below is a 1-by-1 matrix.
    value = np.asarray(value).ravel()[:, None]
    diff = restriction @ np.asarray(params)[:, None] - value
    rcov = restriction @ cov @ restriction.T
    quad_form = np.asarray(diff.T @ np.linalg.inv(rcov) @ diff)
    # .item() extracts the single element; calling float() directly on an
    # array with ndim > 0 is deprecated in recent NumPy releases.
    stat = float(quad_form.item())
    df = restriction.shape[0]
    null = "Linear equality constraint is valid"
    name = "Linear Equality Hypothesis Test"

    return WaldTestStatistic(stat, null, df, name=name)
def quadratic_form_test(params, cov, restriction=None, value=None, formula=None):
    """
    Test linear equality restrictions on parameters using a quadratic form.

    The hypothesis tested is ``restriction @ params = value``. The statistic
    is ``diff' inv(restriction @ cov @ restriction') diff`` where
    ``diff = restriction @ params - value``.

    Parameters
    ----------
    params : array_like
        Parameter vector. Must expose an ``index`` (e.g. a pandas Series)
        when ``formula`` is used, since the index supplies the names the
        formula refers to.
    cov : array_like
        Covariance of ``params``.
    restriction : array_like, optional
        Restriction matrix with one row per restriction. Cannot be combined
        with ``formula``.
    value : array_like, optional
        Right-hand-side values of the restrictions. When omitted, a vector
        of zeros is used. Ignored (replaced by the formula's constants) when
        ``formula`` is given.
    formula : {str, list of str}, optional
        Constraint description passed to ``DesignInfo.linear_constraint``.
        Cannot be combined with ``restriction``.

    Returns
    -------
    WaldTestStatistic
        Statistic with degrees of freedom equal to the number of rows of the
        restriction matrix.

    Raises
    ------
    ValueError
        If both ``restriction`` and ``formula`` are given, or if neither is.
    """
    if formula is not None and restriction is not None:
        raise ValueError('restriction and formula cannot be used simultaneously.')
    if formula is None and restriction is None:
        # Fail early with a clear message rather than an IndexError from the
        # shape lookup on a 0-d array further down.
        raise ValueError('One of restriction or formula must be provided.')
    if formula is not None:
        di = DesignInfo(list(params.index))
        lc = di.linear_constraint(formula)
        restriction, value = lc.coefs, lc.constants
    restriction = np.asarray(restriction)
    if value is None:
        value = np.zeros(restriction.shape[0])
    # Column vectors so the quadratic form below is a 1-by-1 matrix.
    value = np.asarray(value).ravel()[:, None]
    # np.asarray accepts both pandas objects and plain arrays, so params only
    # has to be a Series on the formula path.
    diff = restriction @ np.asarray(params)[:, None] - value
    rcov = restriction @ cov @ restriction.T
    quad_form = np.asarray(diff.T @ np.linalg.inv(rcov) @ diff)
    # .item() extracts the single element; calling float() directly on an
    # array with ndim > 0 is deprecated in recent NumPy releases.
    stat = float(quad_form.item())
    df = restriction.shape[0]
    null = 'Linear equality constraint is valid'
    name = 'Linear Equality Hypothesis Test'

    return WaldTestStatistic(stat, null, df, name=name)
def _regularize_matrix(m, default_column_prefix):
    """Wrap ``m`` as a design matrix and report its original pandas index.

    Returns a ``(matrix, orig_index)`` tuple. ``orig_index`` is ``m.index``
    when pandas is available and ``m`` is a Series or DataFrame, otherwise
    ``None``. ``matrix`` is a pandas DataFrame carrying a ``design_info``
    attribute when ``return_type`` is ``"dataframe"``, and a
    :class:`DesignMatrix` otherwise.

    NOTE(review): ``return_type`` is not a parameter; it is read from the
    enclosing/module scope, which is not visible here.
    """
    info = DesignInfo.from_array(m, default_column_prefix)

    # Short-circuit on have_pandas so `pandas` is never touched without it.
    is_pandas_input = have_pandas and isinstance(
        m, (pandas.Series, pandas.DataFrame)
    )
    orig_index = m.index if is_pandas_input else None

    if return_type != "dataframe":
        return (DesignMatrix(m, info), orig_index)

    frame = pandas.DataFrame(atleast_2d_column_default(m, preserve_pandas=True))
    frame.columns = info.column_names
    frame.design_info = info
    return (frame, orig_index)
def test_DesignInfo_subset():
    # Subsets are requested as a formula, as term names, as Term objects and
    # as a mix of names and objects. Each request must reproduce the matching
    # columns of the full build, and data for removed variables must not be
    # required.
    all_data = {"x": [1, 2], "y": [[3.1, 3.2], [4.1, 4.2]], "z": [5, 6]}
    all_terms = make_termlist("x", "y", "z")

    def iter_maker():
        yield all_data

    all_builder = design_matrix_builders([all_terms], iter_maker, 0)[0]
    full_matrix = build_design_matrices([all_builder], all_data)[0]

    def check(which_terms, variables, columns):
        # Build from the subset using only the listed variables and compare
        # against the same columns taken from the full matrix.
        subset_info = all_builder.subset(which_terms)
        subset_data = {name: all_data[name] for name in variables}
        actual = build_design_matrices([subset_info], subset_data)[0]
        expected = full_matrix[:, columns]
        if not isinstance(which_terms, six.string_types):
            assert len(which_terms) == len(subset_info.terms)
        assert np.array_equal(actual, expected)

    every_column = slice(None)

    # Everything kept: x, y and z.
    check("~ 0 + x + y + z", ["x", "y", "z"], every_column)
    check(["x", "y", "z"], ["x", "y", "z"], every_column)
    # Compatibility: six.PY2 wasn't added until 1.4.0, but six.PY3 exists in
    # all versions.
    if not six.PY3:
        check(
            [unicode("x"), unicode("y"), unicode("z")],
            ["x", "y", "z"],
            every_column,
        )
    check(all_terms, ["x", "y", "z"], every_column)
    check([all_terms[0], "y", all_terms[2]], ["x", "y", "z"], every_column)

    # y dropped: x then z (columns 0 and 3 of the full matrix).
    check("~ 0 + x + z", ["x", "z"], [0, 3])
    check(["x", "z"], ["x", "z"], [0, 3])
    # Compatibility: six.PY2 wasn't added until 1.4.0, but six.PY3 exists in
    # all versions.
    if not six.PY3:
        check([unicode("x"), unicode("z")], ["x", "z"], [0, 3])
    check([all_terms[0], all_terms[2]], ["x", "z"], [0, 3])
    check([all_terms[0], "z"], ["x", "z"], [0, 3])

    # y dropped and the order swapped: z then x.
    check("~ 0 + z + x", ["x", "z"], [3, 0])
    check(["z", "x"], ["x", "z"], [3, 0])
    check([six.text_type("z"), six.text_type("x")], ["x", "z"], [3, 0])
    check([all_terms[2], all_terms[0]], ["x", "z"], [3, 0])
    check([all_terms[2], "x"], ["x", "z"], [3, 0])

    # Only y, which occupies columns 1 and 2.
    check("~ 0 + y", ["y"], [1, 2])
    check(["y"], ["y"], [1, 2])
    check([six.text_type("y")], ["y"], [1, 2])
    check([all_terms[1]], ["y"], [1, 2])

    # Formula can't have a LHS
    pytest.raises(PatsyError, all_builder.subset, "a ~ a")
    # Term must exist
    pytest.raises(KeyError, all_builder.subset, "~ asdf")
    pytest.raises(KeyError, all_builder.subset, ["asdf"])
    pytest.raises(KeyError, all_builder.subset, [Term(["asdf"])])

    # Also check for a minimal DesignInfo (column names only)
    minimal_info = DesignInfo(["a", "b", "c"])
    minimal_subset = minimal_info.subset(["c", "a"])
    assert minimal_subset.column_names == ["c", "a"]
    assert minimal_subset.terms is None
def design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action="drop"):
    """Construct several :class:`DesignInfo` objects from termlists.

    This is one of Patsy's fundamental functions. Together with
    :func:`build_design_matrices` it forms the API to the core formula
    interpretation machinery.

    :arg termlists: A list of termlists, where each termlist is a list of
      :class:`Term` objects which together specify a design matrix.
    :arg data_iter_maker: A zero-argument callable which returns an iterator
      over dict-like data objects. This must be a callable rather than a
      simple iterator because sufficiently complex formulas may require
      multiple passes over the data (e.g. if there are nested stateful
      transforms).
    :arg eval_env: Either a :class:`EvalEnvironment` which will be used to
      look up any variables referenced in `termlists` that cannot be
      found in `data_iter_maker`, or else a depth represented as an
      integer which will be passed to :meth:`EvalEnvironment.capture`.
      ``eval_env=0`` means to use the context of the function calling
      :func:`design_matrix_builders` for lookups. If calling this function
      from a library, you probably want ``eval_env=1``, which means that
      variables should be resolved in *your* caller's namespace.
    :arg NA_action: An :class:`NAAction` object or string, used to determine
      what values count as 'missing' for purposes of determining the levels of
      categorical factors.
    :returns: A list of :class:`DesignInfo` objects, one for each
      termlist passed in.

    This function performs zero or more iterations over the data in order to
    sniff out any necessary information about factor types, set up stateful
    transforms, pick column names, etc.

    See :ref:`formulas` for details.

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    .. versionadded:: 0.4.0
       The ``eval_env`` argument.
    """
    # People upgrading from versions prior to 0.4.0 could potentially have
    # passed NA_action as the 3rd positional argument. Fortunately
    # EvalEnvironment.capture only accepts int and EvalEnvironment objects,
    # and we improved its error messages to make this clear.
    # NOTE: this call is frame-depth sensitive (reference=1), so it has to
    # stay directly in this function's body.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    if isinstance(NA_action, str):
        NA_action = NAAction(NA_action)

    # Every distinct factor mentioned by any term of any termlist.
    all_factors = {
        factor
        for termlist in termlists
        for term in termlist
        for factor in term.factors
    }
    factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env)

    # Now all the factors have working eval methods, so we can evaluate them
    # on some data to find out what type of data they return.
    num_column_counts, cat_levels_contrasts = _examine_factor_types(
        all_factors, factor_states, data_iter_maker, NA_action
    )

    # Now we need the factor infos, which encapsulate the knowledge of
    # how to turn any given factor into a chunk of data:
    factor_infos = {}
    for factor in all_factors:
        if factor not in num_column_counts:
            assert factor in cat_levels_contrasts
            levels = cat_levels_contrasts[factor][0]
            factor_infos[factor] = FactorInfo(
                factor,
                "categorical",
                factor_states[factor],
                num_columns=None,
                categories=levels,
            )
        else:
            factor_infos[factor] = FactorInfo(
                factor,
                "numerical",
                factor_states[factor],
                num_columns=num_column_counts[factor],
                categories=None,
            )

    # And now we can construct the DesignInfo for each termlist:
    design_infos = []
    for termlist in termlists:
        term_to_subterm_infos = _make_subterm_infos(
            termlist, num_column_counts, cat_levels_contrasts
        )
        assert isinstance(term_to_subterm_infos, OrderedDict)
        assert frozenset(term_to_subterm_infos) == frozenset(termlist)

        # Restrict the factor infos to the factors this design actually uses.
        this_design_factor_infos = {
            factor: factor_infos[factor]
            for term in termlist
            for factor in term.factors
        }

        column_names = [
            column_name
            for subterms in six.itervalues(term_to_subterm_infos)
            for subterm in subterms
            for column_name in _subterm_column_names_iter(factor_infos, subterm)
        ]

        design_infos.append(
            DesignInfo(
                column_names,
                factor_infos=this_design_factor_infos,
                term_codings=term_to_subterm_infos,
            )
        )
    return design_infos
def test_DesignInfo_subset():
    # Subsets are requested as a formula, as term names, as Term objects and
    # as a mix of names and objects. Each request must reproduce the matching
    # columns of the full build, and data for removed variables must not be
    # required.
    all_data = {"x": [1, 2], "y": [[3.1, 3.2], [4.1, 4.2]], "z": [5, 6]}
    all_terms = make_termlist("x", "y", "z")

    def iter_maker():
        yield all_data

    all_builder = design_matrix_builders([all_terms], iter_maker, 0)[0]
    full_matrix = build_design_matrices([all_builder], all_data)[0]

    def check(which_terms, variables, columns):
        # Build from the subset using only the listed variables and compare
        # against the same columns taken from the full matrix.
        subset_info = all_builder.subset(which_terms)
        subset_data = {name: all_data[name] for name in variables}
        actual = build_design_matrices([subset_info], subset_data)[0]
        expected = full_matrix[:, columns]
        if not isinstance(which_terms, six.string_types):
            assert len(which_terms) == len(subset_info.terms)
        assert np.array_equal(actual, expected)

    every_column = slice(None)

    # Everything kept: x, y and z.
    check("~ 0 + x + y + z", ["x", "y", "z"], every_column)
    check(["x", "y", "z"], ["x", "y", "z"], every_column)
    # Compatibility: six.PY2 wasn't added until 1.4.0, but six.PY3 exists in
    # all versions.
    if not six.PY3:
        check(
            [unicode("x"), unicode("y"), unicode("z")],
            ["x", "y", "z"],
            every_column,
        )
    check(all_terms, ["x", "y", "z"], every_column)
    check([all_terms[0], "y", all_terms[2]], ["x", "y", "z"], every_column)

    # y dropped: x then z (columns 0 and 3 of the full matrix).
    check("~ 0 + x + z", ["x", "z"], [0, 3])
    check(["x", "z"], ["x", "z"], [0, 3])
    # Compatibility: six.PY2 wasn't added until 1.4.0, but six.PY3 exists in
    # all versions.
    if not six.PY3:
        check([unicode("x"), unicode("z")], ["x", "z"], [0, 3])
    check([all_terms[0], all_terms[2]], ["x", "z"], [0, 3])
    check([all_terms[0], "z"], ["x", "z"], [0, 3])

    # y dropped and the order swapped: z then x.
    check("~ 0 + z + x", ["x", "z"], [3, 0])
    check(["z", "x"], ["x", "z"], [3, 0])
    check([six.text_type("z"), six.text_type("x")], ["x", "z"], [3, 0])
    check([all_terms[2], all_terms[0]], ["x", "z"], [3, 0])
    check([all_terms[2], "x"], ["x", "z"], [3, 0])

    # Only y, which occupies columns 1 and 2.
    check("~ 0 + y", ["y"], [1, 2])
    check(["y"], ["y"], [1, 2])
    check([six.text_type("y")], ["y"], [1, 2])
    check([all_terms[1]], ["y"], [1, 2])

    # Formula can't have a LHS
    assert_raises(PatsyError, all_builder.subset, "a ~ a")
    # Term must exist
    assert_raises(KeyError, all_builder.subset, "~ asdf")
    assert_raises(KeyError, all_builder.subset, ["asdf"])
    assert_raises(KeyError, all_builder.subset, [Term(["asdf"])])

    # Also check for a minimal DesignInfo (column names only)
    minimal_info = DesignInfo(["a", "b", "c"])
    minimal_subset = minimal_info.subset(["c", "a"])
    assert minimal_subset.column_names == ["c", "a"]
    assert minimal_subset.terms is None
def design_info(self):
    """Return a :class:`DesignInfo` describing the design matrices that
    this DesignMatrixBuilder can be used to create.

    The object is built from this builder's column names and term slices,
    with the builder itself attached via ``builder=self``.
    """
    column_names = self._column_names
    term_slices = self._term_slices
    return DesignInfo(column_names, term_slices, builder=self)