def test_data_types():
    """Exercise the coding of each supported column type in a no-intercept model.

    The expected matrices below show numeric columns (int, float, float32)
    coming through as a single column, while bools, strings, ``C()``-wrapped
    values and object arrays are expanded into one indicator column per level.
    """
    data = {
        "a": [1, 2, 3],
        "b": [1.0, 2.0, 3.0],
        "c": np.asarray([1, 2, 3], dtype=np.float32),
        "d": [True, False, True],
        "e": ["foo", "bar", "baz"],
        "f": C([1, 2, 3]),
        "g": C(["foo", "bar", "baz"]),
        "h": np.array(["foo", 1, (1, "hi")], dtype=object),
    }

    # (formula, expected design-matrix rows, expected column names)
    cases = [
        ("~ 0 + a", [[1], [2], [3]], ["a"]),
        ("~ 0 + b", [[1], [2], [3]], ["b"]),
        ("~ 0 + c", [[1], [2], [3]], ["c"]),
        (
            "~ 0 + d",
            [[0, 1], [1, 0], [0, 1]],
            ["d[False]", "d[True]"],
        ),
        (
            "~ 0 + e",
            [[0, 0, 1], [1, 0, 0], [0, 1, 0]],
            ["e[bar]", "e[baz]", "e[foo]"],
        ),
        (
            "~ 0 + f",
            [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
            ["f[1]", "f[2]", "f[3]"],
        ),
        (
            "~ 0 + g",
            [[0, 0, 1], [1, 0, 0], [0, 1, 0]],
            ["g[bar]", "g[baz]", "g[foo]"],
        ),
        # This depends on Python's sorting behavior:
        (
            "~ 0 + h",
            [[0, 1, 0], [1, 0, 0], [0, 0, 1]],
            ["h[1]", "h[foo]", "h[(1, 'hi')]"],
        ),
    ]
    for formula, expected_rows, expected_names in cases:
        t(formula, data, 0, True, expected_rows, expected_names)
def test_data_mismatch():
    """Verify that inconsistent data for the same factor raises PatsyError.

    Two scenarios are exercised for each pair of inputs: both inputs are
    streamed while the builders are set up ("incremental"), or only the first
    is used for set-up and the second is passed at build time ("predict").
    """
    # Pairs that must be rejected regardless of which one comes first.
    symmetric_pairs = [
        # Data type mismatch
        ([1, 2, 3], [True, False, True]),
        (
            C(["a", "b", "c"], levels=["c", "b", "a"]),
            C(["a", "b", "c"], levels=["a", "b", "c"]),
        ),
        # column number mismatches
        ([[1], [2], [3]], [[1, 1], [2, 2], [3, 3]]),
        ([[1, 1, 1], [2, 2, 2], [3, 3, 3]], [[1, 1], [2, 2], [3, 3]]),
    ]
    # Pairs that are only checked in the given order.
    ordered_pairs = [
        ([1, 2, 3], ["a", "b", "c"]),
        ([1, 2, 3], C(["a", "b", "c"])),
        ([True, False, True], C(["a", "b", "c"])),
        ([True, False, True], ["a", "b", "c"]),
    ]
    predict_only_pairs = [
        # This is not an error if both are fed in during make_builders, but it
        # is an error to pass one to make_builders and the other to
        # make_matrices.
        (["a", "b", "c"], ["a", "b", "d"]),
    ]
    termlist = make_termlist(["x"])

    def chunks(*columns):
        # Build a zero-argument callable that yields one {"x": ...} chunk per
        # column, in order.
        def iter_maker():
            for column in columns:
                yield {"x": column}

        return iter_maker

    def expect_incremental_failure(first, second):
        # Some step of set-up or building must raise PatsyError.
        try:
            builders = design_matrix_builders(
                [termlist], chunks(first, second), 0
            )
            build_design_matrices(builders, {"x": first})
            build_design_matrices(builders, {"x": second})
        except PatsyError:
            return
        raise AssertionError

    def expect_predict_failure(seen, unseen):
        # Set up on `seen` only; building with `unseen` must raise.
        builders = design_matrix_builders([termlist], chunks(seen), 0)
        assert_raises(
            PatsyError, build_design_matrices, builders, {"x": unseen}
        )

    for left, right in symmetric_pairs:
        expect_incremental_failure(left, right)
        expect_incremental_failure(right, left)
        expect_predict_failure(left, right)
        expect_predict_failure(right, left)

    for left, right in ordered_pairs:
        expect_incremental_failure(left, right)
        expect_predict_failure(left, right)

    for left, right in predict_only_pairs:
        expect_predict_failure(left, right)
        expect_predict_failure(right, left)

    # Columns of different lengths cannot be combined into one matrix.
    assert_raises(
        PatsyError,
        make_matrix,
        {"x": [1, 2, 3], "y": [1, 2, 3, 4]},
        2,
        [["x"], ["y"]],
    )
def test_contrast():
    """Check design matrices produced for different contrast codings of 'a'.

    Covers the default coding, ``Sum`` (as a class, as an instance, and with
    ``omit=0``), with and without an intercept term, plus two ad hoc contrast
    matrices (a plain nested list and a ``ContrastMatrix``).
    """
    from patsy.contrasts import ContrastMatrix, Sum

    values = ["a1", "a3", "a1", "a2"]

    def build(contrast_args, rank, terms, names):
        # Wrap `values` in C() with the given extra arguments and build the
        # matrix for `terms`.
        coded = {"a": C(values, *contrast_args)}
        return make_matrix(coded, rank, terms, column_names=names)

    # Expected values for the Sum codings (output from R).
    sum_default = [[1, 1, 0], [1, -1, -1], [1, 1, 0], [1, 0, 1]]
    sum_omit_first = [[1, -1, -1], [1, 0, 1], [1, -1, -1], [1, 1, 0]]
    custom_rows = [[7, 12], [8, -1], [7, 12], [2, 13]]

    # No intercept in model, full-rank coding of 'a'
    m = build((), 3, [["a"]], ["a[a1]", "a[a2]", "a[a3]"])
    assert np.allclose(m, [[1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]])

    for s in (Sum, Sum()):
        m = build((s,), 3, [["a"]], ["a[mean]", "a[S.a1]", "a[S.a2]"])
        assert np.allclose(m, sum_default)

    m = build((Sum(omit=0),), 3, [["a"]], ["a[mean]", "a[S.a2]", "a[S.a3]"])
    assert np.allclose(m, sum_omit_first)

    # Intercept in model, non-full-rank coding of 'a'
    m = build((), 3, [[], ["a"]], ["Intercept", "a[T.a2]", "a[T.a3]"])
    assert np.allclose(m, [[1, 0, 0], [1, 0, 1], [1, 0, 0], [1, 1, 0]])

    for s in (Sum, Sum()):
        m = build((s,), 3, [[], ["a"]], ["Intercept", "a[S.a1]", "a[S.a2]"])
        assert np.allclose(m, sum_default)

    m = build(
        (Sum(omit=0),), 3, [[], ["a"]], ["Intercept", "a[S.a2]", "a[S.a3]"]
    )
    assert np.allclose(m, sum_omit_first)

    # Weird ad hoc less-than-full-rank coding of 'a'
    m = build(
        ([[7, 12], [2, 13], [8, -1]],),
        2,
        [["a"]],
        ["a[custom0]", "a[custom1]"],
    )
    assert np.allclose(m, custom_rows)

    named_contrast = ContrastMatrix(
        [[7, 12], [2, 13], [8, -1]], ["[foo]", "[bar]"]
    )
    m = build((named_contrast,), 2, [["a"]], ["a[foo]", "a[bar]"])
    assert np.allclose(m, custom_rows)
def test__eval_factor_categorical():
    """Check `_eval_factor` on categorical factors.

    Covers integer coding of valid input, rejection of unknown or mismatched
    levels, of non-categorical data and of 2-d input, NA handling, boolean
    categories, and (when pandas is available) preservation of a Series
    index.
    """
    from pytest import raises
    from patsy.categorical import C

    na_action = NAAction()
    factor = _MockFactor()
    info_ab = FactorInfo(
        factor, "categorical", {}, num_columns=None, categories=("a", "b")
    )
    assert info_ab.factor is factor

    codes, _ = _eval_factor(info_ab, {"mock": ["b", "a", "b"]}, na_action)
    assert codes.shape == (3,)
    assert np.all(codes == [1, 0, 1])

    # Inputs that do not match the declared ("a", "b") categories.
    rejected = [
        ["c"],
        C(["a", "c"]),
        C(["a", "b"], levels=["b", "a"]),
        [1, 0, 1],
    ]
    for bad in rejected:
        raises(PatsyError, _eval_factor, info_ab, {"mock": bad}, na_action)

    # Two-dimensional input is rejected as well.
    bad_cat = np.asarray(["b", "a", "a", "b"])
    bad_cat.resize((2, 2))
    raises(PatsyError, _eval_factor, info_ab, {"mock": bad_cat}, na_action)

    # None counts as missing only when the NAAction says so.
    with_none = {"mock": ["a", None, "b"]}
    codes_na, is_na = _eval_factor(
        info_ab, with_none, NAAction(NA_types=["None"])
    )
    assert np.array_equal(is_na, [False, True, False])
    assert np.array_equal(codes_na, [0, -1, 1])
    raises(
        PatsyError,
        _eval_factor,
        info_ab,
        {"mock": ["a", None, "b"]},
        NAAction(NA_types=[]),
    )

    info_bool = FactorInfo(
        _MockFactor(),
        "categorical",
        {},
        num_columns=None,
        categories=[False, True],
    )
    bool_codes, _ = _eval_factor(
        info_bool, {"mock": [True, False, False, True]}, na_action
    )
    assert bool_codes.shape == (4,)
    assert np.all(bool_codes == [1, 0, 0, 1])

    if not have_pandas:
        return

    # A pandas Series comes back as a Series with its index intact.
    pandas_cases = (
        (info_ab, ["b", "a"], [10, 20]),
        (info_bool, [True, False], [11, 21]),
    )
    for info, raw, labels in pandas_cases:
        series = pandas.Series(raw, index=labels)
        coded, _ = _eval_factor(info, {"mock": series}, na_action)
        assert isinstance(coded, pandas.Series)
        assert np.array_equal(coded, [1, 0])
        assert np.array_equal(coded.index, labels)
def test_categorical():
    """Formula-level smoke test for categorical handling.

    Checks ``C()`` inside formulas, a few interaction layouts, and that a
    contrast attached to the data via ``C(..., Helmert)`` is honoured and can
    be overridden per formula.
    """
    data = balanced(a=2, b=2)

    def run(cases):
        # Each case is (formula, expected rows, expected column names).
        for formula, rows, names in cases:
            t(formula, data, 0, True, rows, names)

    # There are more exhaustive tests for all the different coding options in
    # test_build; let's just make sure that C() and stuff works.
    run([
        (
            "~ C(a)",
            [[1, 0], [1, 0], [1, 1], [1, 1]],
            ["Intercept", "C(a)[T.a2]"],
        ),
        (
            "~ C(a, levels=['a2', 'a1'])",
            [[1, 1], [1, 1], [1, 0], [1, 0]],
            ["Intercept", "C(a, levels=['a2', 'a1'])[T.a1]"],
        ),
        (
            "~ C(a, Treatment(reference=-1))",
            [[1, 1], [1, 1], [1, 0], [1, 0]],
            ["Intercept", "C(a, Treatment(reference=-1))[T.a1]"],
        ),
    ])

    # Different interactions
    run([
        (
            "a*b",
            [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]],
            ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"],
        ),
        (
            "0 + a:b",
            [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]],
            ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"],
        ),
        (
            "1 + a + a:b",
            [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 0, 1]],
            ["Intercept", "a[T.a2]", "a[a1]:b[T.b2]", "a[a2]:b[T.b2]"],
        ),
    ])

    # Changing contrast with C()
    data["a"] = C(data["a"], Helmert)
    helmert_case = (
        "a",
        [[1, -1], [1, -1], [1, 1], [1, 1]],
        ["Intercept", "a[H.a2]"],
    )
    run([
        helmert_case,
        (
            "C(a, Treatment)",
            [[1, 0], [1, 0], [1, 1], [1, 1]],
            ["Intercept", "C(a, Treatment)[T.a2]"],
        ),
        # That didn't affect the original object
        helmert_case,
    ])
def test__CatFactorEvaluator():
    """Check `_CatFactorEvaluator.eval` on categorical data.

    Covers integer coding of valid input, rejection of unknown or mismatched
    levels, of non-categorical data and of 2-d input, NA handling, boolean
    levels, and (when pandas is available) preservation of a Series index.
    """
    from nose.tools import assert_raises
    from patsy.categorical import C

    na_action = NAAction()
    factor = _MockFactor()
    evaluator_ab = _CatFactorEvaluator(factor, {}, ["a", "b"])
    assert evaluator_ab.factor is factor

    codes, _ = evaluator_ab.eval({"mock": ["b", "a", "b"]}, na_action)
    assert codes.shape == (3,)
    assert np.all(codes == [1, 0, 1])

    # Inputs that do not match the declared ["a", "b"] levels.
    rejected = [
        ["c"],
        C(["a", "c"]),
        C(["a", "b"], levels=["b", "a"]),
        [1, 0, 1],
    ]
    for bad in rejected:
        assert_raises(PatsyError, evaluator_ab.eval, {"mock": bad}, na_action)

    # Two-dimensional input is rejected as well.
    bad_cat = np.asarray(["b", "a", "a", "b"])
    bad_cat.resize((2, 2))
    assert_raises(PatsyError, evaluator_ab.eval, {"mock": bad_cat}, na_action)

    # None counts as missing only when the NAAction says so.
    codes_na, is_na = evaluator_ab.eval(
        {"mock": ["a", None, "b"]}, NAAction(NA_types=["None"])
    )
    assert np.array_equal(is_na, [False, True, False])
    assert np.array_equal(codes_na, [0, -1, 1])
    assert_raises(
        PatsyError,
        evaluator_ab.eval,
        {"mock": ["a", None, "b"]},
        NAAction(NA_types=[]),
    )

    evaluator_bool = _CatFactorEvaluator(_MockFactor(), {}, [False, True])
    bool_codes, _ = evaluator_bool.eval(
        {"mock": [True, False, False, True]}, na_action
    )
    assert bool_codes.shape == (4,)
    assert np.all(bool_codes == [1, 0, 0, 1])

    if not have_pandas:
        return

    # A pandas Series comes back as a Series with its index intact.
    pandas_cases = (
        (evaluator_ab, ["b", "a"], [10, 20]),
        (evaluator_bool, [True, False], [11, 21]),
    )
    for evaluator, raw, labels in pandas_cases:
        series = pandas.Series(raw, index=labels)
        coded, _ = evaluator.eval({"mock": series}, na_action)
        assert isinstance(coded, pandas.Series)
        assert np.array_equal(coded, [1, 0])
        assert np.array_equal(coded.index, labels)
def test_categorical():
    """Builders set up on one categorical representation accept the others.

    Every ordered pair drawn from plain strings, ``C()`` objects and (when
    available) pandas Categoricals is used as (set-up data, build data); the
    test passes if none of the combinations raises.
    """
    representations = [
        {"a": ["a1", "a2", "a1"]},
        {"a": C(["a2", "a1", "a2"])},
    ]
    if have_pandas_categorical:
        representations.append(
            {"a": pandas.Categorical.from_array(["a1", "a2", "a2"])}
        )

    def build_with(setup_data, build_data):
        # Set up the builders from a single chunk, then build from other data.
        def iter_maker():
            yield setup_data

        builders = design_matrix_builders([make_termlist(["a"])], iter_maker)
        build_design_matrices(builders, build_data)

    for setup_data in representations:
        for build_data in representations:
            build_with(setup_data, build_data)
def test__examine_factor_types():
    """Check how `_examine_factor_types` classifies factors.

    Verifies the reported column counts for numeric factors, the levels and
    contrasts for categorical ones, how far the data iterator is consumed in
    each scenario, and that illegal inputs raise a PatsyError whose origin is
    the offending factor's origin.
    """
    from patsy.categorical import C

    class MockFactor(object):
        def __init__(self):
            # You should check this using 'is', not '=='
            from patsy.origin import Origin

            self.origin = Origin("MOCK", 1, 2)

        def eval(self, state, data):
            # `state` is this factor's entry in the factor_states dict and
            # `data` is whatever the data iterator produced (0 or 1 here).
            return state[data]

        def name(self):
            return "MOCK MOCK"

    # This hacky class can only be iterated over once, but it keeps track of
    # how far it got.
    class DataIterMaker(object):
        def __init__(self):
            self.i = -1

        def __call__(self):
            return self

        def __iter__(self):
            return self

        def next(self):
            # Yields 0, then 1, then stops; `i` ends at 2 after exhaustion.
            self.i += 1
            if self.i > 1:
                raise StopIteration
            return self.i

        __next__ = next

    num_1dim = MockFactor()
    num_1col = MockFactor()
    num_4col = MockFactor()
    categ_1col = MockFactor()
    bool_1col = MockFactor()
    string_1col = MockFactor()
    object_1col = MockFactor()
    object_levels = (object(), object(), object())
    factor_states = {
        num_1dim: ([1, 2, 3], [4, 5, 6]),
        num_1col: ([[1], [2], [3]], [[4], [5], [6]]),
        num_4col: (np.zeros((3, 4)), np.ones((3, 4))),
        categ_1col: (
            C(["a", "b", "c"], levels=("a", "b", "c"), contrast="MOCK CONTRAST"),
            C(["c", "b", "a"], levels=("a", "b", "c"), contrast="MOCK CONTRAST"),
        ),
        bool_1col: ([True, True, False], [False, True, True]),
        # It has to read through all the data to see all the possible levels:
        string_1col: (["a", "a", "a"], ["c", "b", "a"]),
        object_1col: ([object_levels[0]] * 3, object_levels),
    }

    it = DataIterMaker()
    num_column_counts, cat_levels_contrasts = _examine_factor_types(
        factor_states.keys(), factor_states, it, NAAction()
    )
    # The iterator was run to exhaustion.
    assert it.i == 2
    assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4}
    assert cat_levels_contrasts == {
        categ_1col: (("a", "b", "c"), "MOCK CONTRAST"),
        bool_1col: ((False, True), None),
        string_1col: (("a", "b", "c"), None),
        object_1col: (tuple(sorted(object_levels, key=id)), None),
    }

    # Check that it doesn't read through all the data if that's not necessary:
    it = DataIterMaker()
    no_read_necessary = [num_1dim, num_1col, num_4col, categ_1col, bool_1col]
    num_column_counts, cat_levels_contrasts = _examine_factor_types(
        no_read_necessary, factor_states, it, NAAction()
    )
    # Only the first chunk was consumed.
    assert it.i == 0
    assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4}
    assert cat_levels_contrasts == {
        categ_1col: (("a", "b", "c"), "MOCK CONTRAST"),
        bool_1col: ((False, True), None),
    }

    # Illegal inputs:
    num_3dim = MockFactor()
    # (There is no such thing as a multi-dimensional Categorical, so there is
    # no categorical counterpart to test here.)
    string_3col = MockFactor()
    object_3col = MockFactor()
    illegal_factor_states = {
        num_3dim: (np.zeros((3, 3, 3)), np.ones((3, 3, 3))),
        string_3col: ([["a", "b", "c"]], [["b", "c", "a"]]),
        object_3col: ([[[object()]]], [[[object()]]]),
    }

    import pytest

    for illegal_factor in illegal_factor_states:
        it = DataIterMaker()
        # pytest.raises fails the test if nothing is raised, and unlike a bare
        # `assert False` it is not stripped when Python runs with -O.
        with pytest.raises(PatsyError) as excinfo:
            _examine_factor_types(
                [illegal_factor], illegal_factor_states, it, NAAction()
            )
        assert excinfo.value.origin is illegal_factor.origin
def test_categorical_to_int():
    """Check `categorical_to_int` across input containers, levels and NA rules.

    Covers pandas Series and Categorical input, lists and numpy arrays of
    several dtypes, ``C()`` objects, level mismatches, disallowed
    dimensionalities, unhashable values and NA handling.
    """
    s = pd.Series(["a", "b", "c"], index=[10, 20, 30])
    c_pandas = categorical_to_int(s, ("a", "b", "c"), NAAction())
    assert np.all(c_pandas == [0, 1, 2])
    assert np.all(c_pandas.index == [10, 20, 30])
    # Input must be 1-dimensional
    pytest.raises(
        PatsyError,
        categorical_to_int,
        pd.DataFrame({10: s}),
        ("a", "b", "c"),
        NAAction(),
    )

    cat = pd.Categorical([1, 0, -1], ("a", "b"))
    conv = categorical_to_int(cat, ("a", "b"), NAAction())
    assert np.all(conv == [1, 0, -1])
    # Trust pandas NA marking: a level that is literally named "None" is an
    # ordinary level even when "None" is listed in NA_types; only the -1 code
    # marks a missing value. (This check previously re-tested `cat` and left
    # `cat2` unused.)
    cat2 = pd.Categorical([1, 0, -1], ("a", "None"))
    conv2 = categorical_to_int(
        cat2, ("a", "None"), NAAction(NA_types=["None"])
    )
    assert np.all(conv2 == [1, 0, -1])
    # But levels must match
    pytest.raises(
        PatsyError,
        categorical_to_int,
        pd.Categorical([1, 0], ("a", "b")),
        ("a", "c"),
        NAAction(),
    )
    pytest.raises(
        PatsyError,
        categorical_to_int,
        pd.Categorical([1, 0], ("a", "b")),
        ("b", "a"),
        NAAction(),
    )

    def t(data, levels, expected, NA_action=NAAction()):
        # Convert and compare against the expected integer codes.
        got = categorical_to_int(data, levels, NA_action)
        assert np.array_equal(got, expected)

    t(["a", "b", "a"], ("a", "b"), [0, 1, 0])
    t(np.asarray(["a", "b", "a"]), ("a", "b"), [0, 1, 0])
    t(np.asarray(["a", "b", "a"], dtype=object), ("a", "b"), [0, 1, 0])
    t([0, 1, 2], (1, 2, 0), [2, 0, 1])
    t(np.asarray([0, 1, 2]), (1, 2, 0), [2, 0, 1])
    t(np.asarray([0, 1, 2], dtype=float), (1, 2, 0), [2, 0, 1])
    t(np.asarray([0, 1, 2], dtype=object), (1, 2, 0), [2, 0, 1])
    t(["a", "b", "a"], ("a", "d", "z", "b"), [0, 3, 0])
    t([("a", 1), ("b", 0), ("a", 1)], (("a", 1), ("b", 0)), [0, 1, 0])

    pytest.raises(
        PatsyError,
        categorical_to_int,
        ["a", "b", "a"],
        ("a", "c"),
        NAAction(),
    )

    t(C(["a", "b", "a"]), ("a", "b"), [0, 1, 0])
    t(C(["a", "b", "a"]), ("b", "a"), [1, 0, 1])
    t(C(["a", "b", "a"], levels=["b", "a"]), ("b", "a"), [1, 0, 1])
    # Mismatch between C() levels and expected levels
    pytest.raises(
        PatsyError,
        categorical_to_int,
        C(["a", "b", "a"], levels=["a", "b"]),
        ("b", "a"),
        NAAction(),
    )

    # ndim == 2 is disallowed
    pytest.raises(
        PatsyError,
        categorical_to_int,
        np.asarray([["a", "b"], ["b", "a"]]),
        ("a", "b"),
        NAAction(),
    )
    # ndim == 0 is disallowed likewise
    pytest.raises(
        PatsyError, categorical_to_int, "a", ("a", "b"), NAAction()
    )

    # levels must be hashable
    pytest.raises(
        PatsyError,
        categorical_to_int,
        ["a", "b"],
        ("a", "b", {}),
        NAAction(),
    )
    pytest.raises(
        PatsyError,
        categorical_to_int,
        ["a", "b", {}],
        ("a", "b"),
        NAAction(),
    )

    t(
        ["b", None, np.nan, "a"],
        ("a", "b"),
        [1, -1, -1, 0],
        NAAction(NA_types=["None", "NaN"]),
    )
    t(
        ["b", None, np.nan, "a"],
        ("a", "b", None),
        [1, -1, -1, 0],
        NAAction(NA_types=["None", "NaN"]),
    )
    t(
        ["b", None, np.nan, "a"],
        ("a", "b", None),
        [1, 2, -1, 0],
        NAAction(NA_types=["NaN"]),
    )

    # Smoke test for the branch that formats the ellipsized list of levels in
    # the error message:
    pytest.raises(
        PatsyError,
        categorical_to_int,
        ["a", "b", "q"],
        ("a", "b", "c", "d", "e", "f", "g", "h"),
        NAAction(),
    )
def test_CategoricalSniffer():
    """Check level and contrast detection by `CategoricalSniffer`.

    For each scenario the helper feeds chunks to a fresh sniffer, checks
    whether the sniffer reported being done early, and compares the final
    ``levels_contrast()`` result.
    """
    patch_patsy()
    from patsy.categorical import CategoricalSniffer

    def check(NA_types, datas, exp_finish_fast, exp_levels, exp_contrast=None):
        sniffer = CategoricalSniffer(NAAction(NA_types=NA_types))
        for chunk in datas:
            if sniffer.sniff(chunk):
                # The sniffer says it has seen enough; that must be expected.
                assert exp_finish_fast
                break
            assert not exp_finish_fast
        assert sniffer.levels_contrast() == (exp_levels, exp_contrast)

    check([], [pd.Categorical.from_array([1, 2, None])], True, (1, 2))
    # check order preservation
    check([], [pd.Categorical([1, 0], ["a", "b"])], True, ("a", "b"))
    check([], [pd.Categorical([1, 0], ["b", "a"])], True, ("b", "a"))
    # check that if someone sticks a .contrast field onto a Categorical
    # object, we pick it up:
    tagged = pd.Categorical.from_array(["a", "b"])
    tagged.contrast = "CONTRAST"
    check([], [tagged], True, ("a", "b"), "CONTRAST")

    check([], [C([1, 2]), C([3, 2])], False, (1, 2, 3))
    # check order preservation
    check([], [C([1, 2], levels=[1, 2, 3]), C([4, 2])], True, (1, 2, 3))
    check([], [C([1, 2], levels=[3, 2, 1]), C([4, 2])], True, (3, 2, 1))

    # do some actual sniffing with NAs in
    check(["None", "NaN"], [C([1, np.nan]), C([10, None])], False, (1, 10))
    # But 'None' can be a type if we don't make it represent NA:
    nan_only = CategoricalSniffer(NAAction(NA_types=["NaN"]))
    nan_only.sniff(C([1, np.nan, None]))
    # The level order here is different on py2 and py3 :-( Because there's no
    # consistent way to sort mixed-type values on both py2 and py3. Honestly
    # people probably shouldn't use this, but I don't know how to give a
    # sensible error.
    found_levels, _ = nan_only.levels_contrast()
    assert set(found_levels) == {None, 1}

    # bool special case
    check(["None", "NaN"], [C([True, np.nan, None])], True, (False, True))
    check(
        [],
        [C([10, 20]), C([False]), C([30, 40])],
        False,
        (False, True, 10, 20, 30, 40),
    )
    # check tuples too
    check(
        ["None", "NaN"],
        [C([("b", 2), None, ("a", 1), np.nan, ("c", None)])],
        False,
        (("a", 1), ("b", 2), ("c", None)),
    )
    # contrasts
    check([], [C([10, 20], contrast="FOO")], False, (10, 20), "FOO")

    # unhashable level error:
    default_sniffer = CategoricalSniffer(NAAction())
    pytest.raises(PatsyError, default_sniffer.sniff, [{}])
def eval(self, memorize_state, data):
    """Look up this factor's variable in `data`.

    The value stored under ``self._varname`` is returned as-is unless
    ``self._force_categorical`` is set, in which case it is wrapped in
    ``C()`` with this factor's contrast and levels. `memorize_state` is
    accepted but not used here.
    """
    raw = data[self._varname]
    if not self._force_categorical:
        return raw
    return C(raw, contrast=self._contrast, levels=self._levels)