예제 #1
0
def _generate_rhs(contrasts, columns_var_gt_0) -> list[Term]:
    rhs = [Term([])]  # force intercept
    for contrast in contrasts:
        if contrast["type"] == "infer":
            if not columns_var_gt_0[contrast["variable"]].all():
                logger.warning(
                    f'Not adding term "{contrast["variable"]}" to design matrix '
                    "because it has zero variance")
                continue
            # for every term in the model a contrast of type infer needs to be specified
            rhs.append(
                Term([LookupFactor(name) for name in contrast["variable"]]))

    return rhs
예제 #2
0
def test_pick_contrasts_for_term():
    from patsy.desc import Term
    used = set()
    codings = pick_contrasts_for_term(Term([]), set(), used)
    assert codings == [{}]
    codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used)
    assert codings == [{"a": False}]
    codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used)
    assert codings == [{"a": True, "b": False}]
    used_snapshot = set(used)
    codings = pick_contrasts_for_term(Term(["c", "d"]), set(), used)
    assert codings == [{"d": False}, {"c": False, "d": True}]
    # Do it again backwards, to make sure we're deterministic with respect to
    # order:
    codings = pick_contrasts_for_term(Term(["d", "c"]), set(), used_snapshot)
    assert codings == [{"c": False}, {"c": True, "d": False}]
예제 #3
0
def test_DesignMatrixBuilder_subset():
    # For each combination of:
    #   formula, term names, term objects, mixed term name and term objects
    # check that results match subset of full build
    # and that removed variables don't hurt
    all_data = {"x": [1, 2],
                "y": [[3.1, 3.2],
                      [4.1, 4.2]],
                "z": [5, 6]}
    all_terms = make_termlist("x", "y", "z")
    def iter_maker():
        yield all_data
    all_builder = design_matrix_builders([all_terms], iter_maker)[0]
    full_matrix = build_design_matrices([all_builder], all_data)[0]

    def t(which_terms, variables, columns):
        sub_builder = all_builder.subset(which_terms)
        sub_data = {}
        for variable in variables:
            sub_data[variable] = all_data[variable]
        sub_matrix = build_design_matrices([sub_builder], sub_data)[0]
        sub_full_matrix = full_matrix[:, columns]
        if not isinstance(which_terms, six.string_types):
            assert len(which_terms) == len(sub_builder.design_info.terms)
        assert np.array_equal(sub_matrix, sub_full_matrix)

    t("~ 0 + x + y + z", ["x", "y", "z"], slice(None))
    t(["x", "y", "z"], ["x", "y", "z"], slice(None))
    if six.PY2:
        t([unicode("x"), unicode("y"), unicode("z")],
          ["x", "y", "z"], slice(None))
    t(all_terms, ["x", "y", "z"], slice(None))
    t([all_terms[0], "y", all_terms[2]], ["x", "y", "z"], slice(None))

    t("~ 0 + x + z", ["x", "z"], [0, 3])
    t(["x", "z"], ["x", "z"], [0, 3])
    if six.PY2:
        t([unicode("x"), unicode("z")], ["x", "z"], [0, 3])
    t([all_terms[0], all_terms[2]], ["x", "z"], [0, 3])
    t([all_terms[0], "z"], ["x", "z"], [0, 3])

    t("~ 0 + z + x", ["x", "z"], [3, 0])
    t(["z", "x"], ["x", "z"], [3, 0])
    t([six.text_type("z"), six.text_type("x")], ["x", "z"], [3, 0])
    t([all_terms[2], all_terms[0]], ["x", "z"], [3, 0])
    t([all_terms[2], "x"], ["x", "z"], [3, 0])

    t("~ 0 + y", ["y"], [1, 2])
    t(["y"], ["y"], [1, 2])
    t([six.text_type("y")], ["y"], [1, 2])
    t([all_terms[1]], ["y"], [1, 2])

    # Formula can't have a LHS
    assert_raises(PatsyError, all_builder.subset, "a ~ a")
    # Term must exist
    assert_raises(PatsyError, all_builder.subset, "~ asdf")
    assert_raises(PatsyError, all_builder.subset, ["asdf"])
    assert_raises(PatsyError,
                  all_builder.subset, [Term(["asdf"])])
예제 #4
0
파일: utils.py 프로젝트: USC-CSSL/NTAP
def parse_formula(f_str):

    patsy_formula = ModelDesc.from_formula(f_str)

    tokenize = patsy_formula.lhs_termlist

    valid_tokenizers = list()
    for term in tokenize:
        for e in term.factors:
            code = e.code
            if code in _VALID_TOKENIZERS:
                valid_tokenizers.append(code)

    if len(valid_tokenizers) == 0:
        tokenize.insert(0, Term([EvalFactor(_DEFAULT_TOKENIZER)]))
    if len(valid_tokenizers) > 1:
        raise RuntimeError("Multiple tokenizers found in formula\n"
                           f"Specify one from {' '.join(_VALID_TOKENIZERS)}")

    preprocess = [t for t in patsy_formula.rhs_termlist if len(t.factors) > 0]
    return tokenize, preprocess
예제 #5
0
 def __patsy_get_model_desc__(self, data):
     return ModelDesc([Term([LookupFactor("Y")])],
                      [Term([LookupFactor("X")])])
예제 #6
0
def test_formula_likes():
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0, False, [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["foo0", "foo1", "foo2"])

    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False, [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["foo0", "foo1", "foo2"], [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]], ), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0, False, [[1], [2], [3]],
          ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0, False, [[1], [2], [3]],
          ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6]
                             }), pandas.DataFrame({"x": [1, 2, 3]})), {}, 0,
          False, [[1], [2], [3]], ["x"], [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6],
                         name="y"), pandas.Series([1, 2, 3], name="x")), {}, 0,
          False, [[1], [2], [3]], ["x"], [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0, False,
          [[1, 2, 3]], ["x7", "x8", "x9"], [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0, False, [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame(
            [[1]], index=[1]), pandas.DataFrame([[1]], index=[2])), {}, 0)

    # Foreign ModelDesc factories
    class ForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])

    foreign_model = ForeignModelSource()
    t(foreign_model, {
        "Y": [1, 2],
        "X": [[1, 2], [3, 4]]
    }, 0, True, [[1, 2], [3, 4]], ["X[0]", "X[1]"], [[1], [2]], ["Y"])

    class BadForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return data

    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3], [1, 4]], ["Intercept", "x"], [[1], [2]], ["y"])
    t("~ x", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # ModelDesc
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0, True, [[1.5], [2.5], [3.5]], ["x"])
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0, True, [[1, 1.5], [1, 2.5], [1, 3.5]],
      ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, {
        "x": [1.5, 2.5, 3.5],
        "y": [10, 20, 30]
    }, 0, True, [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    termlists = (
        [],
        [Term([LookupFactor("x")])],
        [Term([]), Term([LookupFactor("x")])],
    )
    builders = design_matrix_builders(termlists, lambda: iter([{
        "x": [1, 2, 3]
    }]))
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignMatrixBuilder
    t(builders[2], {"x": [10, 20, 30]}, 0, True, [[1, 10], [1, 20], [1, 30]],
      ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"], [[10], [20], [30]],
      ["x"])

    # check depth arguments
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0, True, [[1, 1], [1, 2], [1, 3]],
      ["Intercept", "x_in_env"])
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))

    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1, True, [[1, 1], [1, 2], [1, 3]],
          ["Intercept", "x_in_env"])

    check_nested_call()
    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    e = EvalEnvironment.capture(0)

    def check_nested_call_2():
        x_in_env = "asdf"
        t("~ x_in_env", {}, e, True, [[1, 1], [1, 2], [1, 3]],
          ["Intercept", "x_in_env"])

    check_nested_call_2()
예제 #7
0
def test_DesignInfo_subset():
    # For each combination of:
    #   formula, term names, term objects, mixed term name and term objects
    # check that results match subset of full build
    # and that removed variables don't hurt
    all_data = {"x": [1, 2], "y": [[3.1, 3.2], [4.1, 4.2]], "z": [5, 6]}
    all_terms = make_termlist("x", "y", "z")

    def iter_maker():
        yield all_data

    all_builder = design_matrix_builders([all_terms], iter_maker, 0)[0]
    full_matrix = build_design_matrices([all_builder], all_data)[0]

    def t(which_terms, variables, columns):
        sub_design_info = all_builder.subset(which_terms)
        sub_data = {}
        for variable in variables:
            sub_data[variable] = all_data[variable]
        sub_matrix = build_design_matrices([sub_design_info], sub_data)[0]
        sub_full_matrix = full_matrix[:, columns]
        if not isinstance(which_terms, six.string_types):
            assert len(which_terms) == len(sub_design_info.terms)
        assert np.array_equal(sub_matrix, sub_full_matrix)

    t("~ 0 + x + y + z", ["x", "y", "z"], slice(None))
    t(["x", "y", "z"], ["x", "y", "z"], slice(None))
    # Compatibility: six.PY2 wasn't added until 1.4.0, but six.PY3 exists in
    # all versions.
    if not six.PY3:
        t([unicode("x"), unicode("y"),
           unicode("z")], ["x", "y", "z"], slice(None))
    t(all_terms, ["x", "y", "z"], slice(None))
    t([all_terms[0], "y", all_terms[2]], ["x", "y", "z"], slice(None))

    t("~ 0 + x + z", ["x", "z"], [0, 3])
    t(["x", "z"], ["x", "z"], [0, 3])
    # Compatibility: six.PY2 wasn't added until 1.4.0, but six.PY3 exists in
    # all versions.
    if not six.PY3:
        t([unicode("x"), unicode("z")], ["x", "z"], [0, 3])
    t([all_terms[0], all_terms[2]], ["x", "z"], [0, 3])
    t([all_terms[0], "z"], ["x", "z"], [0, 3])

    t("~ 0 + z + x", ["x", "z"], [3, 0])
    t(["z", "x"], ["x", "z"], [3, 0])
    t([six.text_type("z"), six.text_type("x")], ["x", "z"], [3, 0])
    t([all_terms[2], all_terms[0]], ["x", "z"], [3, 0])
    t([all_terms[2], "x"], ["x", "z"], [3, 0])

    t("~ 0 + y", ["y"], [1, 2])
    t(["y"], ["y"], [1, 2])
    t([six.text_type("y")], ["y"], [1, 2])
    t([all_terms[1]], ["y"], [1, 2])

    # Formula can't have a LHS
    pytest.raises(PatsyError, all_builder.subset, "a ~ a")
    # Term must exist
    pytest.raises(KeyError, all_builder.subset, "~ asdf")
    pytest.raises(KeyError, all_builder.subset, ["asdf"])
    pytest.raises(KeyError, all_builder.subset, [Term(["asdf"])])

    # Also check for a minimal DesignInfo (column names only)
    min_di = DesignInfo(["a", "b", "c"])
    min_di_subset = min_di.subset(["c", "a"])
    assert min_di_subset.column_names == ["c", "a"]
    assert min_di_subset.terms is None
예제 #8
0
def make_termlist(*entries):
    terms = []
    for entry in entries:
        terms.append(Term([LookupFactor(name) for name in entry]))
    return terms
예제 #9
0
def test_DesignInfo():
    from nose.tools import assert_raises
    class _MockFactor(object):
        def __init__(self, name):
            self._name = name

        def name(self):
            return self._name
    f_x = _MockFactor("x")
    f_y = _MockFactor("y")
    t_x = Term([f_x])
    t_y = Term([f_y])
    factor_infos = {f_x:
                      FactorInfo(f_x, "numerical", {}, num_columns=3),
                    f_y:
                      FactorInfo(f_y, "numerical", {}, num_columns=1),
                   }
    term_codings = OrderedDict([(t_x, [SubtermInfo([f_x], {}, 3)]),
                                (t_y, [SubtermInfo([f_y], {}, 1)])])
    di = DesignInfo(["x1", "x2", "x3", "y"], factor_infos, term_codings)
    assert di.column_names == ["x1", "x2", "x3", "y"]
    assert di.term_names == ["x", "y"]
    assert di.terms == [t_x, t_y]
    assert di.column_name_indexes == {"x1": 0, "x2": 1, "x3": 2, "y": 3}
    assert di.term_name_slices == {"x": slice(0, 3), "y": slice(3, 4)}
    assert di.term_slices == {t_x: slice(0, 3), t_y: slice(3, 4)}
    assert di.describe() == "x + y"

    assert di.slice(1) == slice(1, 2)
    assert di.slice("x1") == slice(0, 1)
    assert di.slice("x2") == slice(1, 2)
    assert di.slice("x3") == slice(2, 3)
    assert di.slice("x") == slice(0, 3)
    assert di.slice(t_x) == slice(0, 3)
    assert di.slice("y") == slice(3, 4)
    assert di.slice(t_y) == slice(3, 4)
    assert di.slice(slice(2, 4)) == slice(2, 4)
    assert_raises(PatsyError, di.slice, "asdf")

    # smoke test
    repr(di)

    assert_no_pickling(di)

    # One without term objects
    di = DesignInfo(["a1", "a2", "a3", "b"])
    assert di.column_names == ["a1", "a2", "a3", "b"]
    assert di.term_names == ["a1", "a2", "a3", "b"]
    assert di.terms is None
    assert di.column_name_indexes == {"a1": 0, "a2": 1, "a3": 2, "b": 3}
    assert di.term_name_slices == {"a1": slice(0, 1),
                                   "a2": slice(1, 2),
                                   "a3": slice(2, 3),
                                   "b": slice(3, 4)}
    assert di.term_slices is None
    assert di.describe() == "a1 + a2 + a3 + b"

    assert di.slice(1) == slice(1, 2)
    assert di.slice("a1") == slice(0, 1)
    assert di.slice("a2") == slice(1, 2)
    assert di.slice("a3") == slice(2, 3)
    assert di.slice("b") == slice(3, 4)

    # Check intercept handling in describe()
    assert DesignInfo(["Intercept", "a", "b"]).describe() == "1 + a + b"

    # Failure modes
    # must specify either both or neither of factor_infos and term_codings:
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y"], factor_infos=factor_infos)
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y"], term_codings=term_codings)
    # factor_infos must be a dict
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y"], list(factor_infos), term_codings)
    # wrong number of column names:
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y1", "y2"], factor_infos, term_codings)
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3"], factor_infos, term_codings)
    # name overlap problems
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "y", "y2"], factor_infos, term_codings)
    # duplicate name
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x1", "x1", "y"], factor_infos, term_codings)

    # f_y is in factor_infos, but not mentioned in any term
    term_codings_x_only = OrderedDict(term_codings)
    del term_codings_x_only[t_y]
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3"], factor_infos, term_codings_x_only)

    # f_a is in a term, but not in factor_infos
    f_a = _MockFactor("a")
    t_a = Term([f_a])
    term_codings_with_a = OrderedDict(term_codings)
    term_codings_with_a[t_a] = [SubtermInfo([f_a], {}, 1)]
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y", "a"],
                  factor_infos, term_codings_with_a)

    # bad factor_infos
    not_factor_infos = dict(factor_infos)
    not_factor_infos[f_x] = "what is this I don't even"
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y"], not_factor_infos, term_codings)

    mismatch_factor_infos = dict(factor_infos)
    mismatch_factor_infos[f_x] = FactorInfo(f_a, "numerical", {}, num_columns=3)
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y"], mismatch_factor_infos, term_codings)

    # bad term_codings
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y"], factor_infos, dict(term_codings))

    not_term_codings = OrderedDict(term_codings)
    not_term_codings["this is a string"] = term_codings[t_x]
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y"], factor_infos, not_term_codings)

    non_list_term_codings = OrderedDict(term_codings)
    non_list_term_codings[t_y] = tuple(term_codings[t_y])
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y"], factor_infos, non_list_term_codings)

    non_subterm_term_codings = OrderedDict(term_codings)
    non_subterm_term_codings[t_y][0] = "not a SubtermInfo"
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y"], factor_infos, non_subterm_term_codings)

    bad_subterm = OrderedDict(term_codings)
    # f_x is a factor in this model, but it is not a factor in t_y
    term_codings[t_y][0] = SubtermInfo([f_x], {}, 1)
    assert_raises(ValueError, DesignInfo,
                  ["x1", "x2", "x3", "y"], factor_infos, bad_subterm)

    # contrast matrix has wrong number of rows
    factor_codings_a = {f_a:
                          FactorInfo(f_a, "categorical", {},
                                     categories=["a1", "a2"])}
    term_codings_a_bad_rows = OrderedDict([
        (t_a,
         [SubtermInfo([f_a],
                      {f_a: ContrastMatrix(np.ones((3, 2)),
                                           ["[1]", "[2]"])},
                      2)])])
    assert_raises(ValueError, DesignInfo,
                  ["a[1]", "a[2]"],
                  factor_codings_a,
                  term_codings_a_bad_rows)

    # have a contrast matrix for a non-categorical factor
    t_ax = Term([f_a, f_x])
    factor_codings_ax = {f_a:
                           FactorInfo(f_a, "categorical", {},
                                      categories=["a1", "a2"]),
                         f_x:
                           FactorInfo(f_x, "numerical", {},
                                      num_columns=2)}
    term_codings_ax_extra_cm = OrderedDict([
        (t_ax,
         [SubtermInfo([f_a, f_x],
                      {f_a: ContrastMatrix(np.ones((2, 2)), ["[1]", "[2]"]),
                       f_x: ContrastMatrix(np.ones((2, 2)), ["[1]", "[2]"])},
                      4)])])
    assert_raises(ValueError, DesignInfo,
                  ["a[1]:x[1]", "a[2]:x[1]", "a[1]:x[2]", "a[2]:x[2]"],
                  factor_codings_ax,
                  term_codings_ax_extra_cm)

    # no contrast matrix for a categorical factor
    term_codings_ax_missing_cm = OrderedDict([
        (t_ax,
         [SubtermInfo([f_a, f_x],
                      {},
                      4)])])
    # This actually fails before it hits the relevant check with a KeyError,
    # but that's okay... the previous test still exercises the check.
    assert_raises((ValueError, KeyError), DesignInfo,
                  ["a[1]:x[1]", "a[2]:x[1]", "a[1]:x[2]", "a[2]:x[2]"],
                  factor_codings_ax,
                  term_codings_ax_missing_cm)

    # subterm num_columns doesn't match the value computed from the individual
    # factors
    term_codings_ax_wrong_subterm_columns = OrderedDict([
        (t_ax,
         [SubtermInfo([f_a, f_x],
                      {f_a: ContrastMatrix(np.ones((2, 3)),
                                           ["[1]", "[2]", "[3]"])},
                      # should be 2 * 3 = 6
                      5)])])
    assert_raises(ValueError, DesignInfo,
                  ["a[1]:x[1]", "a[2]:x[1]", "a[3]:x[1]",
                   "a[1]:x[2]", "a[2]:x[2]", "a[3]:x[2]"],
                  factor_codings_ax,
                  term_codings_ax_wrong_subterm_columns)
print(data[["Label", "f1", "f2", data.columns[-1]]].head())

###################################################
# Let's train a logistic regression.

formula = "Label ~ {0}".format(" + ".join(data.columns[1:]))
print(formula[:50] + " + ...")

from microsoftml import rx_logistic_regression

try:
    logregml = rx_logistic_regression(formula, data=data)
except Exception as e:
    # The error is expected because patsy cannot handle
    # so many features.
    print(e)

#########################################
# Let's skip patsy's parser to manually define the formula
# with object `ModelDesc <http://patsy.readthedocs.io/en/latest/API-reference.html?highlight=lookupfactor#patsy.ModelDesc>`_.

from patsy.desc import ModelDesc, Term
from patsy.user_util import LookupFactor

patsy_features = [Term([LookupFactor(n)]) for n in data.columns[1:]][:10]
model_formula = ModelDesc([Term([LookupFactor("Label")])], [Term([])] + patsy_features)

print(model_formula.describe() + " + ...")
logregml = rx_logistic_regression(model_formula, data=data)