Пример #1
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env,
                       NA_action):
    """Try to turn ``formula_like`` into a pair of design-matrix builders.

    The following kinds of ``formula_like`` are recognised, in this order:

    * a single ``DesignInfo`` -- used as the right-hand side and paired with
      the result built for an empty term list as the left-hand side;
    * a 2-tuple of ``DesignInfo`` objects -- returned unchanged;
    * any object exposing ``__patsy_get_model_desc__`` -- the hook is called
      with ``eval_env`` and must return a ``ModelDesc``;
    * a ``str`` -- parsed with ``ModelDesc.from_formula``;
    * a ``ModelDesc`` -- its lhs/rhs term lists are handed to
      ``design_matrix_builders``.

    :arg formula_like: The model specification to interpret.
    :arg data_iter_maker: Forwarded unchanged to ``design_matrix_builders``.
    :arg eval_env: Passed to the ``__patsy_get_model_desc__`` hook and to
      ``design_matrix_builders``; asserted to be an ``EvalEnvironment`` when
      a ``ModelDesc`` is processed.
    :arg NA_action: Forwarded unchanged to ``design_matrix_builders``.
    :returns: The ``(lhs, rhs)`` result described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignInfo):
        return (design_matrix_builders([[]], data_iter_maker, eval_env, NA_action)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
        and len(formula_like) == 2
        and isinstance(formula_like[0], DesignInfo)
        and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the original object: the error message must name
        # the object whose hook misbehaved, not the bad value it returned.
        desc_source = formula_like
        formula_like = desc_source.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                                % (desc_source,))
        # fallthrough
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker,
                                      eval_env,
                                      NA_action)
    else:
        return None
Пример #2
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env):
    """Try to turn ``formula_like`` into a pair of design-matrix builders.

    The following kinds of ``formula_like`` are recognised, in this order:

    * a single ``DesignMatrixBuilder`` -- used as the right-hand side and
      paired with a builder for an empty term list as the left-hand side;
    * a 2-tuple of ``DesignMatrixBuilder`` objects -- returned unchanged;
    * any object exposing ``__patsy_get_model_desc__`` -- the hook is called
      with ``eval_env`` and must return a ``ModelDesc``;
    * a string (``basestring``) -- ``eval_env`` is normalised via
      ``_get_env`` and the string is parsed with ``ModelDesc.from_formula``;
    * a ``ModelDesc`` -- its lhs/rhs term lists are handed to
      ``design_matrix_builders``.

    :arg formula_like: The model specification to interpret.
    :arg data_iter_maker: Forwarded unchanged to ``design_matrix_builders``.
    :arg eval_env: Passed to the ``__patsy_get_model_desc__`` hook and, for
      string formulas, through ``_get_env`` into ``ModelDesc.from_formula``.
    :returns: The ``(lhs, rhs)`` builders described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignMatrixBuilder):
        return (design_matrix_builders([[]], data_iter_maker)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
        and len(formula_like) == 2
        and isinstance(formula_like[0], DesignMatrixBuilder)
        and isinstance(formula_like[1], DesignMatrixBuilder)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the original object: the error message must name
        # the object whose hook misbehaved, not the bad value it returned.
        desc_source = formula_like
        formula_like = desc_source.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                                % (desc_source,))
        # fallthrough
    if isinstance(formula_like, basestring):
        eval_env = _get_env(eval_env)
        formula_like = ModelDesc.from_formula(formula_like, eval_env)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker)
    else:
        return None
Пример #3
0
def eval_bar(evaluator, tree):
    """Evaluation function for the bar operator AST node.

    The node's two arguments are each wrapped in a one-sided ``~`` parse
    node and turned into a ``ModelDesc``; the pair is then combined into a
    single ``RandomEffectsTerm``.

    :arg evaluator: The evaluator invoking this handler (not used here).
    :arg tree: The bar-operator parse node; must have exactly two args
      (expression on the left, grouping factor on the right).
    :returns: An ``IntermediateExpr`` whose only term is the new
      ``RandomEffectsTerm``.
    """

    assert len(tree.args) == 2
    expr_node, factor_node = tree.args

    # create model description for the expression left of the bar
    expr_node = ParseNode("~", None, [expr_node], expr_node.origin)
    expr_md = ModelDesc.from_formula(expr_node)

    # create model description for grouping factor right of the bar
    factor_node = ParseNode("~", None, [factor_node], factor_node.origin)
    factor_md = ModelDesc.from_formula(factor_node)
    # Strip the intercept from the grouping side. It may already be absent
    # (e.g. the user wrote ``0 + g``); an unconditional ``list.remove`` would
    # then fail with an unhelpful ValueError.
    if INTERCEPT in factor_md.rhs_termlist:
        factor_md.rhs_termlist.remove(INTERCEPT)

    # combine these in a random effects term
    ret = RandomEffectsTerm(expr=expr_md, factor=factor_md)

    # return corresponding intermediate expression
    return IntermediateExpr(False, None, False, [ret])
Пример #4
0
def parse_formula(form):
    """Split a formula into its response, regressors and fixed effects.

    The formula is parsed by patsy; every term is converted with
    ``parse_term`` and each right-hand-side term is labelled by
    ``classify_term``.

    Returns a 4-tuple ``(y, x, fe, intercept)``:

    * ``y`` -- the first left-hand-side term, squeezed;
    * ``x`` -- squeezed right-hand-side terms classified ``'continuous'``;
    * ``fe`` -- squeezed right-hand-side terms classified ``'categorical'``,
      after ``strip_cat``;
    * ``intercept`` -- ``True`` if any right-hand-side term is classified
      ``'intercept'``.
    """
    # patsy does the actual formula parsing
    model = ModelDesc.from_formula(form)

    # translate patsy terms into this module's own representation
    lhs_parsed = [parse_term(term) for term in model.lhs_termlist]
    rhs_parsed = [parse_term(term) for term in model.rhs_termlist]
    rhs_kinds = [classify_term(term) for term in rhs_parsed]
    labelled = list(zip(rhs_parsed, rhs_kinds))

    # pick the pieces apart by classification
    response = squeeze_term(lhs_parsed[0])
    continuous = [squeeze_term(term)
                  for term, kind in labelled if kind == 'continuous']
    fixed_effects = [squeeze_term(strip_cat(term))
                     for term, kind in labelled if kind == 'categorical']
    has_intercept = any(kind == 'intercept' for kind in rhs_kinds)

    return response, continuous, fixed_effects, has_intercept
Пример #5
0
def get_matrices(data, formula, env=0):
    """Given the data and a formula, build Z and X matrices.

    The formula is evaluated with ``evaluate_formula``; right-hand-side terms
    that are ``RandomEffectsTerm`` instances are treated as random effects
    and all other terms as fixed effects.

    :arg data: Data passed to patsy's ``dmatrix`` / ``dmatrices``.
    :arg formula: Formula understood by ``evaluate_formula``.
    :arg env: Evaluation environment forwarded to every patsy call, for both
      the random-effects and the fixed-effects design matrices.
    :returns: ``(X, Z, Lambdat, y, theta0, thfun)`` where ``X`` and ``y`` are
      the fixed-effects design matrix and response as numpy arrays, ``Z`` is
      the transposed horizontal stack of the per-term blocks from
      ``buildzi``, ``Lambdat`` is the block-diagonal (CSC) combination of the
      per-term blocks from ``buildlambdati``, ``theta0`` is the concatenation
      of the per-term initial parameters, and ``thfun`` maps a parameter
      vector ``theta`` to ``theta[Lind]``.
    """
    model_description = evaluate_formula(formula)

    # split right-hand side into fixed- and random-effects terms
    fixef_terms, randef_terms = [], []
    for term in model_description.rhs_termlist:
        if isinstance(term, RandomEffectsTerm):
            randef_terms.append(term)
        else:
            fixef_terms.append(term)

    Zis = []
    Lambdatis = []
    thetais = []
    ps = []
    ls = []
    for ret in randef_terms:
        X = dmatrix(ret.expr, data, env)
        J = dmatrix(ret.factor, data, env)
        # column counts of the expression and grouping design matrices
        _, p = X.shape
        _, num_group_cols = J.shape
        ps.append(p)
        ls.append(num_group_cols)
        Zis.append(buildzi(X, J))
        Lambdati, thetai = buildlambdati(p, num_group_cols)
        Lambdatis.append(Lambdati)
        thetais.append(thetai)

    Lind = buildlind(ps, ls)

    def thfun(theta):
        return theta[Lind]

    Z = hstack(Zis).T
    Lambdat = block_diag(Lambdatis, format='csc')

    # Use the same evaluation environment as for the random-effects terms so
    # that names in the fixed-effects part resolve consistently.
    y, X = dmatrices(ModelDesc(model_description.lhs_termlist, fixef_terms),
                     data, env)

    y = np.asarray(y)
    X = np.asarray(X)

    # initial value of theta
    theta0 = np.concatenate(thetais)

    return X, Z, Lambdat, y, theta0, thfun
Пример #6
0
def parse_formula(form):
    """Parse a formula string into this module's term objects using patsy.

    :arg form: Formula to hand to ``ModelDesc.from_formula``.
    :returns: ``(y_terms, x_terms)`` when the formula has a left-hand side,
      where ``y_terms`` comes from ``parse_factor`` applied to the first
      factor of the first left-hand-side term and ``x_terms`` is a
      ``Formula`` built from the parsed right-hand-side terms; just
      ``x_terms`` when there is no left-hand side; ``None`` (after printing
      a hint) when patsy cannot be imported.
    """
    try:
        from patsy.desc import ModelDesc
    except ImportError:
        # Only a missing/broken patsy install is expected here; anything else
        # (e.g. KeyboardInterrupt) must not be swallowed.
        print('Please install patsy for formula parsing')
        return

    # use patsy for formula parse
    desc = ModelDesc.from_formula(form)
    lhs, rhs = desc.lhs_termlist, desc.rhs_termlist

    # convert to string lists
    x_terms = Formula(*[parse_term(t) for t in rhs])
    if len(lhs) > 0:
        y_terms = parse_factor(lhs[0].factors[0])
        return y_terms, x_terms
    else:
        return x_terms
Пример #7
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env,
                       NA_action):
    """Try to turn ``formula_like`` into a pair of design-matrix builders.

    The following kinds of ``formula_like`` are recognised, in this order:

    * a single ``DesignInfo`` -- used as the right-hand side and paired with
      the result built for an empty term list as the left-hand side;
    * a 2-tuple of ``DesignInfo`` objects -- returned unchanged;
    * any object exposing ``__patsy_get_model_desc__`` -- the hook is called
      with ``eval_env`` and must return a ``ModelDesc``;
    * on Python 2 only, a ``unicode`` object -- encoded to an ascii ``str``
      before being treated as a formula string;
    * a ``str`` -- parsed with ``ModelDesc.from_formula``;
    * a ``ModelDesc`` -- its lhs/rhs term lists are handed to
      ``design_matrix_builders``.

    :arg formula_like: The model specification to interpret.
    :arg data_iter_maker: Forwarded unchanged to ``design_matrix_builders``.
    :arg eval_env: Passed to the ``__patsy_get_model_desc__`` hook and to
      ``design_matrix_builders``; asserted to be an ``EvalEnvironment`` when
      a ``ModelDesc`` is processed.
    :arg NA_action: Forwarded unchanged to ``design_matrix_builders``.
    :returns: The ``(lhs, rhs)`` result described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``, or if a Python 2 ``unicode`` formula
      contains non-ascii characters.
    """
    if isinstance(formula_like, DesignInfo):
        return (design_matrix_builders([[]], data_iter_maker, eval_env, NA_action)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
        and len(formula_like) == 2
        and isinstance(formula_like[0], DesignInfo)
        and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the original object: the error message must name
        # the object whose hook misbehaved, not the bad value it returned.
        desc_source = formula_like
        formula_like = desc_source.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                                % (desc_source,))
        # fallthrough
    if not six.PY3 and isinstance(formula_like, unicode):
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker,
                                      eval_env,
                                      NA_action)
    else:
        return None
Пример #8
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to turn ``formula_like`` into a pair of design-matrix builders.

    Recognised inputs, checked in order: a single ``DesignInfo`` (used as the
    right-hand side, paired with the result built for an empty term list), a
    2-tuple of ``DesignInfo`` objects (returned unchanged), an object with a
    ``__patsy_get_model_desc__`` hook, a Python 2 ``unicode`` formula
    (encoded to ascii first), a ``str`` formula, and a ``ModelDesc``.

    :arg formula_like: The model specification to interpret.
    :arg data_iter_maker: Forwarded unchanged to ``design_matrix_builders``.
    :arg eval_env: Passed to the ``__patsy_get_model_desc__`` hook and to
      ``design_matrix_builders``; asserted to be an ``EvalEnvironment`` when
      a ``ModelDesc`` is processed.
    :arg NA_action: Forwarded unchanged to ``design_matrix_builders``.
    :returns: The ``(lhs, rhs)`` result, or ``None`` when ``formula_like`` is
      none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``, or if a Python 2 ``unicode`` formula
      contains non-ascii characters.
    """
    if isinstance(formula_like, DesignInfo):
        return (design_matrix_builders([[]], data_iter_maker, eval_env,
                                       NA_action)[0], formula_like)
    if (isinstance(formula_like, tuple) and len(formula_like) == 2
            and isinstance(formula_like[0], DesignInfo)
            and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Remember the provider so the error below names the object whose
        # hook misbehaved rather than the bad value it returned.
        desc_source = formula_like
        formula_like = desc_source.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__" %
                             (desc_source, ))
        # fallthrough
    if not six.PY3 and isinstance(formula_like, unicode):
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker, eval_env, NA_action)
    else:
        return None
Пример #9
0
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to turn ``formula_like`` into a pair of design-matrix builders.

    Recognised inputs, checked in order: a single ``DesignMatrixBuilder``
    (used as the right-hand side, paired with a builder for an empty term
    list), a 2-tuple of ``DesignMatrixBuilder`` objects (returned unchanged),
    an object with a ``__patsy_get_model_desc__`` hook, a string formula
    (``basestring``), and a ``ModelDesc``.

    :arg formula_like: The model specification to interpret.
    :arg data_iter_maker: Forwarded unchanged to ``design_matrix_builders``.
    :arg eval_env: Passed to the ``__patsy_get_model_desc__`` hook and to
      ``ModelDesc.from_formula``; asserted to be an ``EvalEnvironment`` when
      a string formula is parsed.
    :arg NA_action: Forwarded unchanged to ``design_matrix_builders``.
    :returns: The ``(lhs, rhs)`` builders, or ``None`` when ``formula_like``
      is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignMatrixBuilder):
        return (design_matrix_builders([[]], data_iter_maker,
                                       NA_action)[0], formula_like)
    if (isinstance(formula_like, tuple) and len(formula_like) == 2
            and isinstance(formula_like[0], DesignMatrixBuilder)
            and isinstance(formula_like[1], DesignMatrixBuilder)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Remember the provider so the error below names the object whose
        # hook misbehaved rather than the bad value it returned.
        desc_source = formula_like
        formula_like = desc_source.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__" %
                             (desc_source, ))
        # fallthrough
    if isinstance(formula_like, basestring):
        assert isinstance(eval_env, EvalEnvironment)
        formula_like = ModelDesc.from_formula(formula_like, eval_env)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker, NA_action)
    else:
        return None
Пример #10
0
 def __patsy_get_model_desc__(self, data):
     """Return a fixed ``ModelDesc``: lookup of ``Y`` on the left-hand side,
     lookup of ``X`` on the right-hand side. ``data`` is ignored."""
     lhs_terms = [Term([LookupFactor("Y")])]
     rhs_terms = [Term([LookupFactor("X")])]
     return ModelDesc(lhs_terms, rhs_terms)
Пример #11
0
def test_formula_likes():
    """Exercise the different kinds of "formula-like" inputs.

    Every check goes through the helpers ``t`` (expected to succeed) and
    ``t_invalid`` (expected to fail), which are defined elsewhere.
    NOTE(review): judging from the call sites, ``t`` presumably takes
    ``(formula_like, data, depth_or_env, expect_full_designs, rhs_values,
    rhs_names[, lhs_values, lhs_names])`` -- confirm against its definition.

    The final section relies on which local variables exist in which stack
    frame, so statements there must not be moved between functions.
    """
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    # A DesignMatrix carries its own column prefix ("foo" instead of "x")
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0, False, [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["foo0", "foo1", "foo2"])

    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False, [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["foo0", "foo1", "foo2"], [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]], ), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0, False, [[1], [2], [3]],
          ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0, False, [[1], [2], [3]],
          ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6]
                             }), pandas.DataFrame({"x": [1, 2, 3]})), {}, 0,
          False, [[1], [2], [3]], ["x"], [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6],
                         name="y"), pandas.Series([1, 2, 3], name="x")), {}, 0,
          False, [[1], [2], [3]], ["x"], [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0, False,
          [[1, 2, 3]], ["x7", "x8", "x9"], [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0, False, [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame(
            [[1]], index=[1]), pandas.DataFrame([[1]], index=[2])), {}, 0)

    # Foreign ModelDesc factories
    class ForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])

    foreign_model = ForeignModelSource()
    t(foreign_model, {
        "Y": [1, 2],
        "X": [[1, 2], [3, 4]]
    }, 0, True, [[1, 2], [3, 4]], ["X[0]", "X[1]"], [[1], [2]], ["Y"])

    # A hook that returns its argument instead of a ModelDesc must be rejected
    class BadForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return data

    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3], [1, 4]], ["Intercept", "x"], [[1], [2]], ["y"])
    t("~ x", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # ModelDesc
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0, True, [[1.5], [2.5], [3.5]], ["x"])
    # Term([]) is the empty term; it yields the "Intercept" column below
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0, True, [[1, 1.5], [1, 2.5], [1, 3.5]],
      ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, {
        "x": [1.5, 2.5, 3.5],
        "y": [10, 20, 30]
    }, 0, True, [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    termlists = (
        [],
        [Term([LookupFactor("x")])],
        [Term([]), Term([LookupFactor("x")])],
    )
    builders = design_matrix_builders(termlists, lambda: iter([{
        "x": [1, 2, 3]
    }]))
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignMatrixBuilder
    t(builders[2], {"x": [10, 20, 30]}, 0, True, [[1, 10], [1, 20], [1, 30]],
      ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"], [[10], [20], [30]],
      ["x"])

    # check depth arguments
    # x_in_env is a local of *this* frame; the formulas below look it up
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0, True, [[1, 1], [1, 2], [1, 3]],
      ["Intercept", "x_in_env"])
    # values supplied in the data dict take precedence over the environment
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))

    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        # deliberately shadows the outer list; depth 1 must skip this frame
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1, True, [[1, 1], [1, 2], [1, 3]],
          ["Intercept", "x_in_env"])

    check_nested_call()
    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    e = EvalEnvironment.capture(0)

    def check_nested_call_2():
        # the captured environment ``e`` is used, not this frame's "asdf"
        x_in_env = "asdf"
        t("~ x_in_env", {}, e, True, [[1, 1], [1, 2], [1, 3]],
          ["Intercept", "x_in_env"])

    check_nested_call_2()
Пример #12
0
    def subset(self, which_terms):
        """Return a new :class:`DesignInfo` describing only some of the terms
        described by this one.

        Given a ``design_info`` with terms ``x``, ``y`` and ``z``::

          design_info2 = design_info.subset(["x", "z"])

        produces a DesignInfo for design matrices that hold just the columns
        belonging to ``x`` and ``z``. Assuming each term contributes a single
        column, these two expressions then generally give the same result::

          build_design_matrix([design_info], data)[0][:, [0, 2]]
          build_design_matrix([design_info2], data)[0]

        The important difference is that the second form does not need
        ``data`` to contain ``y`` at all, which is convenient when predicting
        from part of a model (R typically makes you supply dummy values for
        ``y`` in that situation).

        When the terms are given as a formula, keep in mind that formulas
        include an intercept by default; write ``0`` or ``-1`` in the formula
        to leave it out.

        The terms appear in the result in the order requested, so this method
        can also reorder the terms of a design matrix.

        The result is generally *not* the same as building a new model from
        scratch. For instance::

            design1 = dmatrix("1 + C(a)", data)
            design2 = design1.subset("0 + C(a)")
            design3 = dmatrix("0 + C(a)", data)

        Both ``design2`` and ``design3`` encode ``C(a)`` without an intercept,
        but ``design3`` uses a full-rank encoding for ``C(a)`` whereas
        ``design2`` keeps the reduced-rank encoding inherited from
        ``design1``.

        :arg which_terms: The terms to keep in the new :class:`DesignInfo`.
          A string is parsed as a (right-hand-side-only) formula and the
          names of its terms are used. Otherwise it is iterated, and may mix
          term names (strings) with :class:`Term` objects.

        .. versionadded:: 0.2.0
           New method on the class DesignMatrixBuilder.

        .. versionchanged:: 0.4.0
           Moved from DesignMatrixBuilder to DesignInfo, as part of the
           removal of DesignMatrixBuilder.

        """
        if isinstance(which_terms, str):
            parsed = ModelDesc.from_formula(which_terms)
            if parsed.lhs_termlist:
                raise PatsyError("right-hand-side-only formula required")
            which_terms = [rhs_term.name() for rhs_term in parsed.rhs_termlist]

        if self.term_codings is None:
            # Minimal DesignInfo: only names and column slices are available.
            # An unknown name surfaces as a KeyError from the slice lookup.
            kept_columns = []
            for term_name in which_terms:
                column_slice = self.term_name_slices[term_name]
                kept_columns.extend(self.column_names[column_slice])
            return DesignInfo(kept_columns)

        # Full DesignInfo: carry over factor infos and term codings as well.
        term_by_name = {known.name(): known for known in self.term_codings}

        kept_columns = []
        kept_factor_infos = {}
        kept_term_codings = OrderedDict()
        for requested in which_terms:
            # Accept either a term name or a Term object.
            term = term_by_name.get(requested, requested)
            # An unknown term surfaces as a KeyError from this lookup.
            column_slice = self.term_slices[term]
            kept_columns.extend(self.column_names[column_slice])
            for factor in term.factors:
                kept_factor_infos[factor] = self.factor_infos[factor]
            kept_term_codings[term] = self.term_codings[term]
        return DesignInfo(kept_columns,
                          factor_infos=kept_factor_infos,
                          term_codings=kept_term_codings)
Пример #13
0
    def subset(self, which_terms):
        """Derive a :class:`DesignInfo` that covers only a chosen subset of
        this object's terms.

        Example: if ``design_info`` has the terms ``x``, ``y`` and ``z``,
        then::

          design_info2 = design_info.subset(["x", "z"])

        yields a DesignInfo whose design matrices contain only the columns of
        ``x`` and ``z``. If every term produces exactly one column, the
        following two expressions will in general be equal::

          build_design_matrix([design_info], data)[0][:, [0, 2]]
          build_design_matrix([design_info2], data)[0]

        In the second expression, however, ``data`` is not required to
        provide values for ``y``. That matters for prediction from a partial
        model, where R usually insists on dummy values for ``y``.

        A formula passed as ``which_terms`` behaves like any other formula:
        the intercept is included unless it is removed with ``0`` or ``-1``.

        Because the result follows the order of ``which_terms``, the method
        may also be used to reorder terms.

        Subsetting generally does *not* give the same result as fitting a new
        model directly::

            design1 = dmatrix("1 + C(a)", data)
            design2 = design1.subset("0 + C(a)")
            design3 = dmatrix("0 + C(a)", data)

        ``design2`` and ``design3`` both encode ``C(a)`` with no intercept,
        yet ``design3`` gets a full-rank encoding while ``design2`` reuses
        the reduced-rank encoding of ``design1``.

        :arg which_terms: Terms to retain in the new :class:`DesignInfo`.
          If a string, it is parsed as a formula (which must have no
          left-hand side) and the names of the resulting terms are used. If
          a list, it may contain a mixture of term names (as strings) and
          :class:`Term` objects.

        .. versionadded:: 0.2.0
           New method on the class DesignMatrixBuilder.

        .. versionchanged:: 0.4.0
           Moved from DesignMatrixBuilder to DesignInfo, as part of the
           removal of DesignMatrixBuilder.

        """
        if isinstance(which_terms, str):
            formula_desc = ModelDesc.from_formula(which_terms)
            if formula_desc.lhs_termlist:
                raise PatsyError("right-hand-side-only formula required")
            which_terms = [item.name() for item in formula_desc.rhs_termlist]

        has_codings = self.term_codings is not None
        if not has_codings:
            # Minimal DesignInfo -- names only. Unknown names are allowed to
            # escape as KeyError.
            selected_names = []
            for name in which_terms:
                selected_names.extend(
                    self.column_names[self.term_name_slices[name]])
            return DesignInfo(selected_names)

        # Map every known term's name to the term itself so callers may pass
        # either form.
        name_to_term = {}
        for coded_term in self.term_codings:
            name_to_term[coded_term.name()] = coded_term

        selected_names = []
        selected_factor_infos = {}
        selected_codings = OrderedDict()
        for wanted in which_terms:
            term = name_to_term.get(wanted, wanted)
            # Unknown terms are allowed to escape as KeyError here.
            selected_names.extend(self.column_names[self.term_slices[term]])
            for factor in term.factors:
                selected_factor_infos[factor] = self.factor_infos[factor]
            selected_codings[term] = self.term_codings[term]
        return DesignInfo(selected_names,
                          factor_infos=selected_factor_infos,
                          term_codings=selected_codings)
Пример #14
0
    def subset(self, which_terms):
        """Return a new :class:`DesignMatrixBuilder` restricted to a subset
        of this builder's terms.

        If `builder` has the terms `x`, `y` and `z`, then::

          builder2 = builder.subset(["x", "z"])

        gives a builder whose design matrices contain only the columns for
        `x` and `z`. Assuming each term produces a single column, these two
        expressions will in general be equal::

          build_design_matrix([builder], data)[0][:, [0, 2]]
          build_design_matrix([builder2], data)[0]

        The second form, though, does not need `data` to hold any values for
        `y`, which helps when predicting from part of a model (R usually
        makes you provide dummy values for `y` there).

        A formula given as `which_terms` includes the intercept by default,
        like any formula; use `0` or `-1` to drop it.

        :arg which_terms: The terms to keep in the new
          :class:`DesignMatrixBuilder`. A string is parsed as a formula
          (right-hand side only) and the names of its terms are used. A list
          may mix term names (as strings) and :class:`Term` objects.
        :raises PatsyError: If the formula has a left-hand side, or a
          requested term is not part of this builder.

        .. versionadded: 0.2.0
        """
        evaluator_by_factor = {}
        for known_evaluator in self._evaluators:
            evaluator_by_factor[known_evaluator.factor] = known_evaluator
        info = self.design_info
        term_by_name = dict(zip(info.term_names, info.terms))
        if isinstance(which_terms, str):
            # The EvalEnvironment below is only a placeholder needed to build
            # the ModelDesc. Terms cannot be matched with == (that compares
            # factors, which in turn compare EvalEnvironments), so only the
            # term *names* are taken from the parsed formula, and those do
            # not depend on the environment.
            placeholder_env = EvalEnvironment({})
            parsed = ModelDesc.from_formula(which_terms, placeholder_env)
            if parsed.lhs_termlist:
                raise PatsyError("right-hand-side-only formula required")
            which_terms = [rhs_term.name() for rhs_term in parsed.rhs_termlist]
        kept_terms = []
        kept_evaluators = set()
        kept_column_builders = {}
        for requested in which_terms:
            if not isinstance(requested, six.string_types):
                term = requested
            elif requested in term_by_name:
                term = term_by_name[requested]
            else:
                raise PatsyError("requested term %r not found in "
                                 "this DesignMatrixBuilder"
                                 % (requested,))
            if term not in self._termlist:
                raise PatsyError("requested term '%s' not found in this "
                                 "DesignMatrixBuilder" % (term,))
            kept_evaluators.update(evaluator_by_factor[factor]
                                   for factor in term.factors)
            kept_terms.append(term)
            kept_column_builders[term] = self._term_to_column_builders[term]
        return DesignMatrixBuilder(kept_terms,
                                   kept_evaluators,
                                   kept_column_builders)
Пример #15
0
    def subset(self, which_terms):
        """Build a :class:`DesignMatrixBuilder` that keeps only some of the
        terms of this one.

        With a `builder` that has terms `x`, `y` and `z`::

          builder2 = builder.subset(["x", "z"])

        the new builder produces design matrices holding only the columns of
        `x` and `z`. Provided each term yields one column, the following two
        expressions will generally agree::

          build_design_matrix([builder], data)[0][:, [0, 2]]
          build_design_matrix([builder2], data)[0]

        Crucially, in the second expression `data` need not contain `y`.
        This is handy for prediction from a sub-model, where R usually
        demands dummy values for `y`.

        As with any formula, a formula passed as `which_terms` includes the
        intercept unless `0` or `-1` is used.

        :arg which_terms: Terms to retain in the new
          :class:`DesignMatrixBuilder`. If a string, it is parsed as a
          formula and the names of the resulting terms are used. If a list,
          it may contain term names (as strings) and :class:`Term` objects
          in any mixture.
        :raises PatsyError: If the formula has a left-hand side, or a
          requested term is not part of this builder.

        .. versionadded: 0.2.0
        """
        factor_lookup = {}
        for ev in self._evaluators:
            factor_lookup[ev.factor] = ev
        info = self.design_info
        name_lookup = dict(zip(info.term_names, info.terms))
        if isinstance(which_terms, basestring):
            # Only the term *names* from the parsed formula are used (Term
            # objects cannot be matched with ==, since that compares factors
            # and thereby EvalEnvironments). Names do not depend on the
            # environment, so an empty placeholder environment is enough to
            # let the ModelDesc be created.
            dummy_env = EvalEnvironment({})
            formula_desc = ModelDesc.from_formula(which_terms, dummy_env)
            if formula_desc.lhs_termlist:
                raise PatsyError("right-hand-side-only formula required")
            which_terms = [item.name() for item in formula_desc.rhs_termlist]
        selected_terms = []
        selected_evaluators = set()
        selected_column_builders = {}
        for wanted in which_terms:
            term = wanted
            if isinstance(wanted, basestring):
                if wanted not in name_lookup:
                    raise PatsyError("requested term %r not found in "
                                     "this DesignMatrixBuilder" %
                                     (wanted, ))
                term = name_lookup[wanted]
            if term not in self._termlist:
                raise PatsyError("requested term '%s' not found in this "
                                 "DesignMatrixBuilder" % (term, ))
            for factor in term.factors:
                selected_evaluators.add(factor_lookup[factor])
            selected_terms.append(term)
            selected_column_builders[term] = self._term_to_column_builders[term]
        return DesignMatrixBuilder(selected_terms, selected_evaluators,
                                   selected_column_builders)
Пример #16
0
def group_design(
    spreadsheet: Path,
    contrasts: list[dict],
    variables: list[dict],
    subjects: list[str],
) -> tuple[dict[str, list[float]], list[tuple], list[str], list[str]]:
    """Build the group-level design matrix together with its contrasts.

    Args:
        spreadsheet: Forwarded to ``prepare_data_frame`` to load the data.
        contrasts: Contrast specifications. Entries whose ``"type"`` is
            ``"t"`` are read for their ``"variable"`` (a one-element
            sequence), ``"values"`` (mapping of level name to weight) and
            ``"name"`` keys. The full list is also passed to
            ``_generate_rhs``.
        variables: Forwarded to ``prepare_data_frame``.
        subjects: Forwarded to ``prepare_data_frame``; its length is used
            for the intercept-only fallback.

    Returns:
        ``(regressor_list, contrast_list, contrast_numbers, contrast_names)``.
        When the design matrix has at least as many columns as rows, the
        result of ``intercept_only_design(len(subjects))`` is returned
        instead.
    """
    data_frame = prepare_data_frame(spreadsheet, variables, subjects)

    # Drop columns that hold a single distinct value. ``nunique`` ignores
    # NA, so a missing value does not count as an extra level.
    has_variance = data_frame.apply(pd.Series.nunique) > 1
    assert isinstance(has_variance, pd.Series)
    data_frame = data_frame.loc[:, has_variance]

    # Only the right-hand side of the model is needed; the left-hand side
    # stays empty.
    lhs: list[Term] = []
    rhs = _generate_rhs(contrasts, has_variance)

    # Let patsy turn the model description into a design matrix.
    design = dmatrix(ModelDesc(lhs, rhs), data_frame, return_type="dataframe")
    _check_multicollinearity(design)
    design_info = design.design_info
    design_columns = design.columns

    # Reference grid for the lsmeans: numeric columns are pinned to 0.0,
    # every other column contributes each of its observed values.
    grid_levels = []
    for column_name in data_frame.columns:
        if is_numeric_dtype(data_frame[column_name]):
            grid_levels.append((0.0, ))
        else:
            grid_levels.append(data_frame[column_name].unique())
    grid = pd.DataFrame(list(product(*grid_levels)),
                        columns=data_frame.columns)
    reference_design = dmatrix(design_info, grid, return_type="dataframe")

    # Collected (name, matrix) pairs, one per contrast.
    contrast_matrices: list[tuple[str, pd.DataFrame]] = []

    # One contrast per model term, constraining all of the term's design
    # columns to zero.
    for term_name, column_slice in design_info.term_name_slices.items():
        zero_constraint = dict.fromkeys(
            design_info.column_names[column_slice], 0)
        linear_constraint = design_info.linear_constraint(zero_constraint)

        assert np.all(linear_constraint.variable_names == design_columns)

        term_matrix = pd.DataFrame(linear_constraint.coefs,
                                   columns=design_columns)

        # The intercept term is reported in lower case.
        label = "intercept" if term_name == "Intercept" else term_name
        contrast_matrices.append((label, term_matrix))

    # One extra contrast for every user-defined t contrast.
    for contrast in contrasts:
        if contrast["type"] != "t":
            continue

        (variable, ) = contrast["variable"]
        levels: list[str] = list(data_frame[variable].unique())

        # The lsmeans matrix has one row per level of the variable. Each
        # row is a contrast vector: the mean of the reference design rows
        # that belong to this level (e.g. one row for patients and one
        # for controls).
        lsmeans = pd.DataFrame(index=levels, columns=design_columns)
        for level in levels:
            rows_at_level = reference_design.loc[grid[variable] == level]
            lsmeans.loc[level] = rows_at_level.mean()

        # Only weights for levels that actually occur in the data are used.
        weights_by_level = contrast["values"]
        selected_levels = [
            name for name in weights_by_level.keys() if name in levels
        ]
        weights = [weights_by_level[name] for name in selected_levels]

        # A hypothesis such as patient-control=0, given as
        # {"patient": 1, "control": -1}, becomes a single contrast vector:
        # the weighted sum of the per-level lsmeans rows.
        combined = lsmeans.loc[selected_levels].mul(weights, axis=0).sum()
        contrast_matrices.append(
            (f"{contrast['name']}",
             pd.DataFrame([combined], columns=design_columns)))

    n_points, n_regressors = design.shape

    # With at least as many regressors as data points, fall back to the
    # intercept-only design.
    if n_regressors >= n_points:
        logger.warning("Reverting to simple intercept only design. \n"
                       f"nevs ({n_regressors}) >= npts ({n_points})")
        return intercept_only_design(len(subjects))

    regressor_list = design.to_dict(orient="list", into=OrderedDict)
    contrast_list, contrast_numbers, contrast_names = _make_contrasts_list(
        contrast_matrices)

    return regressor_list, contrast_list, contrast_numbers, contrast_names
print(data[["Label", "f1", "f2", data.columns[-1]]].head())

###################################################
# Train a logistic regression. The formula string lists every column
# after the first one as a feature; only its beginning is printed.

formula = "Label ~ " + " + ".join(data.columns[1:])
print(formula[:50] + " + ...")

from microsoftml import rx_logistic_regression

try:
    logregml = rx_logistic_regression(formula, data=data)
except Exception as err:
    # This failure is expected: patsy cannot handle a formula
    # with this many features.
    print(err)

#########################################
# Bypass patsy's parser and build the formula by hand
# with a `ModelDesc <http://patsy.readthedocs.io/en/latest/API-reference.html?highlight=lookupfactor#patsy.ModelDesc>`_ object.

from patsy.desc import ModelDesc, Term
from patsy.user_util import LookupFactor

# Only the first ten feature columns are turned into terms.
patsy_features = [Term([LookupFactor(n)]) for n in data.columns[1:11]]
# Left-hand side: the label. Right-hand side: the empty term (patsy's
# intercept) followed by the selected features.
model_formula = ModelDesc([Term([LookupFactor("Label")])],
                          [Term([]), *patsy_features])

print(model_formula.describe() + " + ...")
logregml = rx_logistic_regression(model_formula, data=data)