示例#1
0
文件: mixed.py 项目: turbach/mixed
def get_matrices(data, formula, env=0):
    """Given the data and a formula, build Z and X matrices.

    The right-hand side of the parsed formula is split into random-effects
    terms (instances of ``RandomEffectsTerm``) and fixed-effects terms (all
    others).  Each random-effects term contributes one block built by
    ``buildzi``; the fixed-effects terms and the left-hand side are turned
    into dense arrays.

    Parameters
    ----------
    data
        Data handed unchanged to patsy's ``dmatrix`` / ``dmatrices``.
    formula
        Formula passed to ``evaluate_formula``.
    env
        Evaluation environment forwarded to every patsy call (patsy's
        ``eval_env``).  It is now used for the fixed-effects part as well,
        so both parts of the model resolve variables in the same namespace.

    Returns
    -------
    tuple
        ``(X, Z, Lambdat, y, theta0, thfun)`` where ``X`` and ``y`` are
        ``numpy`` arrays of the fixed-effects design and the response, ``Z``
        is the transpose of the horizontally stacked per-term blocks,
        ``Lambdat`` is the block-diagonal (CSC) combination of the per-term
        ``Lambdati`` blocks, ``theta0`` is the concatenation of the per-term
        initial values and ``thfun(theta)`` returns ``theta[Lind]``.
    """
    model_description = evaluate_formula(formula)

    fixef_terms, randef_terms = [], []
    for term in model_description.rhs_termlist:
        if isinstance(term, RandomEffectsTerm):
            randef_terms.append(term)
        else:
            fixef_terms.append(term)

    Zis = []
    Lambdatis = []
    thetais = []
    ps = []
    ls = []
    # NOTE: the patsy calls stay directly in this function's body.  When
    # ``env`` is an integer depth it is counted from the frame that calls
    # patsy, so wrapping the calls in a helper or comprehension would shift
    # the lookup by one frame.
    for ret in randef_terms:
        Xi = dmatrix(ret.expr, data, env)
        Ji = dmatrix(ret.factor, data, env)
        _, n_cols = Xi.shape
        _, n_levels = Ji.shape
        ps.append(n_cols)
        ls.append(n_levels)
        Zis.append(buildzi(Xi, Ji))
        Lambdati, thetai = buildlambdati(n_cols, n_levels)
        Lambdatis.append(Lambdati)
        thetais.append(thetai)

    Lind = buildlind(ps, ls)

    def thfun(theta):
        # Expand the parameter vector via the index produced by buildlind.
        return theta[Lind]

    Z = hstack(Zis).T
    Lambdat = block_diag(Lambdatis, format='csc')

    # Forward ``env`` here too; previously the fixed-effects part silently
    # fell back to patsy's default environment regardless of ``env``.
    y, X = dmatrices(ModelDesc(model_description.lhs_termlist, fixef_terms),
                     data, env)

    y = np.asarray(y)
    X = np.asarray(X)

    # initial value of theta
    theta0 = np.concatenate(thetais)

    return X, Z, Lambdat, y, theta0, thfun
示例#2
0
 def __patsy_get_model_desc__(self, data):
     """Return a fixed ModelDesc: lookup of "Y" on the left, "X" on the right.

     ``data`` is accepted but not consulted.
     """
     lhs_terms = [Term([LookupFactor("Y")])]
     rhs_terms = [Term([LookupFactor("X")])]
     return ModelDesc(lhs_terms, rhs_terms)
示例#3
0
def test_formula_likes():
    """Exercise the different kinds of "formula-like" inputs.

    Covers plain array-likes, DesignMatrix objects, pandas objects, objects
    exposing ``__patsy_get_model_desc__``, string formulas, ModelDesc
    instances, pre-built design matrix builders, and the handling of the
    evaluation-environment / depth argument.

    The checks are delegated to the helpers ``t`` and ``t_invalid``, which
    are defined outside this function.
    NOTE(review): judging from the call sites, ``t`` appears to take
    (formula_like, data, depth_or_env, <bool flag>, expected rhs values,
    expected rhs column names[, expected lhs values, expected lhs column
    names]) -- confirm against the helper definitions.
    """
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    # A DesignMatrix carries its own column prefix ("foo" instead of "x").
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0, False, [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["foo0", "foo1", "foo2"])

    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False, [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"], [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0, False, [[1, 2, 3], [4, 5, 6]],
      ["foo0", "foo1", "foo2"], [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]], ), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0, False, [[1], [2], [3]],
          ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0, False, [[1], [2], [3]],
          ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6]
                             }), pandas.DataFrame({"x": [1, 2, 3]})), {}, 0,
          False, [[1], [2], [3]], ["x"], [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6],
                         name="y"), pandas.Series([1, 2, 3], name="x")), {}, 0,
          False, [[1], [2], [3]], ["x"], [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0, False,
          [[1, 2, 3]], ["x7", "x8", "x9"], [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0, False, [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame(
            [[1]], index=[1]), pandas.DataFrame([[1]], index=[2])), {}, 0)

    # Foreign ModelDesc factories
    class ForeignModelSource(object):
        # Any object may provide a model via __patsy_get_model_desc__.
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])

    foreign_model = ForeignModelSource()
    t(foreign_model, {
        "Y": [1, 2],
        "X": [[1, 2], [3, 4]]
    }, 0, True, [[1, 2], [3, 4]], ["X[0]", "X[1]"], [[1], [2]], ["Y"])

    class BadForeignModelSource(object):
        # Returns the data itself rather than a ModelDesc; must be rejected.
        def __patsy_get_model_desc__(self, data):
            return data

    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3], [1, 4]], ["Intercept", "x"], [[1], [2]], ["y"])
    t("~ x", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", {
        "y": [1, 2],
        "x": [3, 4]
    }, 0, True, [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # ModelDesc
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0, True, [[1.5], [2.5], [3.5]], ["x"])
    # An empty Term([]) yields the "Intercept" column.
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0, True, [[1, 1.5], [1, 2.5], [1, 3.5]],
      ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, {
        "x": [1.5, 2.5, 3.5],
        "y": [10, 20, 30]
    }, 0, True, [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    # Three term lists: empty, "x" only, and intercept + "x".
    termlists = (
        [],
        [Term([LookupFactor("x")])],
        [Term([]), Term([LookupFactor("x")])],
    )
    builders = design_matrix_builders(termlists, lambda: iter([{
        "x": [1, 2, 3]
    }]))
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignMatrixBuilder
    t(builders[2], {"x": [10, 20, 30]}, 0, True, [[1, 10], [1, 20], [1, 30]],
      ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"], [[10], [20], [30]],
      ["x"])

    # check depth arguments
    # x_in_env is a local of this test function; the checks below depend on
    # which stack frame the formula variables are looked up in.
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0, True, [[1, 1], [1, 2], [1, 3]],
      ["Intercept", "x_in_env"])
    # Values supplied in the data dict are used instead of the local.
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0, True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))

    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        # This local must NOT be picked up: depth 1 points at the enclosing
        # test function, whose x_in_env is [1, 2, 3].
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1, True, [[1, 1], [1, 2], [1, 3]],
          ["Intercept", "x_in_env"])

    check_nested_call()
    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    e = EvalEnvironment.capture(0)

    def check_nested_call_2():
        # The environment captured above (depth 0, this test function) is
        # used, so the local "asdf" here is ignored.
        x_in_env = "asdf"
        t("~ x_in_env", {}, e, True, [[1, 1], [1, 2], [1, 3]],
          ["Intercept", "x_in_env"])

    check_nested_call_2()
示例#4
0
def group_design(
    spreadsheet: Path,
    contrasts: list[dict],
    variables: list[dict],
    subjects: list[str],
) -> tuple[dict[str, list[float]], list[tuple], list[str], list[str]]:
    """Build the group-level design matrix and its contrasts.

    The covariates are loaded with ``prepare_data_frame``, columns with at
    most one distinct value are dropped, and a patsy design matrix is built
    from the terms returned by ``_generate_rhs``.  One contrast matrix is
    produced for every term of the design, plus one for every entry of
    ``contrasts`` whose ``"type"`` is ``"t"``.

    Args:
        spreadsheet: Forwarded to ``prepare_data_frame``.
        contrasts: Contrast specifications; ``"type"``, ``"variable"``,
            ``"values"`` and ``"name"`` are read from the ``"t"`` entries.
        variables: Forwarded to ``prepare_data_frame``.
        subjects: Forwarded to ``prepare_data_frame``; its length sizes the
            fallback design.

    Returns:
        ``(regressor_list, contrast_list, contrast_numbers, contrast_names)``.
        When the design has at least as many columns as rows, the result of
        ``intercept_only_design(len(subjects))`` is returned instead.
    """
    dataframe = prepare_data_frame(spreadsheet, variables, subjects)

    # Drop columns without variance; nunique ignores missing values.
    has_variance = dataframe.apply(pd.Series.nunique) > 1
    assert isinstance(has_variance, pd.Series)
    dataframe = dataframe.loc[:, has_variance]

    # The model has no left-hand side, only right-hand-side terms.
    no_lhs: list[Term] = []
    rhs = _generate_rhs(contrasts, has_variance)

    dmat = dmatrix(ModelDesc(no_lhs, rhs), dataframe, return_type="dataframe")
    _check_multicollinearity(dmat)

    # Reference grid for lsmeans: numeric columns are pinned to 0.0, every
    # other column contributes each of its observed values.
    level_sets = []
    for column_name in dataframe.columns:
        column = dataframe[column_name]
        if is_numeric_dtype(column):
            level_sets.append((0.0, ))
        else:
            level_sets.append(column.unique())
    grid = pd.DataFrame(list(product(*level_sets)), columns=dataframe.columns)
    reference_dmat = dmatrix(dmat.design_info, grid, return_type="dataframe")

    # Accumulates (name, matrix) pairs in output order.
    contrast_matrices: list[tuple[str, pd.DataFrame]] = []

    # One contrast per design term: constrain all of its columns to zero.
    for term_name, term_slice in dmat.design_info.term_name_slices.items():
        zero_constraint = dict.fromkeys(
            dmat.design_info.column_names[term_slice], 0)
        term_constraint = dmat.design_info.linear_constraint(zero_constraint)

        assert np.all(term_constraint.variable_names == dmat.columns)

        term_matrix = pd.DataFrame(term_constraint.coefs,
                                   columns=dmat.columns)

        # "Intercept" is stored lower-cased; other names are kept verbatim.
        label = term_name.lower() if term_name == "Intercept" else term_name
        contrast_matrices.append((label, term_matrix))

    for contrast in contrasts:
        if contrast["type"] != "t":
            continue

        (variable, ) = contrast["variable"]
        variable_levels: list[str] = list(dataframe[variable].unique())

        # lsmeans holds one contrast vector per level of the variable: the
        # column-wise mean of the reference design rows belonging to that
        # level (e.g. one row for patients and one for controls).
        lsmeans = pd.DataFrame(index=variable_levels, columns=dmat.columns)
        for level in variable_levels:
            rows_for_level = reference_dmat.loc[grid[variable] == level]
            lsmeans.loc[level] = rows_for_level.mean()

        # Keep only the weights whose key is an observed level.
        weights = contrast["values"]
        names = [name for name in weights.keys() if name in variable_levels]
        values = [weights[name] for name in names]

        # A hypothesis such as {"patient": 1, "control": -1} becomes the
        # matching linear combination of the per-level lsmeans rows.
        weighted_rows = lsmeans.loc[names].mul(values, axis=0)
        contrast_vector = weighted_rows.sum()
        t_matrix = pd.DataFrame([contrast_vector], columns=dmat.columns)

        contrast_matrices.append((f"{contrast['name']}", t_matrix))

    npts, nevs = dmat.shape

    # With at least as many regressors as observations, fall back to the
    # intercept-only design.
    if nevs >= npts:
        logger.warning("Reverting to simple intercept only design. \n"
                       f"nevs ({nevs}) >= npts ({npts})")
        return intercept_only_design(len(subjects))

    regressor_list = dmat.to_dict(orient="list", into=OrderedDict)
    contrast_list, contrast_numbers, contrast_names = _make_contrasts_list(
        contrast_matrices)

    return regressor_list, contrast_list, contrast_numbers, contrast_names
# Preview the columns "Label", "f1", "f2" and the last column of ``data``
# (``data`` is created earlier in the script).
print(data[["Label", "f1", "f2", data.columns[-1]]].head())

###################################################
# Let's train a logistic regression.

# Build a formula string that uses every column after the first one as a
# regressor for "Label".
formula = "Label ~ {0}".format(" + ".join(data.columns[1:]))
# Only the first 50 characters of the formula are displayed.
print(formula[:50] + " + ...")

from microsoftml import rx_logistic_regression

try:
    logregml = rx_logistic_regression(formula, data=data)
except Exception as e:
    # The error is expected because patsy cannot handle
    # so many features. It is printed instead of aborting the script.
    print(e)

#########################################
# Let's skip patsy's parser to manually define the formula
# with object `ModelDesc <http://patsy.readthedocs.io/en/latest/API-reference.html?highlight=lookupfactor#patsy.ModelDesc>`_.

from patsy.desc import ModelDesc, Term
from patsy.user_util import LookupFactor

# One single-factor term per column after the first; only the first ten
# terms are kept.
patsy_features = [Term([LookupFactor(n)]) for n in data.columns[1:]][:10]
# Left-hand side: "Label". Right-hand side: the empty term ``Term([])``
# followed by the selected feature terms.
model_formula = ModelDesc([Term([LookupFactor("Label")])], [Term([])] + patsy_features)

print(model_formula.describe() + " + ...")
logregml = rx_logistic_regression(model_formula, data=data)