def get_matrices(data, formula, env=0):
    """Given the data and a formula, build Z and X matrices.

    The right-hand side of the evaluated formula is split into
    random-effects terms (instances of ``RandomEffectsTerm``) and
    fixed-effects terms (everything else).  Each random-effects term
    contributes one block to ``Z`` and one block to ``Lambdat``; the
    fixed-effects terms and the left-hand side are turned into ``X`` and
    ``y`` in a single ``dmatrices`` call.

    Parameters
    ----------
    data
        Data object handed to ``dmatrix`` / ``dmatrices``.
    formula
        Formula handed to ``evaluate_formula``.
    env
        Evaluation environment, forwarded unchanged as the third positional
        argument of every ``dmatrix`` / ``dmatrices`` call made here.

    Returns
    -------
    X : numpy.ndarray
        Fixed-effects design matrix.
    Z
        Transpose of the horizontally stacked per-term ``buildzi`` blocks.
    Lambdat
        Block-diagonal matrix (``format='csc'``) of the per-term
        ``buildlambdati`` blocks.
    y : numpy.ndarray
        Response built from the left-hand side of the formula.
    theta0 : numpy.ndarray
        Initial value of theta: concatenation of the per-term ``thetai``.
    thfun : callable
        ``thfun(theta)`` returns ``theta[Lind]`` where ``Lind`` comes from
        ``buildlind(ps, ls)``.
    """
    model_description = evaluate_formula(formula)

    # Separate random-effects terms from ordinary (fixed-effects) terms.
    fixef_terms, randef_terms = [], []
    for term in model_description.rhs_termlist:
        if isinstance(term, RandomEffectsTerm):
            randef_terms.append(term)
        else:
            fixef_terms.append(term)

    Zis = []
    Lambdatis = []
    thetais = []
    ps = []  # column count of each term's expression matrix
    ls = []  # column count of each term's grouping-factor matrix
    for ret in randef_terms:
        X = dmatrix(ret.expr, data, env)
        J = dmatrix(ret.factor, data, env)
        _, p = X.shape
        _, n_levels = J.shape
        ps.append(p)
        ls.append(n_levels)
        Zis.append(buildzi(X, J))
        Lambdati, thetai = buildlambdati(p, n_levels)
        Lambdatis.append(Lambdati)
        thetais.append(thetai)

    Lind = buildlind(ps, ls)

    def thfun(theta):
        return theta[Lind]

    Z = hstack(Zis).T
    Lambdat = block_diag(Lambdatis, format='csc')

    # Forward ``env`` here as well: the random-effects terms above are
    # evaluated in ``env``, so the fixed-effects terms and the response must
    # resolve names in the same environment.
    y, X = dmatrices(
        ModelDesc(model_description.lhs_termlist, fixef_terms), data, env)
    y = np.asarray(y)
    X = np.asarray(X)

    # initial value of theta
    theta0 = np.concatenate(thetais)

    return X, Z, Lambdat, y, theta0, thfun
def __patsy_get_model_desc__(self, data):
    """Return a fixed ``ModelDesc``: a lookup of "Y" on the left, "X" on the right.

    ``data`` is accepted but not consulted.
    """
    lhs_terms = [Term([LookupFactor("Y")])]
    rhs_terms = [Term([LookupFactor("X")])]
    return ModelDesc(lhs_terms, rhs_terms)
def test_formula_likes():
    """Exercise the different kinds of "formula-like" inputs.

    Every case goes through the helpers ``t`` (expected to succeed) and
    ``t_invalid`` (expected to fail), which are defined outside this
    function.  NOTE(review): from the call sites, ``t`` appears to take
    ``(formula_like, data, depth_or_env, <bool flag>, rhs_values, rhs_names
    [, lhs_values, lhs_names])`` -- confirm against the helper definitions.

    The final section depends on which local variables exist in which stack
    frame, so statements there must not be moved or renamed casually.
    """
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    # A DesignMatrix carries its own column prefix ("foo" instead of "x").
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])

    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"],
      [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]], ), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0,
          False,
          [[1], [2], [3]], ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0,
          False,
          [[1], [2], [3]], ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6] }),
           pandas.DataFrame({"x": [1, 2, 3]})), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6], name="y"),
           pandas.Series([1, 2, 3], name="x")), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0,
          False,
          [[1, 2, 3]], ["x7", "x8", "x9"],
          [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0,
          False,
          [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame( [[1]], index=[1]),
                   pandas.DataFrame([[1]], index=[2])),
                  {}, 0)

    # Foreign ModelDesc factories
    # An object exposing __patsy_get_model_desc__ that returns a ModelDesc.
    class ForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])
    foreign_model = ForeignModelSource()
    t(foreign_model,
      { "Y": [1, 2],
       "X": [[1, 2], [3, 4]] },
      0,
      True,
      [[1, 2], [3, 4]], ["X[0]", "X[1]"],
      [[1], [2]], ["Y"])

    # Same hook, but it hands back its argument instead of a ModelDesc;
    # this is expected to be rejected.
    class BadForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return data
    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", { "y": [1, 2], "x": [3, 4] }, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"],
      [[1], [2]], ["y"])
    t("~ x", { "y": [1, 2], "x": [3, 4] }, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", { "y": [1, 2], "x": [3, 4] }, 0,
      True,
      [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # ModelDesc
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1.5], [2.5], [3.5]], ["x"])
    # Term([]) is the empty term; the expected column name is "Intercept".
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, { "x": [1.5, 2.5, 3.5], "y": [10, 20, 30] }, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    # Three term lists: empty, "x" only, and intercept + "x".
    termlists = ( [],
                 [Term([LookupFactor("x")])],
                 [Term([]), Term([LookupFactor("x")])],
                 )
    builders = design_matrix_builders(termlists,
                                      lambda: iter([{ "x": [1, 2, 3] }]))
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignMatrixBuilder
    t(builders[2], {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"],
      [[10], [20], [30]], ["x"])

    # check depth arguments
    # x_in_env lives only in this function's frame; the cases below check
    # from which frame the formula's free variable gets resolved.
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0,
      True,
      [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    # Values in the data dict win over the local variable of the same name
    # (expected column is 10/20/30, not 1/2/3).
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))
    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        # This local "asdf" must NOT be picked up: with depth 1 the expected
        # values are the outer function's [1, 2, 3].
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call()
    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    # Captured here, in this function's frame, before the nested call is made.
    e = EvalEnvironment.capture(0)
    def check_nested_call_2():
        # As above, the expected values come from the outer [1, 2, 3], not
        # from this shadowing local.
        x_in_env = "asdf"
        t("~ x_in_env", {}, e,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call_2()
def group_design(
    spreadsheet: Path,
    contrasts: list[dict],
    variables: list[dict],
    subjects: list[str],
) -> tuple[dict[str, list[float]], list[tuple], list[str], list[str]]:
    """Build the group-level design matrix and its contrasts.

    The data frame returned by ``prepare_data_frame`` is stripped of columns
    with a single distinct value, turned into a patsy design matrix, and
    checked with ``_check_multicollinearity``.  One contrast matrix is built
    per design term, plus one per entry of ``contrasts`` whose ``"type"`` is
    ``"t"``.  If the design has at least as many columns as rows, a warning
    is logged and ``intercept_only_design(len(subjects))`` is returned
    instead.

    Returns the regressors as a column-name -> list mapping, followed by the
    three values produced by ``_make_contrasts_list``.
    """
    dataframe = prepare_data_frame(spreadsheet, variables, subjects)

    # Drop columns without variance; nunique does not count NA.
    has_variance = dataframe.apply(pd.Series.nunique) > 1
    assert isinstance(has_variance, pd.Series)
    dataframe = dataframe.loc[:, has_variance]

    # The model has a right-hand side only.
    lhs: list[Term] = []
    rhs = _generate_rhs(contrasts, has_variance)

    design = dmatrix(ModelDesc(lhs, rhs), dataframe, return_type="dataframe")
    _check_multicollinearity(design)

    design_info = design.design_info
    design_columns = design.columns

    # Reference grid for the lsmeans: numeric columns are pinned to 0.0,
    # every other column contributes each of its observed values.
    level_choices = []
    for column_name in dataframe.columns:
        if is_numeric_dtype(dataframe[column_name]):
            level_choices.append((0.0, ))
        else:
            level_choices.append(dataframe[column_name].unique())
    grid = pd.DataFrame(list(product(*level_choices)),
                        columns=dataframe.columns)
    reference_dmat = dmatrix(design_info, grid, return_type="dataframe")

    # One contrast matrix per design term: all of the term's columns are
    # constrained to zero.
    contrast_matrices: list[tuple[str, pd.DataFrame]] = []
    for term_name, column_slice in design_info.term_name_slices.items():
        zeroed_columns = dict.fromkeys(design_info.column_names[column_slice], 0)
        term_constraint = design_info.linear_constraint(zeroed_columns)
        assert np.all(term_constraint.variable_names == design_columns)
        term_matrix = pd.DataFrame(term_constraint.coefs, columns=design_columns)
        # Only the intercept term is lower-cased.
        label = term_name.lower() if term_name == "Intercept" else term_name
        contrast_matrices.append((label, term_matrix))

    for contrast in contrasts:
        if contrast["type"] != "t":
            continue

        (variable, ) = contrast["variable"]
        variable_levels: list[str] = list(dataframe[variable].unique())

        # lsmeans holds one contrast vector per factor level: the mean of
        # the reference design rows belonging to that level (for instance
        # one row for patients and one for controls).
        lsmeans = pd.DataFrame(index=variable_levels, columns=design_columns)
        for level in variable_levels:
            in_level = grid[variable] == level
            lsmeans.loc[level] = reference_dmat.loc[in_level].mean()

        # Keep only the requested weights whose level is actually present.
        value_dict = contrast["values"]
        names = []
        values = []
        for name in value_dict.keys():
            if name in variable_levels:
                names.append(name)
                values.append(value_dict[name])

        # A hypothesis such as {"patient": 1, "control": -1} becomes the
        # matching linear combination of the per-level lsmeans vectors.
        weighted_rows = lsmeans.loc[names].mul(values, axis=0)
        contrast_vector = weighted_rows.sum()
        contrast_matrix = pd.DataFrame([contrast_vector], columns=design_columns)

        contrast_matrices.append((f"{contrast['name']}", contrast_matrix))

    npts, nevs = design.shape
    if nevs >= npts:
        logger.warning(
            "Reverting to simple intercept only design. \n"
            f"nevs ({nevs}) >= npts ({npts})"
        )
        return intercept_only_design(len(subjects))

    regressor_list = design.to_dict(orient="list", into=OrderedDict)
    contrast_list, contrast_numbers, contrast_names = _make_contrasts_list(
        contrast_matrices
    )
    return regressor_list, contrast_list, contrast_numbers, contrast_names
preview_columns = ["Label", "f1", "f2", data.columns[-1]]
print(data[preview_columns].head())

###################################################
# Let's train a logistic regression.

feature_names = data.columns[1:]
formula = "Label ~ {0}".format(" + ".join(feature_names))
print(formula[:50] + " + ...")

from microsoftml import rx_logistic_regression

try:
    logregml = rx_logistic_regression(formula, data=data)
except Exception as err:
    # This failure is expected: patsy cannot handle
    # a formula with this many features.
    print(err)

#########################################
# Let's skip patsy's parser and define the formula manually
# with object `ModelDesc <http://patsy.readthedocs.io/en/latest/API-reference.html?highlight=lookupfactor#patsy.ModelDesc>`_.

from patsy.desc import ModelDesc, Term
from patsy.user_util import LookupFactor

# Only the first ten feature columns are turned into terms.
patsy_features = [Term([LookupFactor(name)]) for name in data.columns[1:11]]
label_terms = [Term([LookupFactor("Label")])]
model_formula = ModelDesc(label_terms, [Term([])] + patsy_features)
print(model_formula.describe() + " + ...")
logregml = rx_logistic_regression(model_formula, data=data)