Exemplo n.º 1
0
    def test_basic(self):
        a = khatri_rao(array([[1, 2], [3, 4]]), array([[5, 6], [7, 8]]))

        assert_array_equal(a, array([[5, 12], [7, 16], [15, 24], [21, 32]]))

        b = khatri_rao(np.empty([2, 2]), np.empty([2, 2]))
        assert_array_equal(b.shape, (4, 2))
Exemplo n.º 2
0
 def test_number_of_columns_equality(self):
     """``khatri_rao`` must raise ``ValueError`` when column counts differ."""
     with pytest.raises(ValueError):
         three_columns = array([[1, 2, 3], [4, 5, 6]])
         two_columns = array([[1, 2], [3, 4]])
         khatri_rao(three_columns, two_columns)
def ALS_solver(X, r, nmax = 1000, err_tol = 1e-4):
    """
    Fit three rank-``r`` factor matrices to a 3-way tensor by alternating
    least squares.

    Every sweep solves one least-squares problem per factor while the other
    two are held fixed, then measures the relative Frobenius error of the
    reconstruction against the mode-1 matricization of ``X``. Progress is
    printed after every sweep.

    Parameters
    ----------
    X : 3-way tensor (``X.shape`` must unpack into three sizes)
    r : tensor rank (number of columns of each factor)
    nmax : maximum number of sweeps, optional
        The default is 1000.
    err_tol : tolerance for relative residual error, optional
        The default is 1e-4.

    Returns
    -------
    A : factor solved first in each sweep, passed through ``col_normalize``
    B : factor with ``r`` columns, passed through ``col_normalize``
    C : factor with ``r`` columns, returned as solved (not re-normalized)
    X_hat : approximated tensor, built by ``matrix2tensor`` with ``X.shape``

    """
    n3, n2, _ = X.shape

    # B is drawn before C so the sequence of random draws is unchanged.
    B = np.random.normal(0, 1, (n2, r))
    C = np.random.normal(0, 1, (n3, r))

    X1, X2, X3 = (tensor2matrix(X, mode) for mode in (1, 2, 3))
    X_norm = lin.norm(X1, 'fro')

    B = col_normalize(B)
    err = np.inf
    i = 0
    while i < nmax and err >= err_tol:
        C = col_normalize(C)

        # Solve for each factor in turn; lstsq returns the solution first.
        A = col_normalize(lin.lstsq(lin.khatri_rao(C, B), X1.T)[0].T)
        B = col_normalize(lin.lstsq(lin.khatri_rao(C, A), X2.T)[0].T)
        C = lin.lstsq(lin.khatri_rao(B, A), X3.T)[0].T

        # Reconstruct the mode-1 matricization and score it.
        X_hat1 = A.dot(lin.khatri_rao(C, B).T)
        err = lin.norm(X_hat1 - X1, 'fro') / X_norm
        i += 1
        print('Relative error at iteration ', i, ': ', err)

    X_hat = matrix2tensor(X_hat1, X.shape)
    print('Finished!')
    return A, B, C, X_hat
def compute_vec_tensor(U, V, W):
    """
    Return vectorized tensor from CP decomposition.

    For every column index ``i`` of ``U`` the Khatri-Rao product of the
    ``i``-th columns of ``U``, ``V`` and ``W`` (each kept as a one-column
    matrix) is formed. These products are added up and singleton axes are
    squeezed out of the sum.
    """

    def rank_one_term(i):
        # Slice column i out of each factor and restore the column axis.
        u, v, w = (M[:, i][:, np.newaxis] for M in (U, V, W))
        return lin.khatri_rao(lin.khatri_rao(u, v), w)

    total = sum(rank_one_term(i) for i in range(U.shape[1]))
    return np.squeeze(total)
Exemplo n.º 5
0
def perform_CMTF(tOrig=None, mOrig=None, r=10):
    """ Perform CMTF decomposition.

    A tensor ``tOrig`` and a matrix ``mOrig`` are factored together; the
    first factor is shared between the two (the matrix side is always
    assigned the tensor's first factor). When ``tOrig`` is ``None`` both
    inputs are taken from ``createCube()``.

    The loop runs for at most 8000 iterations. ``R2X`` is recomputed with
    ``calcR2X`` every 20th iteration and the loop stops as soon as the last
    recorded improvement is below ``1e-6``.

    Returns the tensor factors, the matrix factors and the last ``R2X``.
    """
    if tOrig is None:
        tOrig, mOrig = createCube()

    def init_factors(data):
        # NaNs are replaced by the mean of the non-NaN entries before initialization.
        mean_filled = np.nan_to_num(data, nan=np.nanmean(data))
        return CPTensor(initialize_cp(mean_filled, r, non_negative=True))

    tFac = init_factors(tOrig)
    mFac = init_factors(mOrig)

    # Pre-unfold: keep only the mode-0 unfolding columns without any NaN.
    selPat = np.all(np.isfinite(mOrig), axis=1)
    unfolded = tl.unfold(tOrig, 0)
    present = ~np.any(np.isnan(unfolded), axis=0)
    unfolded = unfolded[:, present]

    def fit_matrix_side():
        # Least-squares fit of the second matrix factor on fully finite rows of mOrig.
        shared_rows = mFac.factors[0][selPat, :]
        return np.linalg.lstsq(shared_rows, mOrig[selPat, :], rcond=None)[0].T

    R2X = -1.0
    mFac.factors[0] = tFac.factors[0]
    mFac.factors[1] = fit_matrix_side()

    for ii in range(8000):
        # Solve for the subject matrix using tensor and matrix data stacked together
        kr_tensor = khatri_rao(tFac.factors[1], tFac.factors[2])[present, :]
        stacked_design = np.vstack((kr_tensor, mFac.factors[1]))
        stacked_data = np.hstack((unfolded, mOrig))

        tFac.factors[0] = censored_lstsq(stacked_design, stacked_data.T)
        mFac.factors[0] = tFac.factors[0]

        # PARAFAC on other antigen modes
        for m in (1, 2):
            kr_mode = khatri_rao(tFac.factors[0], tFac.factors[3 - m])
            mode_unfolding = tl.unfold(tOrig, m)
            tFac.factors[m] = censored_lstsq(kr_mode, mode_unfolding.T)

        # Solve for the glycan matrix fit
        mFac.factors[1] = fit_matrix_side()

        if ii % 20 == 0:
            R2X_last, R2X = R2X, calcR2X(tOrig, mOrig, tFac, mFac)

        if R2X - R2X_last < 1e-6:
            break

    tFac.normalize()
    mFac.normalize()

    # Reorient the later tensor factors
    tFac.factors, mFac.factors = reorient_factors(tFac.factors, mFac.factors)

    return tFac, mFac, R2X
Exemplo n.º 6
0
    def eval(self, data, eval_env, encoding):
        """Evaluate the group specific term on ``data``.

        The grouping variable is read from ``data`` and, unless its dtype is
        already an ordered categorical, converted to an ordered categorical
        whose categories are its sorted unique values. The expression side is
        evaluated with ``self.expr.eval`` and combined with the dummy matrix
        of the grouping variable.

        Returns a dict with keys ``"type"``, ``"Xi"``, ``"Ji"``, ``"Zi"``
        (sparse COO) and ``"groups"``. For a ``"categoric"`` expression it
        also carries ``"reference"`` and, when the expression reports
        ``"levels"``, ``"levels"`` and ``"encoding"``.

        Raises ``ValueError`` when ``self.factor`` is not a ``Term``.
        """
        # TODO: factor can't be a call or interaction yet.
        if not isinstance(self.factor, Term):
            raise ValueError(
                "Factor on right hand side of group specific term must be a single term."
            )

        factor = data[self.factor.variable]
        already_ordered = hasattr(factor.dtype, "ordered") and factor.dtype.ordered
        if not already_ordered:
            sorted_categories = sorted(factor.unique().tolist())
            factor = factor.astype(
                pd.api.types.CategoricalDtype(categories=sorted_categories, ordered=True)
            )

        # Notation as in lme4 paper
        # note we don't use `drop_first=True`.
        Ji = pd.get_dummies(factor).to_numpy()
        Xi = self.expr.eval(data, eval_env, encoding)
        Zi = linalg.khatri_rao(Ji.T, Xi["value"].T).T

        out = {
            "type": Xi["type"],
            "Xi": Xi["value"],
            "Ji": Ji,
            "Zi": sparse.coo_matrix(Zi),
            "groups": factor.cat.categories.tolist(),
        }

        if Xi["type"] == "categoric":
            if "levels" in Xi:
                extra_keys = ("levels", "reference", "encoding")
            else:
                extra_keys = ("reference",)
            for key in extra_keys:
                out[key] = Xi[key]
        return out
Exemplo n.º 7
0
    def eval_new_data(self, data):
        """Evaluates the term with new data.

        Calls ``.eval_new_data()`` on ``self.expr`` and on ``self.factor``,
        promotes any 1-D result to a single-column matrix, and combines both
        into the design matrix of the group specific effect.

        Parameters
        ----------
        data: pd.DataFrame
            The data frame where variables are taken from.

        Returns
        ----------
        Zi: np.ndarray
        """

        def as_columns(values):
            # A 1-D result is treated as one column.
            return values[:, np.newaxis] if values.ndim == 1 else values

        expr_values = self.expr.eval_new_data(data)
        factor_values = self.factor.eval_new_data(data)
        Xi = as_columns(expr_values)
        Ji = as_columns(factor_values)
        return linalg.khatri_rao(Ji.T, Xi.T).T
Exemplo n.º 8
0
    def test_equality_of_two_equations(self):
        a = array([[1, 2], [3, 4]])
        b = array([[5, 6], [7, 8]])

        res1 = khatri_rao(a, b)
        res2 = np.vstack(
            [np.kron(a[:, k], b[:, k]) for k in range(b.shape[1])]).T

        assert_array_equal(res1, res2)
Exemplo n.º 9
0
    def findGraphLaplacian(self):
        """Estimate a matrix ``W`` from the eigenvectors of the diffusion process.

        For each of the ``self.p`` eigenvector matrices, a vector of Laplacian
        entries on the index set ``Dcomp`` is obtained from a ridge-regularized
        least-squares problem (closed form, weight ``self.beta_2``). The
        vector is reshaped to ``L x L``, its diagonal is set to 1, entries
        below ``1e-5`` in magnitude are zeroed, and ``W = I - Laplacian`` is
        recorded. The returned ``W`` is the mean over all ``p`` estimates.
        """
        # dimensions
        L = self.L

        # hyperparams
        p = self.p
        beta_2 = self.beta_2

        # eigenvectors of the diffusion process (eigenvalues are not used)
        H = self.findDiffusionProcess().squeeze()
        _, Vp = np.linalg.eig(H)

        Wp = np.empty((0, L, L))
        for i in range(p):
            # eigenvectors of H_p, with the first column left out
            trimmed = Vp[i][:, 1:]
            U = khatri_rao(trimmed, trimmed)

            # index sets D and its complement
            D, Dcomp = self.getSetDandDcomp()

            # I - U U^+ : projector onto the orthogonal complement of range(U)
            Q_full = np.identity(L**2) - U @ np.linalg.pinv(U)
            A = Q_full[Dcomp].T
            b = -(Q_full[D].T @ np.ones((L, 1)))

            # closed-form ridge solution (A^T A + beta_2 I)^-1 A^T b
            gram = A.T @ A + beta_2 * np.identity(A.shape[1])
            l_Dcomp = np.linalg.inv(gram) @ (A.T @ b)

            # obtain W from L
            lap_vec = np.zeros((L * L, ))
            lap_vec[Dcomp] = l_Dcomp.squeeze()
            Laplacian = lap_vec.reshape(L, L).T
            np.fill_diagonal(Laplacian, 1)
            Laplacian[abs(Laplacian) < 1e-5] = 0
            W = np.identity(L) - Laplacian
            Wp = np.vstack((Wp, W.reshape(1, L, L)))
        return np.mean(Wp, axis=0)
Exemplo n.º 10
0
    def _evaluate_new_data(self, data):
        """Build a new instance of this class whose design matrix comes from ``data``.

        Every term is re-evaluated with ``term.eval_new_data(data)``. The
        stored ``terms_info`` entries are deep-copied into the new instance
        and only their ``"idxs"`` slices are recomputed. Raises ``ValueError``
        if this matrix has not been evaluated yet.
        """
        if not self.evaluated:
            raise ValueError("Can't evaluate new data on unevaluated matrix.")

        new_instance = self.__class__(self.terms)
        Z = []
        next_row = next_col = 0

        def register(term_name, Zi):
            # Store one block and the slices it occupies on the block diagonal.
            nonlocal next_row, next_col
            n_rows, n_cols = Zi.shape
            Z.append(Zi)
            # All the info, except from the indexes, is copied.
            info = deepcopy(self.terms_info[term_name])
            info["idxs"] = (
                slice(next_row, next_row + n_rows),
                slice(next_col, next_col + n_cols),
            )
            new_instance.terms_info[term_name] = info
            next_row += n_rows
            next_col += n_cols

        for term in self.terms:
            d = term.eval_new_data(data)
            if d["type"] != "categoric":
                register(term.to_string(), d["Zi"])
                continue

            # Reduced encoding has no column for the first level.
            if d["encoding"] == "full":
                levels = d["levels"]
            else:
                levels = d["levels"][1:]
            Ji = d["Ji"]
            for idx, level in enumerate(levels):
                Xi = np.atleast_2d(d["Xi"][:, idx]).T
                Zi = linalg.khatri_rao(Ji.T, Xi.T).T
                register(term.to_string(level), Zi)

        new_instance.data = data
        new_instance.eval_env = self.eval_env

        # Stored in Compressed Sparse Column format
        if Z:
            new_instance.design_matrix = sp.sparse.block_diag(Z).tocsc()

        return new_instance
Exemplo n.º 11
0
def recovery_matrix_RKKP_Strassen(C_tilda, saved_nodes, P, Q, l, p):
    """Recover ``C_rec`` from the rows of ``C_tilda`` listed in ``saved_nodes``.

    The transposed Khatri-Rao product of ``P.T`` and ``Q.T`` is restricted to
    ``saved_nodes`` and pseudo-inverted, then applied to ``C_tilda`` reshaped
    to ``len(saved_nodes)`` rows. Every ``(m + 1)``-th row of that result
    (``m`` = number of columns of ``P``) is cut into ``l`` equal pieces and
    all pieces are stacked vertically. ``p`` is accepted but not used.
    """
    # Same shape requirements as before: P is 2-D and Q has a second axis.
    (_, m), _ = P.shape, Q.shape[1]

    G_saved = khatri_rao(P.T, Q.T).T[saved_nodes]
    decoded = np.linalg.pinv(G_saved) @ C_tilda.reshape(len(saved_nodes), -1)

    pieces = []
    for row in decoded[::m + 1]:
        pieces.append(np.split(row, l))
    return np.vstack(pieces)
Exemplo n.º 12
0
    def eval(self, data):
        """Evaluate the group specific term on ``data``.

        Returns a dict with the expression ``"type"`` and the sparse (COO)
        design matrix ``"Zi"``; for a ``"categoric"`` expression it also
        carries ``"levels"`` and ``"reference"``. Raises ``ValueError`` when
        ``self.factor`` is not a ``Term``.
        """
        if not isinstance(self.factor, Term):
            raise ValueError("Factor on right hand side of group specific term can only be a term.")
        factor = data[self.factor.variable]

        # Notation as in lme4 paper
        # note we don't use `drop_first=True`.
        Ji = pd.get_dummies(factor).to_numpy()
        Xi = self.expr.eval(data)
        Zi = linalg.khatri_rao(Ji.T, Xi["value"].T).T

        out = {"type": Xi["type"], "Zi": sparse.coo_matrix(Zi)}
        if Xi["type"] == "categoric":
            for key in ("levels", "reference"):
                out[key] = Xi[key]
        return out
Exemplo n.º 13
0
def recovery_matrix_RKKP(C_tilda, saved_nodes, P, Q, l, p):
    """Recover ``C_rec`` from the rows of ``C_tilda`` listed in ``saved_nodes``.

    The transposed Khatri-Rao product of ``P.T`` and ``Q.T`` is restricted to
    ``saved_nodes`` and pseudo-inverted, then applied to ``C_tilda`` reshaped
    to ``len(saved_nodes)`` rows. Each row of that result becomes an
    ``l x p`` tile, and the ``m * n`` tiles (``m``, ``n`` = column counts of
    ``P`` and ``Q``) are laid out as a grid of ``m`` tile-rows.
    """
    (_, m), n = P.shape, Q.shape[1]

    G_saved = khatri_rao(P.T, Q.T).T[saved_nodes]
    decoded = np.linalg.pinv(G_saved) @ C_tilda.reshape(len(saved_nodes), -1)

    # One (l, p) tile per decoded row, stacked on top of each other.
    tiles = np.vstack([row.reshape(l, p) for row in decoded])

    # Put the m * n tiles side by side, then fold them into m tile-rows.
    side_by_side = np.hstack(np.split(tiles, m * n))
    return np.vstack(np.split(side_by_side, m, 1))
Exemplo n.º 14
0
    def set_data(self, spans_intercept):
        """Set the data of both sides of the term and build its design matrix.

        ``self.groups`` receives the ``":"``-joined combinations of the
        contrast-matrix labels of every factor component, ``self.data`` the
        design matrix of the group specific effect (Zi), and ``self.kind``
        the kind of the expression.
        """
        self.expr.set_data(spans_intercept)
        # Factor is a categorical term that always spans the intercept
        self.factor.set_data(True)

        # Obtain group names. These are obtained from the labels of the contrast matrices
        labels_per_component = [
            component.contrast_matrix.labels for component in self.factor.components
        ]
        self.groups = [
            ":".join(combination) for combination in itertools.product(*labels_per_component)
        ]

        def as_columns(values):
            # A 1-D array is treated as a single column.
            return values[:, np.newaxis] if values.ndim == 1 else values

        Xi, Ji = self.expr.data, self.factor.data
        Xi = as_columns(Xi)
        Ji = as_columns(Ji)

        self.data = linalg.khatri_rao(Ji.T, Xi.T).T  # Zi
        self.kind = self.expr.kind
Exemplo n.º 15
0
    def eval_new_data(self, data):
        """Evaluates the term with new data.

        Returns a dict with keys ``"type"``, ``"Xi"``, ``"Ji"``, ``"Zi"``
        (sparse COO) and ``"groups"``, plus expression metadata that depends
        on the expression type.
        """
        # factor uses the same data type that is used in first evaluation.
        factor = data[self.factor.name].astype(self.factor_type)
        Xi = self.expr.eval_new_data(data)
        Ji = pd.get_dummies(factor).to_numpy()
        Zi = linalg.khatri_rao(Ji.T, Xi.T).T

        metadata = self.expr.metadata
        out = {
            "type": metadata["type"],
            "Xi": Xi,
            "Ji": Ji,
            "Zi": sparse.coo_matrix(Zi),
            "groups": factor.cat.categories.tolist(),
        }

        expr_type = self.expr._type  # pylint: disable = protected-access
        if expr_type == "categoric":
            for key in ("levels", "reference", "encoding"):
                out[key] = self.expr.metadata[key]
        elif expr_type == "interaction":
            out["terms"] = self.expr.metadata["terms"]
        return out
Exemplo n.º 16
0
    def eval_new_data(self, data):
        """Evaluates the term with new data.

        The variable named by ``self.factor`` is cast to the type remembered
        in ``self.factor_type`` and turned into a dummy matrix. The ``expr``
        side is obtained from ``self.expr.eval_new_data()``, and both are
        combined into the design matrix of the group specific effect.

        Parameters
        ----------
        data: pd.DataFrame
            The data frame where variables are taken from.

        Returns
        ----------
        out: dict
            Same rules as in :meth:`eval <GroupSpecificTerm.eval>`.
        """
        # factor uses the same data type that is used in first evaluation.
        factor = data[self.factor.name].astype(self.factor_type)
        Xi = self.expr.eval_new_data(data)
        Ji = pd.get_dummies(factor).to_numpy()
        Zi = linalg.khatri_rao(Ji.T, Xi.T).T

        metadata = self.expr.metadata
        out = {
            "type": metadata["type"],
            "Xi": Xi,
            "Ji": Ji,
            "Zi": sparse.coo_matrix(Zi),
            "groups": factor.cat.categories.tolist(),
        }

        # Extra metadata depends on the kind of expression.
        expr_type = self.expr._type  # pylint: disable = protected-access
        if expr_type == "categoric":
            for key in ("levels", "reference", "encoding"):
                out[key] = self.expr.metadata[key]
        elif expr_type == "interaction":
            out["terms"] = self.expr.metadata["terms"]
        return out
Exemplo n.º 17
0
    def eval(self, data, eval_env, encoding):
        """Evaluate the group specific term.

        The factor must consist of a single ``Variable`` component, otherwise
        ``ValueError`` is raised. Its dtype (converted to an ordered
        categorical built from the sorted unique values when it is not
        already ordered) is remembered in ``self.factor_type``.

        Returns a dict with keys ``"type"``, ``"Xi"``, ``"Ji"``, ``"Zi"``
        (sparse COO) and ``"groups"``, plus expression metadata that depends
        on the expression type.
        """
        # Note: factor can't be a call or interaction yet.
        components = self.factor.components
        if not (len(components) == 1 and isinstance(components[0], Variable)):
            raise ValueError(
                "Factor on right hand side of group specific term must be a single term."
            )

        factor = data[self.factor.name]
        type_ = factor.dtype
        if not (hasattr(type_, "ordered") and type_.ordered):
            categories = sorted(factor.unique().tolist())
            type_ = pd.api.types.CategoricalDtype(categories=categories, ordered=True)
            factor = factor.astype(type_)
        self.factor_type = type_

        # Notation as in lme4 paper
        # Note we don't use `drop_first=True` for factor.
        self.expr.set_type(data, eval_env)
        self.expr.set_data(encoding)
        Xi = self.expr.data
        Ji = pd.get_dummies(factor).to_numpy()
        Zi = linalg.khatri_rao(Ji.T, Xi.T).T

        metadata = self.expr.metadata
        out = {
            "type": metadata["type"],
            "Xi": Xi,
            "Ji": Ji,
            "Zi": sparse.coo_matrix(Zi),
            "groups": factor.cat.categories.tolist(),
        }

        expr_type = self.expr._type  # pylint: disable = protected-access
        if expr_type == "categoric":
            for key in ("levels", "reference", "encoding"):
                out[key] = self.expr.metadata[key]
        elif expr_type == "interaction":
            out["terms"] = self.expr.metadata["terms"]
        return out
Exemplo n.º 18
0
    def test_to_assure_2d_array(self):
        """``khatri_rao`` must raise ``ValueError`` if either operand is 1-D."""
        bad_operand_pairs = [
            # both arrays are 1-D
            (array([1, 2, 3]), array([4, 5, 6])),
            # first array is 1-D
            (array([1, 2, 3]), array([[1, 2, 3], [4, 5, 6]])),
            # second array is 1-D
            (array([[1, 2, 3], [7, 8, 9]]), array([4, 5, 6])),
        ]
        for a, b in bad_operand_pairs:
            with pytest.raises(ValueError):
                khatri_rao(a, b)
Exemplo n.º 19
0
import numpy as np
from scipy.linalg import khatri_rao
import tensorly as ts

# TODO: implement in a function

# One alternating-least-squares sweep of a CP decomposition on a toy tensor.
# For every mode m the factor is refreshed as
#     A_m = unfold(chi, m) @ KR(other factors) @ pinv(V)
# where V is the element-wise product of the Gram matrices A_k^T A_k of the
# other factors.

# start off with an example
chi = (np.arange(24) + 1).reshape(2, 3, 4)

r_guess = 3
a_guesses = np.array([np.random.random((np.size(chi, n), r_guess)) for n in np.arange(np.ndim(chi))], dtype=object)

# loop until happy with the result
# TODO: wrap the sweep below in a while loop with an error condition
n_modes = np.ndim(chi)
for m in range(n_modes):
    # The factor being updated is excluded from both products below.
    other_modes = [k for k in range(n_modes) if k != m]

    # Element-wise product of the Gram matrices; it starts from all ones
    # (the neutral element) and is rebuilt for every mode.
    v_total = np.ones((r_guess, r_guess))
    for n in other_modes:
        v_total = np.multiply(v_total, np.matmul(a_guesses[n].T, a_guesses[n]))

    # Khatri-Rao product of the other factors in increasing mode order.
    # NOTE(review): this order assumes the C-ordered unfolding of
    # tensorly's `unfold` - confirm against the tensorly documentation.
    a_khatri_prod = a_guesses[other_modes[0]]
    for p in other_modes[1:]:
        a_khatri_prod = khatri_rao(a_khatri_prod, a_guesses[p])

    # Least-squares update of mode m (pinv tolerates a singular v_total).
    a_guesses[m] = np.matmul(np.matmul(ts.unfold(chi, m), a_khatri_prod), np.linalg.pinv(v_total))
    # TODO: figure out a way to normalize a_guesses
    # figure out a way to normalize a_guesses

Exemplo n.º 20
0
    def _evaluate_new_data(self, data):
        """Evaluates group specific terms with new data and return a new instance of
        ``GroupEffectsMatrix``.

        This method is intended to be used to obtain design matrices for new data and obtain
        out of sample predictions. Each term is re-evaluated through its own
        ``eval_new_data`` method; the stored ``terms_info`` entries are deep-copied and only
        their ``"idxs"`` slices are recomputed for the new design matrix.

        Parameters
        ----------
        data: pandas.DataFrame
            The data frame where variables are taken from

        Returns
        ----------
        new_instance: GroupEffectsMatrix
            A new instance of ``GroupEffectsMatrix`` whose design matrix is obtained with the values
            in the new data set.

        Raises
        ------
        ValueError
            If this matrix has not been evaluated yet.
        """
        if not self.evaluated:
            raise ValueError("Can't evaluate new data on unevaluated matrix.")

        new_instance = self.__class__(self.terms)
        Z = []
        next_row = next_col = 0

        def register(term_name, Zi):
            # Store one block and the slices it occupies on the block diagonal.
            nonlocal next_row, next_col
            n_rows, n_cols = Zi.shape
            Z.append(Zi)
            # All the info, except from the indexes, is copied.
            info = deepcopy(self.terms_info[term_name])
            info["idxs"] = (
                slice(next_row, next_row + n_rows),
                slice(next_col, next_col + n_cols),
            )
            new_instance.terms_info[term_name] = info
            next_row += n_rows
            next_col += n_cols

        for term in self.terms:
            d = term.eval_new_data(data)
            if d["type"] != "categoric":
                register(term.to_string(), d["Zi"])
                continue

            # Reduced encoding has no column for the first level.
            if d["encoding"] == "full":
                levels = d["levels"]
            else:
                levels = d["levels"][1:]
            Ji = d["Ji"]
            for idx, level in enumerate(levels):
                Xi = np.atleast_2d(d["Xi"][:, idx]).T
                Zi = linalg.khatri_rao(Ji.T, Xi.T).T
                register(term.to_string(level), Zi)

        new_instance.data = data
        new_instance.eval_env = self.eval_env

        # Stored in Compressed Sparse Column format
        if Z:
            new_instance.design_matrix = sp.sparse.block_diag(Z).tocsc()

        return new_instance
Exemplo n.º 21
0
    def findGraph(self):
        """Estimate the adjacency matrix ``W`` from the diffusion process.

        Steps, as performed below:

        1. Eigendecompose the output of ``self.findDiffusionProcess()`` and
           average the ``self.p`` eigenvector matrices, flipping the sign of
           any matrix whose ``[0, 0]`` entry disagrees in sign with the first.
        2. Form ``U = khatri_rao(V, V)`` and zero its last two singular values.
        3. Build ``M_full = I - U U^+`` (the projector onto the orthogonal
           complement of the range of ``U``) and keep the rows in ``Dcomp``
           returned by ``self.getSetDandDcomp()``.
        4. Solve the resulting problem with ``l1regls`` and map the solution
           back to a matrix with ``self.unvectorize``.

        A feasibility message based on the rank of ``U[D]`` is printed.
        Entries of ``W`` with magnitude below ``1e-5`` are set to zero.

        Returns
        -------
        W : the matrix produced by ``self.unvectorize`` after thresholding.
        """
        # dimensions
        L = self.L

        # find the diffusion process
        H = self.findDiffusionProcess().squeeze()

        # get the eigendecomposition of the diffusion process
        # (only the eigenvectors are used)
        _, V = np.linalg.eig(H)
        V_hat = np.zeros((L, L))
        for i in range(self.p):
            # align signs with the first eigenvector matrix before averaging
            if np.sign(V[0, 0, 0]) == np.sign(V[i, 0, 0]):
                V_hat += V[i]
            else:
                V_hat -= V[i]
        V_hat /= self.p
        V = V_hat

        # get U and drop its two smallest singular values
        U = khatri_rao(V, V)
        Uh, S, Vh = np.linalg.svd(U)
        S[-1] = 0
        S[-2] = 0
        # U is (L**2, L), so the full SVD needs an (L**2, L) singular-value
        # matrix: diag(S) padded with L**2 - L zero rows. The padding used
        # to be hard-coded as (56, 8), which is only correct for L == 8.
        S_mod = np.concatenate((np.diag(S), np.zeros((L * L - L, L))))
        U = np.matmul(np.matmul(Uh, S_mod), Vh)

        # get set D
        D, Dcomp = self.getSetDandDcomp()

        # test the feasibility of the problem
        rank_U_D = np.linalg.matrix_rank(U[D])
        if rank_U_D <= L - 1:
            print("Rank of W_D : {}, Problem is Feasible".format(rank_U_D))
        else:
            print("Rank of W_D : {}, Problem is infeasible".format(rank_U_D))

        # compute R: rows Dcomp of the projector, plus one extra column that
        # is 1 on the first L - 1 rows
        M_full = np.identity(L**2) - np.matmul(U, np.linalg.pinv(U))
        M = M_full[Dcomp]
        ll = np.zeros((len(Dcomp), 1))
        ll[np.arange(0, self.L - 1, 1)] = 1
        R = np.hstack((M, ll))

        # get b: zero everywhere except the entry matching the extra column
        b = np.zeros((L**2 + 1, 1))
        b[-1] = 1

        # solve basis pursuit with noisy observations
        # NOTE(review): l1regls/matrix are defined outside this block,
        # presumably an l1-regularized least-squares solver - confirm.
        beta = 0.001
        A = R.T
        w_Dcomp = np.array(
            l1regls(matrix(A / np.sqrt(beta)), matrix(b / np.sqrt(beta))))

        # get adjacency matrix
        W = self.unvectorize(w_Dcomp.squeeze())

        W[abs(W) < 1e-5] = 0
        return W
Exemplo n.º 22
0
    def evaluate(self):
        """Evaluates `self.terms` inside the data mask provided by `data` and
        updates `self.design_matrix`.

        `self.terms_info` is rebuilt from scratch: one entry per term (one per
        level for categoric expressions), including the row and column slices
        the entry occupies in the block diagonal design matrix. When there
        are no terms the design matrix is an empty ``(0, 0)`` array.
        """
        Z = []
        self.terms_info = {}
        next_row = next_col = 0

        for term in self.terms:
            # `encoding` is False only when the expression is not an intercept
            # and some term with the same factor has an intercept expression.
            encoding = isinstance(term.expr, InterceptTerm) or not any(
                other.factor == term.factor and isinstance(other.expr, InterceptTerm)
                for other in self.terms
            )
            d = term.eval(self.data, self.eval_env, encoding)

            if d["type"] == "categoric":
                # Reduced encoding has no column for the first level.
                if d["encoding"] == "full":
                    levels = d["levels"]
                else:
                    levels = d["levels"][1:]
                for idx, level in enumerate(levels):
                    Xi = np.atleast_2d(d["Xi"][:, idx]).T
                    Ji = d["Ji"]
                    Zi = linalg.khatri_rao(Ji.T, Xi.T).T
                    n_rows, n_cols = Zi.shape
                    Z.append(Zi)
                    term_name = term.to_string(level)
                    self.terms_info[term_name] = {
                        "type": "categoric",
                        "Xi": Xi,
                        "Ji": Ji,
                        "groups": d["groups"],
                        "encoding": d["encoding"],
                        "levels": d["levels"],
                        "reference": d["reference"],
                        "full_names": [f"{term_name}[{group}]" for group in d["groups"]],
                        "idxs": (
                            slice(next_row, next_row + n_rows),
                            slice(next_col, next_col + n_cols),
                        ),
                    }
                    next_row += n_rows
                    next_col += n_cols
            else:
                Zi = d["Zi"]
                n_rows, n_cols = Zi.shape
                Z.append(Zi)
                term_name = term.to_string()
                info = {key: value for key, value in d.items() if key != "Zi"}
                info["idxs"] = (
                    slice(next_row, next_row + n_rows),
                    slice(next_col, next_col + n_cols),
                )
                # The entry is stored before the full names are requested,
                # keeping the original call order.
                self.terms_info[term_name] = info
                info["full_names"] = self.get_term_full_names(term_name)
                next_row += n_rows
                next_col += n_cols

        # Stored in Compressed Sparse Column format
        if Z:
            self.design_matrix = sp.sparse.block_diag(Z).tocsc()
        else:
            self.design_matrix = np.zeros((0, 0))
Exemplo n.º 23
0
    def eval(self, data, eval_env, encoding):
        """Evaluates term.

        First, it evaluates the variable in ``self.factor``, creates an ordered categorical data
        type using its levels (unless the variable already has an ordered dtype), and stores it
        in ``self.factor_type``. Then, it obtains the design matrix for ``self.expr`` to finally
        produce the matrix for the group specific effect.

        The output contains the following information

        * ``"type"``: The type of the ``expr`` term.
        * ``"Xi"``: The design matrix for the ``expr`` term.
        * ``"Ji"``: The design matrix for the ``factor`` term.
        * ``"Zi"``: The design matrix for the group specific term.
        * ``"groups"``: The groups present in ``factor``.

        If ``"type"`` is ``"categoric"``, the output dictionary also contains

        * ``"levels"``: Levels of the term in ``expr``.
        * ``"reference"``: The level taken as baseline.
        * ``"encoding"``: The encoding of the term, either ``"full"`` or ``"reduced"``

        If ``"type"`` is ``"interaction"``, the output dictionary also contains

        * ``"terms"``: Metadata for each of the components in the interaction in ``expr``.

        Parameters
        ----------
        data: pandas.DataFrame
            The data frame where variables are taken from.
        eval_env: EvalEnvironment
            The environment where values and functions are taken from.
        encoding: bool
            Whether to use full or reduced rank encoding when ``expr`` is categoric.

        Returns
        -------
        out: dict
            See above.

        Raises
        ------
        ValueError
            If the factor is not made of a single ``Variable`` component.
        """
        # Evaluate factor and save type to self.factor_type.
        # Note: factor can't be a call or interaction yet.
        components = self.factor.components
        if not (len(components) == 1 and isinstance(components[0], Variable)):
            raise ValueError(
                "Factor on right hand side of group specific term must be a single term."
            )

        factor = data[self.factor.name]
        type_ = factor.dtype
        if not (hasattr(type_, "ordered") and type_.ordered):
            categories = sorted(factor.unique().tolist())
            type_ = pd.api.types.CategoricalDtype(categories=categories, ordered=True)
            factor = factor.astype(type_)
        self.factor_type = type_

        # Note we don't use drop_first=True for the factor.
        self.expr.set_type(data, eval_env)
        self.expr.set_data(encoding)
        Xi = self.expr.data
        Ji = pd.get_dummies(factor).to_numpy()
        Zi = linalg.khatri_rao(Ji.T, Xi.T).T

        metadata = self.expr.metadata
        out = {
            "type": metadata["type"],
            "Xi": Xi,
            "Ji": Ji,
            "Zi": sparse.coo_matrix(Zi),
            "groups": factor.cat.categories.tolist(),
        }

        expr_type = self.expr._type  # pylint: disable = protected-access
        if expr_type == "categoric":
            for key in ("levels", "reference", "encoding"):
                out[key] = self.expr.metadata[key]
        elif expr_type == "interaction":
            out["terms"] = self.expr.metadata["terms"]
        return out
Exemplo n.º 24
0
    def _evaluate(self, data, eval_env):
        """Evaluate group specific terms.

        This evaluates ``self.terms`` inside the data mask provided by ``data`` and the environment
        ``eval_env``. It updates ``self.design_matrix`` with the result from the evaluation of each
        term (the attribute is left untouched when there are no terms).

        This method also sets the values of ``self.data`` and ``self.eval_env``, rebuilds the
        dictionary ``self.terms_info`` with information related to each term, such as the type,
        the columns and rows they occupy in the design matrix and the names of the columns, and
        finally marks the matrix as evaluated.

        Parameters
        ----------
        data: pandas.DataFrame
            The data frame where variables are taken from
        eval_env: EvalEnvironment
            The environment where values and functions are taken from.
        """
        self.data = data
        self.eval_env = eval_env
        Z = []
        self.terms_info = {}
        next_row = next_col = 0

        for term in self.terms:
            # If both (1|g) and (x|g) are in the model, then the encoding for x is False.
            encoding = isinstance(term.expr, Intercept) or not any(
                other.factor == term.factor and isinstance(other.expr, Intercept)
                for other in self.terms
            )
            d = term.eval(self.data, self.eval_env, encoding)

            if d["type"] == "categoric":
                # Reduced encoding has no column for the first level.
                if d["encoding"] == "full":
                    levels = d["levels"]
                else:
                    levels = d["levels"][1:]
                for idx, level in enumerate(levels):
                    Xi = np.atleast_2d(d["Xi"][:, idx]).T
                    Ji = d["Ji"]
                    Zi = linalg.khatri_rao(Ji.T, Xi.T).T
                    n_rows, n_cols = Zi.shape
                    Z.append(Zi)
                    term_name = term.to_string(level)
                    self.terms_info[term_name] = {
                        "type": "categoric",
                        "Xi": Xi,
                        "Ji": Ji,
                        "groups": d["groups"],
                        "encoding": d["encoding"],
                        "levels": d["levels"],
                        "reference": d["reference"],
                        "full_names": [f"{term_name}[{group}]" for group in d["groups"]],
                        "idxs": (
                            slice(next_row, next_row + n_rows),
                            slice(next_col, next_col + n_cols),
                        ),
                    }
                    next_row += n_rows
                    next_col += n_cols
            else:
                Zi = d["Zi"]
                n_rows, n_cols = Zi.shape
                Z.append(Zi)
                term_name = term.to_string()
                info = {key: value for key, value in d.items() if key != "Zi"}
                info["idxs"] = (
                    slice(next_row, next_row + n_rows),
                    slice(next_col, next_col + n_cols),
                )
                # The entry is stored before the full names are requested,
                # keeping the original call order.
                self.terms_info[term_name] = info
                info["full_names"] = self._term_full_names(term_name)
                next_row += n_rows
                next_col += n_cols

        # Stored in Compressed Sparse Column format
        if Z:
            self.design_matrix = sp.sparse.block_diag(Z).tocsc()

        self.evaluated = True