Exemplo n.º 1
0
def monotonicity_prox(tensor, decreasing=False):
    """
    This function projects each column of the input array on the set of arrays so that
          x[1] <= x[2] <= ... <= x[n] (decreasing=False)
                        or
          x[1] >= x[2] >= ... >= x[n] (decreasing=True)
    is satisfied columnwise.

    Parameters
    ----------
    tensor : ndarray
    decreasing : If it is True, function returns columnwise
                 monotone decreasing tensor. Otherwise, returned array
                 will be monotone increasing.
                 Default: True

    Returns
    -------
    ndarray
          A tensor of which columns' are monotonic.

    References
    ----------
    .. [1]: G. Chierchia, E. Chouzenoux, P. L. Combettes, and J.-C. Pesquet
            "The Proximity Operator Repository. User's guide"
    """
    if tl.ndim(tensor) == 1:
        tensor = tl.reshape(tensor, [tl.shape(tensor)[0], 1])
    elif tl.ndim(tensor) > 2:
        raise ValueError(
            "Monotonicity prox doesn't support an input which has more than 2 dimensions."
        )
    tensor_mon = tl.copy(tensor)
    if decreasing:
        tensor_mon = tl.flip(tensor_mon, axis=0)
    row, column = tl.shape(tensor_mon)
    cum_sum = tl.cumsum(tensor_mon, axis=0)
    for j in range(column):
        assisted_tensor = tl.zeros([row, row])
        for i in range(row):
            if i == 0:
                assisted_tensor = tl.index_update(
                    assisted_tensor, tl.index[i, i:], cum_sum[i:, j] /
                    tl.tensor(tl.arange(row - i) + 1, **tl.context(tensor)))
            else:
                assisted_tensor = tl.index_update(
                    assisted_tensor, tl.index[i, i:],
                    (cum_sum[i:, j] - cum_sum[i - 1, j]) /
                    tl.tensor(tl.arange(row - i) + 1, **tl.context(tensor)))
        tensor_mon = tl.index_update(tensor_mon, tl.index[:, j],
                                     tl.max(assisted_tensor, axis=0))
        for i in reversed(range(row - 1)):
            if tensor_mon[i, j] > tensor_mon[i + 1, j]:
                tensor_mon = tl.index_update(tensor_mon, tl.index[i, j],
                                             tensor_mon[i + 1, j])
    if decreasing:
        tensor_mon = tl.flip(tensor_mon, axis=0)
    return tensor_mon
Exemplo n.º 2
0
def test_tucker(monkeypatch):
    """Test for the Tucker decomposition"""
    rng = tl.check_random_state(1234)

    tol_norm_2 = 10e-3
    tol_max_abs = 10e-1
    tensor = tl.tensor(rng.random_sample((3, 4, 3)))
    core, factors = tucker(tensor, rank=None, n_iter_max=200, verbose=True)
    reconstructed_tensor = tucker_to_tensor((core, factors))
    norm_rec = tl.norm(reconstructed_tensor, 2)
    norm_tensor = tl.norm(tensor, 2)
    assert((norm_rec - norm_tensor)/norm_rec < tol_norm_2)

    # Test the max abs difference between the reconstruction and the tensor
    assert(tl.max(tl.abs(reconstructed_tensor - tensor)) < tol_max_abs)

    # Test the shape of the core and factors
    ranks = [2, 3, 1]
    core, factors = tucker(tensor, rank=ranks, n_iter_max=100, verbose=1)
    for i, rank in enumerate(ranks):
        assert_equal(factors[i].shape, (tensor.shape[i], ranks[i]),
                     err_msg="factors[{}].shape={}, expected {}".format(
                         i, factors[i].shape, (tensor.shape[i], ranks[i])))
        assert_equal(tl.shape(core)[i], rank, err_msg="Core.shape[{}]={}, "
                     "expected {}".format(i, core.shape[i], rank))

    # try fixing the core
    factors_init = [tl.copy(f) for f in factors]
    _, factors = tucker(tensor, rank=ranks, init=(core, factors), fixed_factors=[1], n_iter_max=100, verbose=1)
    assert_array_equal(factors[1], factors_init[1])

    # Random and SVD init should converge to a similar solution
    tol_norm_2 = 10e-1
    tol_max_abs = 10e-1

    core_svd, factors_svd = tucker(tensor, rank=[3, 4, 3], n_iter_max=200, init='svd', verbose=1)
    core_random, factors_random = tucker(tensor, rank=[3, 4, 3], n_iter_max=200, init='random', random_state=1234)
    rec_svd = tucker_to_tensor((core_svd, factors_svd))
    rec_random = tucker_to_tensor((core_random, factors_random))
    error = tl.norm(rec_svd - rec_random, 2)
    error /= tl.norm(rec_svd, 2)
    assert_(error < tol_norm_2,
            'norm 2 of difference between svd and random init too high')
    assert_(tl.max(tl.abs(rec_svd - rec_random)) < tol_max_abs,
            'abs norm of difference between svd and random init too high')
    assert_class_wrapper_correctly_passes_arguments(monkeypatch, tucker, Tucker, ignore_args={}, rank=3)
Exemplo n.º 3
0
def hard_thresholding(tensor, number_of_non_zero):
    """
    Proximal operator of the l0 ``norm''
    Keeps greater "number_of_non_zero" elements untouched and sets other elements to zero.

    Parameters
    ----------
    tensor : ndarray
    number_of_non_zero : int

    Returns
    -------
    ndarray
          Thresholded tensor on which the operator has been applied
    """
    tensor_vec = tl.copy(tl.tensor_to_vec(tensor))
    sorted_indices = tl.argsort(tl.argsort(tl.abs(tensor_vec),
                                           axis=0,
                                           descending=True),
                                axis=0)
    return tl.reshape(
        tl.where(sorted_indices < number_of_non_zero, tensor_vec,
                 tl.tensor(0, **tl.context(tensor_vec))), tensor.shape)
Exemplo n.º 4
0
def parafac(tensor, rank, n_iter_max=100, init='svd', svd='numpy_svd',\
            normalize_factors=False, orthogonalise=False,\
            tol=1e-8, random_state=None,\
            verbose=0, return_errors=False,\
            sparsity = None,\
            l2_reg = 0,  mask=None,\
            cvg_criterion = 'abs_rec_error',\
            fixed_modes = [],
            svd_mask_repeats=5,
            linesearch = False):
    """CANDECOMP/PARAFAC decomposition via alternating least squares (ALS)
    Computes a rank-`rank` decomposition of `tensor` [1]_ such that::

        tensor = [|weights; factors[0], ..., factors[-1] |].

    Parameters
    ----------
    tensor : ndarray
    rank  : int
        Number of components.
    n_iter_max : int
        Maximum number of iteration
    init : {'svd', 'random'}, optional
        Type of factor matrix initialization. See `initialize_factors`.
    svd : str, default is 'numpy_svd'
        function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS
    normalize_factors : if True, aggregate the weights of each factor in a 1D-tensor
        of shape (rank, ), which will contain the norms of the factors
    tol : float, optional
        (Default: 1e-6) Relative reconstruction error tolerance. The
        algorithm is considered to have found the global minimum when the
        reconstruction error is less than `tol`.
    random_state : {None, int, np.random.RandomState}
    verbose : int, optional
        Level of verbosity
    return_errors : bool, optional
        Activate return of iteration errors
    mask : ndarray
        array of booleans with the same shape as ``tensor`` should be 0 where
        the values are missing and 1 everywhere else. Note:  if tensor is
        sparse, then mask should also be sparse with a fill value of 1 (or
        True). Allows for missing values [2]_
    cvg_criterion : {'abs_rec_error', 'rec_error'}, optional
       Stopping criterion for ALS, works if `tol` is not None. 
       If 'rec_error',  ALS stops at current iteration if ``(previous rec_error - current rec_error) < tol``.
       If 'abs_rec_error', ALS terminates when `|previous rec_error - current rec_error| < tol`.
    sparsity : float or int
        If `sparsity` is not None, we approximate tensor as a sum of low_rank_component and sparse_component, where low_rank_component = cp_to_tensor((weights, factors)). `sparsity` denotes desired fraction or number of non-zero elements in the sparse_component of the `tensor`.
    fixed_modes : list, default is []
        A list of modes for which the initial value is not modified.
        The last mode cannot be fixed due to error computation.
    svd_mask_repeats: int
        If using a tensor with masked values, this initializes using SVD multiple times to
        remove the effect of these missing values on the initialization.
    linesearch : bool, default is False
        Whether to perform line search as proposed by Bro [3].

    Returns
    -------
    CPTensor : (weight, factors)
        * weights : 1D array of shape (rank, )

          * all ones if normalize_factors is False (default)
          * weights of the (normalized) factors otherwise

        * factors : List of factors of the CP decomposition element `i` is of shape ``(tensor.shape[i], rank)``
        * sparse_component : nD array of shape tensor.shape. Returns only if `sparsity` is not None.

    errors : list
        A list of reconstruction errors at each iteration of the algorithms.

    References
    ----------
    .. [1] T.G.Kolda and B.W.Bader, "Tensor Decompositions and Applications", SIAM 
           REVIEW, vol. 51, n. 3, pp. 455-500, 2009.

    .. [2] Tomasi, Giorgio, and Rasmus Bro. "PARAFAC and missing values." 
           Chemometrics and Intelligent Laboratory Systems 75.2 (2005): 163-180.

    .. [3] R. Bro, "Multi-Way Analysis in the Food Industry: Models, Algorithms, and 
           Applications", PhD., University of Amsterdam, 1998
    """
    rank = validate_cp_rank(tl.shape(tensor), rank=rank)
    
    if orthogonalise and not isinstance(orthogonalise, int):
        orthogonalise = n_iter_max

    if linesearch:
        acc_pow = 2.0 # Extrapolate to the iteration^(1/acc_pow) ahead
        acc_fail = 0 # How many times acceleration have failed
        max_fail = 4 # Increase acc_pow with one after max_fail failure

    weights, factors = initialize_cp(tensor, rank, init=init, svd=svd,
                                 random_state=random_state,
                                 normalize_factors=normalize_factors)

    if mask is not None and init == "svd":
        for _ in range(svd_mask_repeats):
            tensor = tensor*mask + tl.cp_to_tensor((weights, factors), mask=1-mask)

            weights, factors = initialize_cp(tensor, rank, init=init, svd=svd, random_state=random_state, normalize_factors=normalize_factors)

    rec_errors = []
    norm_tensor = tl.norm(tensor, 2)
    Id = tl.eye(rank, **tl.context(tensor))*l2_reg

    if tl.ndim(tensor)-1 in fixed_modes:
        warnings.warn('You asked for fixing the last mode, which is not supported.\n The last mode will not be fixed. Consider using tl.moveaxis()')
        fixed_modes.remove(tl.ndim(tensor)-1)
    modes_list = [mode for mode in range(tl.ndim(tensor)) if mode not in fixed_modes]

    if sparsity:
        sparse_component = tl.zeros_like(tensor)
        if isinstance(sparsity, float):
            sparsity = int(sparsity * np.prod(tensor.shape))
        else:
            sparsity = int(sparsity)

    for iteration in range(n_iter_max):
        if orthogonalise and iteration <= orthogonalise:
            factors = [tl.qr(f)[0] if min(tl.shape(f)) >= rank else f for i, f in enumerate(factors)]

        if linesearch and iteration % 2 == 0:
            factors_last = [tl.copy(f) for f in factors]
            weights_last = tl.copy(weights)

        if verbose > 1:
            print("Starting iteration", iteration + 1)
        for mode in modes_list:
            if verbose > 1:
                print("Mode", mode, "of", tl.ndim(tensor))

            pseudo_inverse = tl.tensor(np.ones((rank, rank)), **tl.context(tensor))
            for i, factor in enumerate(factors):
                if i != mode:
                    pseudo_inverse = pseudo_inverse*tl.dot(tl.conj(tl.transpose(factor)), factor)
            pseudo_inverse += Id

            if not iteration and weights is not None:
                # Take into account init weights
                mttkrp = unfolding_dot_khatri_rao(tensor, (weights, factors), mode)
            else:
                mttkrp = unfolding_dot_khatri_rao(tensor, (None, factors), mode)

            factor = tl.transpose(tl.solve(tl.conj(tl.transpose(pseudo_inverse)),
                                    tl.transpose(mttkrp)))

            if normalize_factors:
                scales = tl.norm(factor, 2, axis=0)
                weights = tl.where(scales==0, tl.ones(tl.shape(scales), **tl.context(factor)), scales)
                factor = factor / tl.reshape(weights, (1, -1))

            factors[mode] = factor

        # Will we be performing a line search iteration
        if linesearch and iteration % 2 == 0 and iteration > 5:
            line_iter = True
        else:
            line_iter = False

        # Calculate the current unnormalized error if we need it
        if (tol or return_errors) and line_iter is False:
            unnorml_rec_error, tensor, norm_tensor = error_calc(tensor, norm_tensor, weights, factors, sparsity, mask, mttkrp)
        else:
            if mask is not None:
                tensor = tensor*mask + tl.cp_to_tensor((weights, factors), mask=1-mask)

        # Start line search if requested.
        if line_iter is True:
            jump = iteration ** (1.0 / acc_pow)

            new_weights = weights_last + (weights - weights_last) * jump
            new_factors = [factors_last[ii] + (factors[ii] - factors_last[ii])*jump for ii in range(tl.ndim(tensor))]

            new_rec_error, new_tensor, new_norm_tensor = error_calc(tensor, norm_tensor, new_weights, new_factors, sparsity, mask)

            if (new_rec_error / new_norm_tensor) < rec_errors[-1]:
                factors, weights = new_factors, new_weights
                tensor, norm_tensor = new_tensor, new_norm_tensor
                unnorml_rec_error = new_rec_error
                acc_fail = 0

                if verbose:
                    print("Accepted line search jump of {}.".format(jump))
            else:
                unnorml_rec_error, tensor, norm_tensor = error_calc(tensor, norm_tensor, weights, factors, sparsity, mask, mttkrp)
                acc_fail += 1

                if verbose:
                    print("Line search failed for jump of {}.".format(jump))

                if acc_fail == max_fail:
                    acc_pow += 1.0
                    acc_fail = 0

                    if verbose:
                        print("Reducing acceleration.")

        rec_error = unnorml_rec_error / norm_tensor
        rec_errors.append(rec_error)

        if tol:

            if iteration >= 1:
                rec_error_decrease = rec_errors[-2] - rec_errors[-1]
                
                if verbose:
                    print("iteration {}, reconstruction error: {}, decrease = {}, unnormalized = {}".format(iteration, rec_error, rec_error_decrease, unnorml_rec_error))

                if cvg_criterion == 'abs_rec_error':
                    stop_flag = abs(rec_error_decrease) < tol
                elif cvg_criterion == 'rec_error':
                    stop_flag =  rec_error_decrease < tol
                else:
                    raise TypeError("Unknown convergence criterion")
                
                if stop_flag:
                    if verbose:
                        print("PARAFAC converged after {} iterations".format(iteration))
                    break
                    
            else:
                if verbose:
                    print('reconstruction error={}'.format(rec_errors[-1]))

    cp_tensor = CPTensor((weights, factors))
    
    if sparsity:
        sparse_component = sparsify_tensor(tensor -\
                                           cp_to_tensor((weights, factors)),\
                                           sparsity)
        cp_tensor = (cp_tensor, sparse_component)

    if return_errors:
        return cp_tensor, rec_errors
    else:
        return cp_tensor
Exemplo n.º 5
0
def fista(UtM,
          UtU,
          x=None,
          n_iter_max=100,
          non_negative=True,
          sparsity_coef=0,
          lr=None,
          tol=10e-8):
    """
    Fast Iterative Shrinkage Thresholding Algorithm (FISTA)

    Computes an approximate (nonnegative) solution for Ux=M linear system.

    Parameters
    ----------
    UtM : ndarray
        Pre-computed product of the transposed of U and M
    UtU : ndarray
        Pre-computed product of the transposed of U and U
    x : init
       Default: None
    n_iter_max : int
        Maximum number of iteration
        Default: 100
    non_negative : bool, default is False
                   if True, result will be non-negative
    lr : float
        learning rate
        Default : None
    sparsity_coef : float or None
    tol : float
        stopping criterion

    Returns
    -------
    x : approximate solution such that Ux = M

    Notes
    -----
    We solve the following problem :math: `1/2 ||m - Ux ||_2^2 + \\lambda |x|_1`

    Reference
    ----------
    [1] : Beck, A., & Teboulle, M. (2009). A fast iterative
          shrinkage-thresholding algorithm for linear inverse problems.
          SIAM journal on imaging sciences, 2(1), 183-202.
    """
    if sparsity_coef is None:
        sparsity_coef = 0

    if x is None:
        x = tl.zeros(tl.shape(UtM), **tl.context(UtM))
    if lr is None:
        lr = 1 / (tl.partial_svd(UtU)[1][0])
    # Parameters
    momentum_old = tl.tensor(1.0)
    norm_0 = 0.0
    x_update = tl.copy(x)

    for iteration in range(n_iter_max):
        if isinstance(UtU, list):
            x_gradient = -UtM + tl.tenalg.multi_mode_dot(
                x_update, UtU, transpose=False) + sparsity_coef
        else:
            x_gradient = -UtM + tl.dot(UtU, x_update) + sparsity_coef

        if non_negative is True:
            x_gradient = tl.where(lr * x_gradient < x_update, x_gradient,
                                  x_update / lr)

        x_new = x_update - lr * x_gradient
        momentum = (1 + tl.sqrt(1 + 4 * momentum_old**2)) / 2
        x_update = x_new + ((momentum_old - 1) / momentum) * (x_new - x)
        momentum_old = momentum
        x = tl.copy(x_new)
        norm = tl.norm(lr * x_gradient)
        if iteration == 1:
            norm_0 = norm
        if norm < tol * norm_0:
            break
    return x
Exemplo n.º 6
0
def unimodality_prox(tensor):
    """
    This function projects each column of the input array on the set of arrays so that
          x[1] <= x[2] <= x[j] >= x[j+1]... >= x[n]
    is satisfied columnwise.

    Parameters
    ----------
    tensor : ndarray

    Returns
    -------
    ndarray
         A tensor of which columns' distribution are unimodal.

    References
    ----------
    .. [1]: Bro, R., & Sidiropoulos, N. D. (1998). Least squares algorithms under
            unimodality and non‐negativity constraints. Journal of Chemometrics:
            A Journal of the Chemometrics Society, 12(4), 223-247.
    """
    if tl.ndim(tensor) == 1:
        tensor = tl.vec_to_tensor(tensor, [tl.shape(tensor)[0], 1])
    elif tl.ndim(tensor) > 2:
        raise ValueError(
            "Unimodality prox doesn't support an input which has more than 2 dimensions."
        )

    tensor_unimodal = tl.copy(tensor)
    monotone_increasing = tl.tensor(monotonicity_prox(tensor),
                                    **tl.context(tensor))
    monotone_decreasing = tl.tensor(monotonicity_prox(tensor, decreasing=True),
                                    **tl.context(tensor))
    # Next line finds mutual peak points
    values = tl.tensor(
        tl.to_numpy((tensor - monotone_decreasing >= 0)) * tl.to_numpy(
            (tensor - monotone_increasing >= 0)), **tl.context(tensor))

    sum_inc = tl.where(values == 1,
                       tl.cumsum(tl.abs(tensor - monotone_increasing), axis=0),
                       tl.tensor(0, **tl.context(tensor)))
    sum_inc = tl.where(values == 1,
                       sum_inc - tl.abs(tensor - monotone_increasing),
                       tl.tensor(0, **tl.context(tensor)))
    sum_dec = tl.where(
        tl.flip(values, axis=0) == 1,
        tl.cumsum(tl.abs(
            tl.flip(tensor, axis=0) - tl.flip(monotone_decreasing, axis=0)),
                  axis=0), tl.tensor(0, **tl.context(tensor)))
    sum_dec = tl.where(
        tl.flip(values, axis=0) == 1, sum_dec -
        tl.abs(tl.flip(tensor, axis=0) - tl.flip(monotone_decreasing, axis=0)),
        tl.tensor(0, **tl.context(tensor)))

    difference = tl.where(values == 1, sum_inc + tl.flip(sum_dec, axis=0),
                          tl.max(sum_inc + tl.flip(sum_dec, axis=0)))
    min_indice = tl.argmin(tl.tensor(difference), axis=0)

    for i in range(len(min_indice)):
        tensor_unimodal = tl.index_update(
            tensor_unimodal, tl.index[:int(min_indice[i]), i],
            monotone_increasing[:int(min_indice[i]), i])
        tensor_unimodal = tl.index_update(
            tensor_unimodal, tl.index[int(min_indice[i] + 1):, i],
            monotone_decreasing[int(min_indice[i] + 1):, i])
    return tensor_unimodal
Exemplo n.º 7
0
def admm(UtM,
         UtU,
         x,
         dual_var,
         n_iter_max=100,
         n_const=None,
         order=None,
         non_negative=None,
         l1_reg=None,
         l2_reg=None,
         l2_square_reg=None,
         unimodality=None,
         normalize=None,
         simplex=None,
         normalized_sparsity=None,
         soft_sparsity=None,
         smoothness=None,
         monotonicity=None,
         hard_sparsity=None,
         tol=1e-4):
    """
    Alternating direction method of multipliers (ADMM) algorithm to minimize a quadratic function under convex constraints.

    Parameters
    ----------
    UtM: ndarray
       Pre-computed product of the transposed of U and M.
    UtU: ndarray
       Pre-computed product of the transposed of U and U.
    x: init
       Default: None
    dual_var : ndarray
               Dual variable to update x
    n_iter_max : int
        Maximum number of iteration
        Default: 100
    n_const : int
        Default : None
    order : int
        Default : None
    non_negative : bool or dictionary
        This constraint is clipping negative values to '0'. If it is True non-negative constraint is applied to all modes.
    l1_reg : float or list or dictionary, optional
    l2_reg : float or list or dictionary, optional
    l2_square : float or list or dictionary, optional
    unimodality : bool or dictionary, optional
        If it is True unimodality constraint is applied to all modes.
    normalize : bool or dictionary, optional
        This constraint divides all the values by maximum value of the input array. If it is True normalize constraint
        is applied to all modes.
    simplex : float or list or dictionary, optional
    normalized_sparsity : float or list or dictionary, optional
    soft_sparsity : float or list or dictionary, optional
    smoothness : float or list or dictionary, optional
    monotonicity : bool or dictionary, optional
    hard_sparsity : float or list or dictionary, optional
    tol : float

    Returns
    -------
    x : Updated ndarray
    x_split : Updated ndarray
    dual_var : Updated ndarray

    Notes
    -----
    ADMM solves the convex optimization problem :math:`\\min_ f(x) + g(z)` where :math: A(x_split) + Bx = c.

    Following updates are iterated to solve the problem::

    .. math::
        \\begin{equation}
            x_split = argmin_(x_split) f(x_split) + (rho/2)||A(x_split) + Bx - c||_2^2
            x = argmin_x g(x) + (rho/2)||A(x_split) + Bx - c||_2^2
            dual_var = dual_var + (Ax + B(x_split) - c)
        \\end{equation}

    where rho is a constant defined by the user.

    Let us define a least square problem such as :math:`\\||Ux - M||^2 + r(x)`.

    ADMM can be adapted to this least square problem as following::

    .. math::
        \\begin{equation}
            x_split = (UtU + rho\times I)\times(UtM + rho\times(x + dual_var)^T)
            x = argmin r(x) + (rho/2)||x - x_split^T + dual_var||_2^2
            dual_var = dual_var + x - x_split^T
        \\end{equation}
    where r is the regularization operator. Here, x can be updated by using proximity operator
    of :math:`x_split^T - dual_var`.

    References
    ----------
    .. [1] Huang, Kejun, Nicholas D. Sidiropoulos, and Athanasios P. Liavas.
           "A flexible and efficient algorithmic framework for constrained matrix and tensor factorization."
           IEEE Transactions on Signal Processing 64.19 (2016): 5052-5065.
    """
    rho = tl.trace(UtU) / tl.shape(x)[1]
    for iteration in range(n_iter_max):
        x_old = tl.copy(x)
        x_split = tl.solve(tl.transpose(UtU + rho * tl.eye(tl.shape(UtU)[1])),
                           tl.transpose(UtM + rho * (x + dual_var)))
        x = proximal_operator(tl.transpose(x_split) - dual_var,
                              non_negative=non_negative,
                              l1_reg=l1_reg,
                              l2_reg=l2_reg,
                              l2_square_reg=l2_square_reg,
                              unimodality=unimodality,
                              normalize=normalize,
                              simplex=simplex,
                              normalized_sparsity=normalized_sparsity,
                              soft_sparsity=soft_sparsity,
                              smoothness=smoothness,
                              monotonicity=monotonicity,
                              hard_sparsity=hard_sparsity,
                              n_const=n_const,
                              order=order)
        if n_const is None:
            x = tl.transpose(tl.solve(tl.transpose(UtU), tl.transpose(UtM)))
            return x, x_split, dual_var
        dual_var = dual_var + x - tl.transpose(x_split)

        dual_residual = x - tl.transpose(x_split)
        primal_residual = x - x_old

        if tl.norm(dual_residual) < tol * tl.norm(x) and tl.norm(
                primal_residual) < tol * tl.norm(dual_var):
            break
    return x, x_split, dual_var
Exemplo n.º 8
0
def one_ntd_step(tensor, ranks, in_core, in_factors, norm_tensor,
                 sparsity_coefficients, fixed_modes, normalize, mode_core_norm, 
                 alpha=0.5, delta=0.01):
    """
    One pass of Hierarchical Alternating Least Squares update along all modes,
    and gradient update on the core,
    which decreases reconstruction error in Nonnegative Tucker Decomposition.

    Update the factors by solving a least squares problem per mode, as described in [1].

    Note that the unfolding order is the one described in [2], which is different from [1].

    This function is strictly superior to a least squares solver ran on the
    matricized problems min_X ||Y - AX||_F^2 since A is structured as a
    Kronecker product of the other factors/core.

    Tensors are manipulated with the tensorly toolbox [3].

    Parameters
    ----------
    unfolded_tensors: list of array
        The spectrogram tensor, unfolded according to all its modes.
    ranks: list of integers
        Ranks for eac factor of the decomposition.
    in_core : tensorly tensor
        Current estimates of the core
    in_factors: list of array
        Current estimates for the factors of this NTD.
        The value of factor[update_mode] will be updated using a least squares update.
        The values in in_factors are not modified.
    norm_tensor : float
        The Frobenius norm of the input tensor
    sparsity_coefficients: list of float (as much as the number of modes + 1 for the core)
        The sparsity coefficients on each factor and on the core respectively.
    fixed_modes: list of integers (between 0 and the number of modes + 1 for the core)
        Has to be set not to update a factor, taken in the order of modes and lastly on the core.
    normalize: list of boolean (as much as the number of modes + 1 for the core)
        A boolean whereas the factors need to be normalized.
        The normalization is a l_2 normalization on each of the rank components
        (For the factors, each column will be normalized, ie each atom of the dimension of the current rank).
    mode_core_norm: integer or None
        The mode on which normalize the core, or None if normalization shouldn't be enforced.
        Will only be useful if the last element of the previous "normalise" argument is set to True.
        Indexes of the modes start at 0.
        Default: None
    alpha : positive float
        Ratio between outer computations and inner loops. Typically set to 0.5 or 1.
        Set to +inf in the deterministic mode, as it depends on runtime.
        Default: 0.5
    delta : float in [0,1]
        Early stop criterion, while err_k > delta*err_0. Set small for
        almost exact nnls solution, or larger (e.g. 1e-2) for inner loops
        of a NTD computation.
        Default: 0.01


    Returns
    -------
    core: tensorly tensor
        The core tensor linking the factors of the decomposition
    factors: list of factors
        An array containing all the factors computed with the NTD
    cost_fct_val:
        The value of the cost function at this step,
        normalized by the squared norm of the original tensor.
    
    References
    ----------
    [1] Tamara G Kolda and Brett W Bader. "Tensor decompositions and applications",
    SIAM review 51.3 (2009), pp. 455{500.

    [2] Jeremy E Cohen. "About notations in multiway array processing",
    arXiv preprint arXiv:1511.01306, (2015).

    [3] J. Kossai et al. "TensorLy: Tensor Learning in Python",
    arxiv preprint (2018)
    """

    # Avoiding errors
    for fixed_value in fixed_modes:
        sparsity_coefficients[fixed_value] = None

    # Copy
    core = in_core.copy()
    factors = in_factors.copy()

    # Generating the mode update sequence
    modes_list = [mode for mode in range(tl.ndim(tensor)) if mode not in fixed_modes]

    for mode in modes_list:

        #unfolded_core = tl.base.unfold(core, mode)

        tic = time.time()

        # UtU
        # First, element-wise products
        # some computations could be reused but the gain is small.
        elemprod = factors.copy()
        for i, factor in enumerate(factors):
            if i != mode:
                elemprod[i] = tl.dot(tl.conj(tl.transpose(factor)), factor)
        # Second, the multiway product with core G
        temp = tl.tenalg.multi_mode_dot(core, elemprod, skip=mode)
        # this line can be computed with tensor contractions
        con_modes = [i for i in range(tl.ndim(tensor)) if i != mode]
        UtU = tl.tenalg.contract(temp, con_modes, core, con_modes)
        #UtU = unfold(temp, mode)@tl.transpose(unfold(core, mode))

        # UtM
        # First, the contraction of data with other factors
        temp = tl.tenalg.multi_mode_dot(tensor, factors, skip=mode, transpose = True)
        # again, computable by tensor contractions
        #MtU = unfold(temp, mode)@tl.transpose(unfold(core, mode))
        MtU = tl.tenalg.contract(temp, con_modes, core, con_modes)
        UtM = tl.transpose(MtU)


        # Computing the Kronekcer product
        #kron = tl.tenalg.kronecker(factors, skip_matrix = mode, reverse = False)
        #kron_core = tl.dot(kron, tl.transpose(unfolded_core))
        #rhs = tl.dot(unfolded_tensors[mode], kron_core)

        # Maybe suboptimal
        #cross = tl.dot(tl.transpose(kron_core), kron_core)

        timer = time.time() - tic

        # Call the hals resolution with nnls, optimizing the current mode
        factors[mode] = tl.transpose(nnls.hals_nnls_acc(UtM, UtU, tl.transpose(factors[mode]),
               maxiter=100, atime=timer, alpha=alpha, delta=delta,
               sparsity_coefficient = sparsity_coefficients[mode], normalize = normalize[mode])[0])

    #refolded_tensor = tl.base.fold(unfolded_tensors[0], 0, tensor_shape)

    # Core update
    #all_MtX = tl.tenalg.multi_mode_dot(tensor, factors, transpose = True)
    # better implementation: reuse the computation of temp !
    # Also reuse elemprod form last update
    all_MtX = tl.tenalg.mode_dot(temp, tl.transpose(factors[modes_list[-1]]), modes_list[-1])
    all_MtM = tl.copy(elemprod)
    all_MtM[modes_list[-1]] = factors[modes_list[-1]].T@factors[modes_list[-1]]

    #all_MtM = np.array([fac.T@fac for fac in factors])

    # Projected gradient
    gradient_step = 1
    #print(f"factors[modes_list[-1]]: {factors[modes_list[-1]]}")        

    #print(f"all_MtM: {all_MtM}")
    for MtM in all_MtM:
        #print(f"MtM: {MtM}")
        gradient_step *= 1/(scipy.sparse.linalg.svds(MtM, k=1)[1][0])

    gradient_step = round(gradient_step, 6) # Heurisitc, to avoid consecutive imprecision

    cnt = 1
    upd_0 = 0
    upd = 1

    if sparsity_coefficients[-1] is None:
        sparse = 0
    else:
        sparse = sparsity_coefficients[-1]

    # TODO: dynamic stopping criterion
    # Maybe: try fast gradient instead of gradient
    while cnt <= 300 and upd>= delta * upd_0:
        gradient = - all_MtX + tl.tenalg.multi_mode_dot(core, all_MtM, transpose = False) + sparse * tl.ones(core.shape)

        # Proposition of reformulation for error computations
        delta_core = np.minimum(gradient_step*gradient, core)
        core = core - delta_core
        upd = tl.norm(delta_core)
        if cnt == 1:
            upd_0 = upd

        cnt += 1

    if normalize[-1]:
        unfolded_core = tl.unfold(core, mode_core_norm)
        for idx_mat in range(unfolded_core.shape[0]):
            if tl.norm(unfolded_core[idx_mat]) != 0:
                unfolded_core[idx_mat] = unfolded_core[idx_mat] / tl.norm(unfolded_core[idx_mat], 2)
        core = tl.fold(unfolded_core, mode_core_norm, core.shape)

    # Adding the l1 norm value to the reconstruction error
    sparsity_error = 0
    for index, sparse in enumerate(sparsity_coefficients):
        if sparse:
            if index < len(factors):
                sparsity_error += 2 * (sparse * np.linalg.norm(factors[index], ord=1))
            elif index == len(factors):
                sparsity_error += 2 * (sparse * tl.norm(core, 1))
            else:
                raise NotImplementedError("TODEBUG: Too many sparsity coefficients, should have been raised before.")

    rec_error = norm_tensor ** 2 - 2*tl.tenalg.inner(all_MtX, core) + tl.tenalg.inner(tl.tenalg.multi_mode_dot(core, all_MtM, transpose = False), core)
    cost_fct_val = (rec_error + sparsity_error) / (norm_tensor ** 2)

    #exhaustive_rec_error = (tl.norm(tensor - tl.tenalg.multi_mode_dot(core, factors, transpose = False), 2) + sparsity_error) / norm_tensor
    #print("diff: " + str(rec_error - exhaustive_rec_error))
    #print("max" + str(np.amax(factors[2])))
    return core, factors, cost_fct_val #  exhaustive_rec_error
Exemplo n.º 9
0
def gcp(X, R, type='normal', func=None, grad=None, lower=None,\
        opt='lbfgsb', mask=None, maxiters=1000, \
        init='random', printitn=10, state=None, factr=1e7, pgtol=1e-4, \
        fsamp=None, gsamp=None, oversample=1.1, sampler='uniform', \
        fsampler=None, rate=1e-3, decay=0.1, maxfails=1, epciters=1000, \
        festtol=-math.inf, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """Generalized CANDECOMP/PARAFAC (GCP) decomposition via all-at-once optimization (OPT) [1]
    Computes a rank-'R' decomposition of 'tensor' such that::

      tensor = [|weights; factors[0], ..., factors[-1] |].

    GCP-OPT allows the use of a variety of statistically motivated loss functions
    suited to the data held in a tensor (i.e. continuous, discrete, binary, etc)

    Parameters
    ----------
    X : ndarray
        Tensor to factorize
        **COMING SOON**
        Sparse tensor support
    R : int
        Rank of decomposition (Number of components).
    type : str,
        Type of objective function used
        Options include:
            'normal' or 'gaussian'          - Gaussian for real-valued data (DEFAULT)
            'binary' or 'bernoulli-odds'    - Bernoulli w/ odds link for binary data
            'bernoulli-logit'               - Bernoulli w/ logit link for binary data
            'count' or 'poisson'            - Poisson for count data
            'poisson-log'                   - Poisson w/ log link for count data
            'rayleigh'                      - Rayleigh distribution for real-valued data
            'gamma'                         - Gamma distribution for non-negative real-valued data
        **COMING SOON**:
            'huber (DELTA)                  - Similar to Gaussian, for real-valued data
            'negative-binomial (r)'         - Negative binomial for count data
            'beta (BETA)'                   - Beta divergence for non-negative real-valued data
            'user-specified'                - Customized objective function provided by user
    func: lambda function
        User specified custom objective function, eg. lambda x, m: (m-x)**2
    grad: lambda function
        User specified custom gradient function, eg. lambda x, m: 2*(m-x)
    lower: 0 or -inf
        Lower bound for custom objective/gradient
    opt : str
        Optimization method
        Options include:
            'lbfgsb'    - Bound-constrained limited-memory BFGS
            'sgd'       - Stochastic gradient descent (SGD)
            **COMING SOON**
            'adam'      - Momentum-based SGD method
            'adagrad'   - Adaptive gradient algorithm, well suited for sparse data
        If 'tensor' is dense, all 4 options can be used, 'lbfgsb' by default.
        **COMING SOON** - Sparse format support
        If 'tensor' is sparse, only 'sgd', 'adam' and 'adagrad' can be used, 'adam' by
        default.
        Each method has specific parameters, see documentation
    mask : ndarray
        Specifies a mask, 0's for missing/incomplete entries, 1's elsewhere, with
        the same shape as 'tensor'.
        **COMING SOON** - Missing/incomplete data simulation.
    maxiters : int
        Maximum number of outer iterations, 1000 by default.
    init : {'random', 'svd', cptensor}
        Initialization for factor matrices, 'random' by default.
        Options include:
            'random'    - random initialization from a uniform distribution on [0,1)
            'svd'       - initialize the `m`th factor matrix using the `rank` left
                          singular vectors of the `m`th unfolding of the input tensor.
            cptensor    - initialization provided by user.  NOTE: weights are pulled
                          in the last factor and then the weights are set to "1" for
                          the output tensor.
        Initializations all result in a cptensor where the weights are one.
    printitn : int
        Print every n iterations; 0 for no printing, 10 by default.
    state : {None, int, np.random.RandomState}
        Seed for reproducable random number generation
    factr : float
        (L-BFGS-B parameter)
        Tolerance on the change of objective values. Defaults to 1e7.
    pgtol : float
        (L-BFGS-B parameter)
        Projected gradient tolerance.  Defaults to 1e-5
    sampler : {uniform, stratified, semi-stratified}
        Type of sampling to use for stochastic gradient (SGD/ADAM/ADAGRAD).
        Defaults to 'uniform' for dense tensors.
        Defaults to 'stratified' for sparse tensors.
        Options include:
            'uniform'           - Uniform random sampling
            **COMING SOON**
            'stratified'        - Stratified sampling, targets sparse data. Zero and
                                  nonzero values sampled separately.
            'semi-stratified'   - Similar to stratified sampling, but is more
                                  computationally efficient (See papers referenced).
    gsamp : int
        Number of samples for stochastic gradient (SGD/ADAM/ADAGRAD parameter).
        Generally set to be O(sum(shape)*R).
        **COMING SOON**
        For stratified or semi-stratified, this may be two numbers:
            - the number of nnz samples
            - the number of zero samples.
        If only one number is specified, then this value is used for both nnzs and
        zeros (total number of samples is 2x specified value in this case).
    fsampler : {'uniform', 'stratified', custom}
        Type of sampling for estimating objective function (SGD/ADAM/ADAGRAD parameter).
        Options include:
            'uniform'       - Uniform random sampling
            **COMING SOON**
            'stratified'    - Stratified sampling, targets sparse data. Zero and
                              nonzero values sampled separately.
            custom          - User-defined sampler (lambda function). Custom option
                              is primarily useful in reusing sampled elements across
                              multiple tests.
    fsamp : int
        (SGD/ADAM/ADAGRAD parameter)
        Number of samples to estimate objective function.
        This should generally be somewhat large since we want this sample to generate a
        reliable estimate of the true function value.
    oversample : float
        (Stratified sampling parameter)
        Factor to oversample when implicitly sampling zeros in the sparse case.
        Defaults to 1.1. Only adjust for very small tensors.
    rate : float
        (SGD/ADAM parameter)
        Initial learning rate. Defaults to 1e-3.
    decay : float
        (SGD/ADAM parameter)
        Amount to decrease learning rate when progress stagnates, i.e. no change in
        objective function between epochs.  Defaults to 0.1.
    maxfails : int
        (SGD/ADAM parameter)
        Number of times to decrease the learning rate.
        Defaults to 1, may be set to zero.
    epciters : int
        (SGD/ADAM parameter)
        Iterations per epoch. Defaults to 1000.
    festtol : float
        (SGD/ADAM parameter)
        Quit estimation of function if it goes below this level.
        Defaults to -inf.
    beta1 : float
        (ADAM parameter)    - generally doesn't need to be changed
        Defaults to 0.9
    beta2 : float
        (ADAM parameter)    - generally doesn't need to be changed
        Defaults to 0.999
    epsilon : float
        (ADAM parameter)    - generally doesn't need to be changed
        Defaults to 1e-8

    Returns
    -------
    Mfin : CPTensor
        Canonical polyadic decomposition of input tensor X

    Reference
    ---------
    [1] D. Hong, T. G. Kolda, J. A. Duersch, Generalized Canonical
        Polyadic Tensor Decomposition, SIAM Review, 62:133-163, 2020,
        https://doi.org/10.1137/18M1203626
    [2] T. G. Kolda, D. Hong, Stochastic Gradients for Large-Scale Tensor
        Decomposition. SIAM J. Mathematics of Data Science, 2:1066-1095,
        2020, https://doi.org/10.1137/19m1266265

    """
    # Timer - Setup (outside optimization)
    start_setup0 = time.perf_counter()

    # Initial setup
    nd = tl.ndim(X)
    sz = tl.shape(X)
    tsz = X.size
    X_context = tl.context(X)
    vecsz = 0
    for i in range(nd):
        # tsz *= sz[i]
        vecsz += sz[i]
    vecsz *= R
    W = mask

    # Random set-up
    if state is not None:
        state = tl.check_random_state(state)

    # capture stats(nnzs, zeros, missing)
    nnonnzeros = 0
    X = tl.tensor_to_vec(X)
    for i in X:
        if i > 0:
            nnonnzeros += 1
    X = tl.reshape(X, sz)
    nzeros = tsz - nnonnzeros
    nmissing = 0
    if W is not None:
        W = tl.tensor_to_vec(W)
        for i in range(tl.shape(W)[0]):
            if W[i] > 0: nmissing += 1  # TODO: is this right??
        W = tl.reshape(W, sz)

    # Dictionary for storing important information regarding the decomposition problem
    info = {}
    info['tsz'] = tsz
    info[
        'nmissing'] = 0  # TODO: revisit once missing value functionality incorporated
    info['nnonnzeros'] = nnonnzeros
    info[
        'nzeros'] = nzeros  # TODO: revisit once missing value functionality incorporated

    # Set up function, gradient, and bounds
    fh, gh, lb = validate_type(type, X)
    info['type'] = type
    info['fh'] = fh
    info['gh'] = gh
    info['lb'] = lb

    # initialize CP-tensor and make a copy to work with so as to have the starting guess
    M0 = initialize_cp(X, R, init=init, random_state=state)
    wghts0 = tl.copy(M0[0])
    fcts0 = []
    for i in range(nd):
        f = tl.copy(M0[1][i])
        fcts0.append(f)
    M = CPTensor((wghts0, fcts0))

    # Lambda weights are assumed to be all ones throughout, check initial guess satisfies assumption
    if not tl.all(M[0]):
        print("Initialization of CP tensor has failed (lambda weight(s) != 1.")
        sys.exit(1)

    # check optimization method
    if validate_opt(opt):
        print("Choose optimization method from: {lbfgsb, sgd}")
        sys.exit(1)
    use_stoc = False
    if opt != 'lbfgsb':
        use_stoc = True
    info['opt'] = opt

    # set up for stochastic optimization (e.g. sgd, adam, adagrad)
    if use_stoc:
        # set up fsampler, gsampler ---> uniform sampling only for now
        # TODO : add stratified, semi-stratified and user-specified sampling options
        if not sampler == "uniform":
            print(
                "Only uniform sampling currently supported for stochastic optimization."
            )
            sys.exit(1)
        fsampler_type = sampler
        gsampler_type = sampler

        # setup fsampler
        f_samp = fsamp
        if f_samp == None:
            upper = np.maximum(math.ceil(tsz / 10), 10 ^ 6)
            f_samp = np.minimum(upper, tsz)

        # set up lambda function/function handle for uniform sampling
        fsampler = lambda: tl_sample_uniform(X, f_samp)
        fsampler_str = "{} with {} samples".format(fsampler_type, f_samp)

        # setup gsampler
        g_samp = gsamp
        if g_samp == None:
            upper = np.maximum(1000, math.ceil(10 * tsz / maxiters))
            g_samp = np.minimum(upper, tsz)

        # setup lambda function/function handle for uniform sampling
        gsampler = lambda: tl_sample_uniform(X, g_samp)
        gsampler_str = "{} with {} samples".format(gsampler_type, g_samp)

        # capture the info
        info['fsampler'] = fsampler_str
        info['gsampler'] = gsampler_str
        info['fsamp'] = f_samp
        info['gsamp'] = g_samp

    time_setup0 = time.perf_counter() - start_setup0

    # Welcome message
    if printitn > 0:
        print("GCP-OPT-{} (Generalized CP Tensor Decomposition)".format(opt))
        print("------------------------------------------------")
        print("Tensor size:\t\t\t\t{} ({} total entries)".format(sz, tsz))
        if nmissing > 0:
            print("Missing entries: {} ({})".format(nmissing,
                                                    100 * nmissing / tsz))
        print("Generalized function type:\t{}".format(type))
        print("Objective function:\t\t\t{}".format(
            inspect.getsource(fh).strip()))
        print("Gradient function:\t\t\t{}".format(
            inspect.getsource(gh).strip()))
        print("Lower bound of factors:\t\t{}".format(lb))
        print("Optimization method:\t\t{}".format(opt))
        if use_stoc:
            print("Max iterations (epochs): {}".format(maxiters))
            print("Iterations per epoch: {}".format(epciters))
            print("Learning rate / decay / maxfails: {} {} {}".format(
                rate, decay, maxfails))
            print("Function Sampler: {}".format(fsampler_str))
            print("Gradient Sampler: {}".format(gsampler_str))
        else:
            print("Max iterations:\t\t\t\t{}".format(maxiters))
            print("Projected gradient tol:\t\t{}\n".format(pgtol))

    # Make like a zombie and start decomposing
    Mfin = None
    # L-BFGS-B optimization
    if opt == 'lbfgsb':
        # Timer - Setup (inside optimization)
        start_setup1 = time.perf_counter()

        # set up bounds for l-bfgs-b if lb = 0
        bounds = None
        if lb == 0:
            lb = tl.zeros(tsz)
            ub = math.inf * tl.ones(tsz)
        fcn = lambda x: tl_gcp_fg(vec2factors(x, sz, R, X_context), X, fh, gh)
        m = factors2vec(M[1])

        # capture params for l-bfgs-b
        lbfgsb_params = {}
        lbfgsb_params['x0'] = factors2vec(M0.factors)
        lbfgsb_params['printEvery'] = printitn
        lbfgsb_params['maxIts'] = maxiters
        lbfgsb_params['maxTotalIts'] = maxiters * 10
        lbfgsb_params['factr'] = factr
        lbfgsb_params['pgtol'] = pgtol

        time_setup1 = time.perf_counter() - start_setup1

        if printitn > 0:
            print("Begin main loop")

        # Timer - Main operation
        start_main = time.perf_counter()
        x, f, info_dict = fmin_l_bfgs_b(fcn, m, approx_grad=False, bounds=None, \
                                        pgtol=pgtol, factr=factr, maxiter=maxiters)
        time_main = time.perf_counter() - start_main

        # capture info
        info['fcn'] = fcn
        info['lbfgsbopts'] = lbfgsb_params
        info['lbfgsbout'] = info_dict
        info['finalf'] = f

        if printitn > 0:
            print("\nFinal objective: {}".format(f))
            print("Setup time: {}".format(time_setup0 + time_setup1))
            print("Main loop time: {}".format(time_main))
            print("Outer iterations:"
                  )  # TODO: access this value (see manpage for fmin_l_bfgs_b)
            print("Total iterations: {}".format(info_dict['nit']))
            print("L-BFGS-B exit message: {} ({})".format(
                info_dict['task'], info_dict['warnflag']))
        Mfin = vec2factors(x, sz, R, X_context)

    # Stochastic optimization
    else:
        # Timer - Setup (inside optimization)
        start_setup1 = time.perf_counter()
        if opt == "adam" or opt == "adagrad":
            print("{} not currently supported".format(opt))
            sys.exit(1)
        # prepare for sgd
        # initialize moments
        m = []
        v = []

        # Extract samples for estimating function value (i.e. call fsampler), these never change
        fsubs, fvals, fwgts = fsampler()

        # Compute initial estimated function value
        fest = tl_gcp_fg_est(M, fh, gh, fsubs, fvals, fwgts, True, False,
                             False, False)

        # Set up loop variables
        nfails = 0
        titers = 0

        M_weights = tl.copy(M[0])
        M_factors = []
        for k in range(nd):
            M_factors.append(tl.copy(M[1][k]))
        Msave = CPTensor(
            (M_weights, M_factors))  # save a copy of the initial model
        msave = m
        vsave = v
        fest_prev = fest[0]

        # Tracing the progress in function value by epoch
        fest_trace = tl.zeros(maxiters + 1)
        step_trace = tl.zeros(maxiters + 1)
        time_trace = tl.zeros(maxiters + 1)
        fest_trace[0] = fest[0]

        # Print status
        if printitn > 0:
            print("Begin main loop")
            print("Initial f-est: {}".format(fest[0]))

        time_setup1 = time.perf_counter() - start_setup1
        start_main = time.perf_counter()
        time_trace[0] = time.perf_counter() - start_setup0

        # Main loop - outer iteration
        for nepoch in range(maxiters):
            step = (decay**nfails) * rate
            # Main loop - inner iteration
            for iter in range(epciters):
                # Tracking iterations
                titers = titers + 1

                # Select subset for stochastic gradient (i.e. call gsampler)
                gsubs, gvals, gwts = gsampler()

                # Compute gradients for each mode
                Gest = tl_gcp_fg_est(M, fh, gh, gsubs, gvals, gwts, False,
                                     True, False, False)

                # Check for inf gradient
                for g in Gest[0]:
                    g_max = tl.max(g)
                    g_min = tl.min(g)
                    if math.isinf(g_max) or math.isinf(g_min):
                        print(
                            "Infinite gradient encountered! (epoch = {}, iter = {})"
                            .format(nepoch, iter))

                # TODO : add functionality for ADAM and ADAGRAD optimization
                # Take gradient step
                for k in range(nd):
                    M.factors[k] = M.factors[k] - step * Gest[0][k]

            # Estimate objective (i.e. call tl_gcp_fg_est)
            fest = tl_gcp_fg_est(M, fh, gh, fsubs, fvals, fwgts, True, False,
                                 False, False)

            # Save trace (fest & step)
            fest_trace[nepoch + 1] = fest[0]
            step_trace[nepoch + 1] = step

            # Check convergence condition
            failed_epoch = False
            if fest[0] > fest_prev:
                failed_epoch = True
            if failed_epoch:
                nfails += 1
            festtol_test = False
            if fest[0] < festtol:
                festtol_test = True

            # Reporting
            if printitn > 0 and (nepoch % printitn == 0 or failed_epoch
                                 or festtol_test):
                print("Epoch {}: f-est = {}, step = {}".format(
                    nepoch, fest[0], step),
                      end='')
                if failed_epoch:
                    print(
                        ", nfails = {} (resetting to solution from last epoch)"
                        .format(nfails))
                print("")

            # Rectify failed epoch or save current solution
            if failed_epoch:
                M = Msave
                m = msave
                v = vsave
                fest[0] = fest_prev
                titers = titers - epciters
            else:
                Msave = CPTensor((tl.copy(M.weights), tl.copy(M.factors)))
                msave = m
                vsave = v
                fest_prev = fest[0]

            time_trace[nepoch] = time.perf_counter() - start_setup0

            if (nfails > maxfails) or festtol_test:
                break
        Mfin = M
        time_main = time.perf_counter() - start_main

        # capture info
        info['fest_trace'] = fest_trace
        info['step_trace'] = step_trace
        info['time_trace'] = time_trace
        info['nepoch'] = nepoch

        # Report end of main loop
        if printitn > 0:
            print("End Main Loop")
            print("")
            print("Final f-east: {}".format(fest[0]))
            print("Setup time: {0:0.6f}".format(time_setup0 + time_setup1))
            print("Main loop time: {0:0.6f}".format(time_main))
            print("Total iterations: {}".format(nepoch * epciters))
    # Wrap up / capture remaining info
    info['mainTime'] = time_main
    info['setupTime0'] = time_setup0
    info['setupTime1'] = time_setup1
    info['setupTime'] = time_setup0 + time_setup1

    return Mfin