def __init__(self, input_size, ranks, output_size, verbose=1, **kwargs):
    super(TRL, self).__init__(**kwargs)
    self.ranks = list(ranks)
    self.verbose = verbose

    if isinstance(input_size, int):
        self.input_size = [input_size]
    else:
        self.input_size = list(input_size)

    if isinstance(output_size, int):
        self.output_size = [output_size]
    else:
        self.output_size = list(output_size)

    self.n_outputs = int(np.prod(self.output_size[1:]))

    # Core of the regression tensor weights
    self.core = nn.Parameter(tl.zeros(self.ranks), requires_grad=True)
    self.bias = nn.Parameter(tl.zeros(1), requires_grad=True)
    weight_size = list(self.input_size[1:]) + list(self.output_size[1:])

    # Add and register the factors
    self.factors = []
    for index, (in_size, rank) in enumerate(zip(weight_size, ranks)):
        self.factors.append(nn.Parameter(tl.zeros((in_size, rank)), requires_grad=True))
        self.register_parameter('factor_{}'.format(index), self.factors[index])

    self.core.data.uniform_(-0.1, 0.1)
    for f in self.factors:
        f.data.uniform_(-0.1, 0.1)
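# A hedged sketch of a matching forward pass for this Tucker regression layer:
# the full regression weight tensor is rebuilt from the core and factors, then
# contracted with the input over its non-batch modes. `trl_forward` is a
# hypothetical helper used for illustration, not the library's exact forward
# implementation.
def trl_forward(layer, x):
    # weight tensor of shape input_size[1:] + output_size[1:]
    regression_weights = tl.tucker_to_tensor((layer.core, layer.factors))
    # contract x of shape (batch, *input_size[1:]) with the weights over the input modes
    return tl.tenalg.inner(x, regression_weights, n_modes=tl.ndim(x) - 1) + layer.bias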
def simplex_prox(tensor, parameter):
    """Projects each column of the input tensor onto the simplex of radius `parameter`.

    Parameters
    ----------
    tensor : ndarray
    parameter : float

    Returns
    -------
    ndarray

    References
    ----------
    .. [1]: Held, Michael, Philip Wolfe, and Harlan P. Crowder.
            "Validation of subgradient optimization."
            Mathematical Programming 6.1 (1974): 62-88.
    """
    _, col = tl.shape(tensor)
    tensor = tl.clip(tensor, 0, tl.max(tensor))
    tensor_sort = tl.sort(tensor, axis=0, descending=True)
    to_change = tl.sum(tl.where(tensor_sort > (tl.cumsum(tensor_sort, axis=0) - parameter),
                                1.0, 0.0), axis=0)
    difference = tl.zeros(col)
    for i in range(col):
        if to_change[i] > 0:
            difference = tl.index_update(
                difference, tl.index[i],
                tl.cumsum(tensor_sort, axis=0)[int(to_change[i] - 1), i])
    difference = (difference - parameter) / to_change
    return tl.clip(tensor - difference, a_min=0)
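# Minimal usage sketch for simplex_prox, assuming it is in scope together with
# a numpy-backed tensorly: each column is projected onto the simplex of the
# requested radius, so entries become non-negative and each column sums to it.
import numpy as np
import tensorly as tl

tl.set_backend('numpy')
matrix = tl.tensor(np.array([[0.5, 2.0],
                             [0.4, -1.0],
                             [0.3, 0.5]]))
projected = simplex_prox(matrix, 1.0)
print(projected)
print(tl.sum(projected, axis=0))  # each column is expected to sum to the radius (1.0)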
def monotonicity_prox(tensor, decreasing=False):
    """Projects each column of the input array onto the set of monotone vectors,
    i.e. x[1] <= x[2] <= ... <= x[n] (decreasing=False) or
    x[1] >= x[2] >= ... >= x[n] (decreasing=True), columnwise.

    Parameters
    ----------
    tensor : ndarray
    decreasing : bool
        If True, the returned tensor is columnwise monotone decreasing;
        otherwise it is monotone increasing.
        Default: False

    Returns
    -------
    ndarray
        A tensor whose columns are monotonic.

    References
    ----------
    .. [1]: G. Chierchia, E. Chouzenoux, P. L. Combettes, and J.-C. Pesquet,
            "The Proximity Operator Repository. User's guide".
    """
    if tl.ndim(tensor) == 1:
        tensor = tl.reshape(tensor, [tl.shape(tensor)[0], 1])
    elif tl.ndim(tensor) > 2:
        raise ValueError(
            "Monotonicity prox doesn't support an input which has more than 2 dimensions.")
    tensor_mon = tl.copy(tensor)
    if decreasing:
        tensor_mon = tl.flip(tensor_mon, axis=0)
    row, column = tl.shape(tensor_mon)
    cum_sum = tl.cumsum(tensor_mon, axis=0)
    for j in range(column):
        assisted_tensor = tl.zeros([row, row])
        for i in range(row):
            if i == 0:
                assisted_tensor = tl.index_update(
                    assisted_tensor, tl.index[i, i:],
                    cum_sum[i:, j] / tl.tensor(tl.arange(row - i) + 1, **tl.context(tensor)))
            else:
                assisted_tensor = tl.index_update(
                    assisted_tensor, tl.index[i, i:],
                    (cum_sum[i:, j] - cum_sum[i - 1, j]) /
                    tl.tensor(tl.arange(row - i) + 1, **tl.context(tensor)))
        tensor_mon = tl.index_update(tensor_mon, tl.index[:, j],
                                     tl.max(assisted_tensor, axis=0))
        for i in reversed(range(row - 1)):
            if tensor_mon[i, j] > tensor_mon[i + 1, j]:
                tensor_mon = tl.index_update(tensor_mon, tl.index[i, j],
                                             tensor_mon[i + 1, j])
    if decreasing:
        tensor_mon = tl.flip(tensor_mon, axis=0)
    return tensor_mon
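# Small sketch of monotonicity_prox (columnwise isotonic projection), assuming
# it is importable from the module above; numpy backend.
import numpy as np
import tensorly as tl

tl.set_backend('numpy')
column = tl.tensor(np.array([[3.0], [1.0], [2.0], [5.0], [4.0]]))
increasing = monotonicity_prox(column)                   # non-decreasing down each column
decreasing = monotonicity_prox(column, decreasing=True)  # non-increasing down each column
print(tl.to_numpy(increasing).ravel())  # expected: [2.  2.  2.  4.5 4.5]
print(tl.to_numpy(decreasing).ravel())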
def __init__(self, input_size, rank: int, output_size, verbose=1, **kwargs):
    super(CPRL, self).__init__(**kwargs)
    self.verbose = verbose

    if isinstance(input_size, int):
        self.input_size = [input_size]
    else:
        self.input_size = list(input_size)

    if isinstance(output_size, int):
        self.output_size = [output_size]
    else:
        self.output_size = list(output_size)

    # One rank per factor; computed after input_size has been normalized to a list
    self.rank = [rank] * len(self.input_size)

    self.n_outputs = int(np.prod(self.output_size[1:]))
    self.bias = nn.Parameter(tl.zeros(1), requires_grad=True)

    # Add and register the factors
    self.factors = []
    ranks = self.rank
    factor_size = list(self.input_size)[1:] + list(self.output_size)[1:]
    for index, (in_size, rank) in enumerate(zip(factor_size, ranks)):
        self.factors.append(nn.Parameter(tl.zeros((in_size, rank)), requires_grad=True))
        self.register_parameter(f'factor_{index}', self.factors[index])
    self.weights = nn.Parameter(tl.zeros((rank, )), requires_grad=True)

    # Initialize parameters
    for f in self.factors:
        f.data.uniform_(-.1, .1)
    self.weights.data.uniform_(-.1, 1)
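# Analogous hedged sketch for the CP regression layer: rebuild the weight
# tensor from the CP weights/factors set up in __init__ above, then contract it
# with the input. `cprl_forward` is a hypothetical helper used for illustration,
# not the library's exact forward implementation.
def cprl_forward(layer, x):
    # weight tensor of shape input_size[1:] + output_size[1:]
    regression_weights = tl.cp_to_tensor((layer.weights, layer.factors))
    return tl.tenalg.inner(x, regression_weights, n_modes=tl.ndim(x) - 1) + layer.bias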
def cov(A, B):
    """Computes the mode-1 (mode-0 in python) contraction of 2 tensors."""
    assert A.shape[0] == B.shape[0], "A and B need to have the same shape on axis 0"
    dimension_A = A.shape[1:]
    dimension_B = B.shape[1:]
    dimensions = list(dimension_A) + list(dimension_B)
    rmode_A = len(dimension_A)
    dim = A.shape[0]
    C = tl.zeros(dimensions)
    indices = []
    for mode in dimensions:
        indices.append(range(mode))
    for idx in product(*indices):
        idx_A, idx_B = list(idx[:rmode_A]), list(idx[rmode_A:])
        C[idx] = np.sum(
            [A[tuple([i] + idx_A)] * B[tuple([i] + idx_B)] for i in range(dim)])
    return C
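# Quick sanity check for cov(): when A and B are matrices sharing the first
# mode, the contraction reduces to A^T B, so it can be compared against a plain
# einsum (sketch; assumes cov() is in scope and the numpy backend is active).
import numpy as np
import tensorly as tl

tl.set_backend('numpy')
rng = np.random.RandomState(0)
A = tl.tensor(rng.rand(6, 3))
B = tl.tensor(rng.rand(6, 4))
C = cov(A, B)                                        # contraction over the shared first mode
print(C.shape)                                       # (3, 4)
print(np.allclose(C, np.einsum('ia,ib->ab', A, B)))  # expected: True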
def sparsify_tensor(tensor, card):
    """Zeros out all elements in the `tensor` except `card` elements with maximum absolute values.

    Parameters
    ----------
    tensor : ndarray
    card : int
        Desired number of non-zero elements in the `tensor`

    Returns
    -------
    ndarray of shape tensor.shape
    """
    if card >= np.prod(tensor.shape):
        return tensor
    bound = tl.sort(tl.abs(tensor), axis=None)[-card]
    return tl.where(tl.abs(tensor) < bound,
                    tl.zeros(tensor.shape, **tl.context(tensor)), tensor)
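# Minimal example of sparsify_tensor: keep only the `card` largest-magnitude
# entries and zero out the rest (assumes the function is in scope; numpy backend).
import numpy as np
import tensorly as tl

tl.set_backend('numpy')
tensor = tl.tensor(np.array([[1.0, -5.0, 2.0],
                             [0.5, 3.0, -0.1]]))
print(sparsify_tensor(tensor, card=2))
# only the two entries with the largest absolute value (-5 and 3) are expected to survive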
def test_tr_to_tensor():
    # Create ground truth TR factors
    factors = [tl.randn((2, 4, 3)), tl.randn((3, 5, 2)), tl.randn((2, 6, 2))]

    # Create tensor
    tensor = tl.zeros((4, 5, 6))
    for i in range(4):
        for j in range(5):
            for k in range(6):
                product = tl.dot(
                    tl.dot(factors[0][:, i, :], factors[1][:, j, :]),
                    factors[2][:, k, :])
                # TODO: add trace to backend instead of this
                tensor = tl.index_update(
                    tensor, tl.index[i, j, k],
                    tl.sum(product * tl.eye(product.shape[0])))

    # Check that TR factors re-assemble to the original tensor
    assert_array_almost_equal(tensor, tr_to_tensor(factors))
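# The reconstruction loop in the test above (trace of the product of TR core
# slices) can also be written as a single einsum over the cores. Sketch only,
# assuming the numpy backend and that tr_to_tensor is importable as in the test.
import numpy as np
import tensorly as tl
from tensorly.tr_tensor import tr_to_tensor

tl.set_backend('numpy')
factors = [tl.randn((2, 4, 3)), tl.randn((3, 5, 2)), tl.randn((2, 6, 2))]
full = np.einsum('aib,bjc,cka->ijk', *factors)
print(np.allclose(full, tr_to_tensor(factors)))  # expected: True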
def _fit_2d(self, X, Y): """ Compute the HOPLS for X and Y wrt the parameters R, Ln and Km for the special case mode_Y = 2. Parameters: X: tensorly Tensor, The target tensor of shape [i1, ... iN], N = 2. Y: tensorly Tensor, The target tensor of shape [j1, ... jM], M >= 3. Returns: G: Tensor, The core Tensor of the HOPLS for X, of shape (R, L2, ..., LN). P: List, The N-1 loadings of X. D: Tensor, The core Tensor of the HOPLS for Y, of shape (R, K2, ..., KN). Q: List, The N-1 loadings of Y. ts: Tensor, The latent vectors of the HOPLS, of shape (i1, R). """ # Initialization Er, Fr = X, Y P, T, W, Q = [], [], [], [] D = tl.zeros((self.R, self.R)) G = [] # Beginning of the algorithm # Gr, _ = tucker(Er, ranks=[1] + self.Ln) for r in range(self.R): if torch.norm(Er) > self.epsilon and torch.norm(Fr) > self.epsilon: # computing the covariance Cr = mode_dot(Er, Fr.t(), 0) # HOOI tucker decomposition of C Gr_C, latents = tucker(Cr, rank=[1] + self.Ln) # Getting P and Q loadings qr = latents[0] qr /= torch.norm(qr) # Pr = latents[1:] Pr = [a / torch.norm(a) for a in latents[1:]] P.append(Pr) tr = multi_mode_dot(Er, Pr, list(range(1, len(Pr) + 1)), transpose=True) # Gr_pi = torch.pinverse(matricize(Gr)) # tr = torch.mm(matricize(tr), Gr_pi) GrC_pi = torch.pinverse(matricize(Gr_C)) tr = torch.mm(matricize(tr), GrC_pi) tr /= torch.norm(tr) # recomposition of the core tensor of Y ur = torch.mm(Fr, qr) dr = torch.mm(ur.t(), tr) D[r, r] = dr Pkron = kronecker([Pr[self.N - n - 1] for n in range(self.N)]) # P.append(torch.mm(matricize(Gr), Pkron.t()).t()) # W.append(torch.mm(Pkron, Gr_pi)) Q.append(qr) T.append(tr) Gd = tl.tucker_to_tensor([Er, [tr] + Pr], transpose_factors=True) Gd_pi = torch.pinverse(matricize(Gd)) W.append(torch.mm(Pkron, Gd_pi)) # Deflation # X_hat = torch.mm(torch.cat(T, dim=1), torch.cat(P, dim=1).t()) # Er = X - np.reshape(X_hat, (Er.shape), order="F") Er = Er - tl.tucker_to_tensor([Gd, [tr] + Pr]) Fr = Fr - dr * torch.mm(tr, qr.t()) else: break Q = torch.cat(Q, dim=1) T = torch.cat(T, dim=1) # P = torch.cat(P, dim=1) W = torch.cat(W, dim=1) self.model = (P, Q, D, T, W) return self
def tensor_train_cross(input_tensor, rank, tol=1e-4, n_iter_max=100): """TT (tensor-train) decomposition via cross-approximation (TTcross) [1] Decomposes `input_tensor` into a sequence of order-3 tensors of given rank. (factors/cores) Rather than directly decompose the whole tensor, we sample fibers based on skeleton decomposition. We initialize a random tensor-train and sweep from left to right and right to left. On each core, we shape the core as a matrix and choose the fibers indices by finding maximum-volume submatrix and update the core. Advantage: faster The main advantage of TTcross is that it doesn't need to evaluate all the entries of the tensor. For a tensor_shape^tensor_order tensor, SVD needs O(tensor_shape^tensor_order) runtime, but TTcross' runtime is linear in tensor_shape and tensor_order, which makes it feasible in high dimension. Disadvantage: less accurate TTcross may underestimate the error, since it only evaluates partial entries of the tensor. Besides, in contrast to its practical fast performance, there is no theoretical guarantee of it convergence. Parameters ---------- input_tensor : tensorly.tensor The tensor to decompose. rank : {int, int list} maximum allowable TT rank of the factors if int, then this is the same for all the factors if int list, then rank[k] is the rank of the kth factor tol : float accuracy threshold for outer while-loop n_iter_max : int maximum iterations of outer while-loop (the 'crosses' or 'sweeps' sampled) Returns ------- factors : TT factors order-3 tensors of the TT decomposition Examples -------- Generate a 5^3 tensor, and decompose it into tensor-train of 3 factors, with rank = [1,3,3,1] >>> tensor = tl.tensor(np.arange(5**3).reshape(5,5,5)) >>> rank = [1, 3, 3, 1] >>> factors = tensor_train_cross(tensor, rank) print the first core: >>> print(factors[0]) .[[[ 24. 0. 4.] [ 49. 25. 29.] [ 74. 50. 54.] [ 99. 75. 79.] [124. 100. 104.]]] Notes ----- Pseudo-code [2]: 1. Initialization tensor_order cores and column indices 2. while (error > tol) 3. update the tensor-train from left to right: for Core 1 to Core tensor_order approximate the skeleton-decomposition by QR and maxvol 4. update the tensor-train from right to left: for Core tensor_order to Core 1 approximate the skeleton-decomposition by QR and maxvol 5. end while Acknowledgement: the main body of the code is modified based on TensorToolbox by Daniele Bigoni References ---------- .. [1] Ivan Oseledets and Eugene Tyrtyshnikov. Tt-cross approximation for multidimensional arrays. LinearAlgebra and its Applications, 432(1):70–88, 2010. .. [2] Sergey Dolgov and Robert Scheichl. A hybrid alternating least squares–tt cross algorithm for parametricpdes. arXiv preprint arXiv:1707.04562, 2017. """ # Check user input for errors tensor_shape = tl.shape(input_tensor) tensor_order = tl.ndim(input_tensor) if isinstance(rank, int): rank = [rank] * (tensor_order + 1) elif tensor_order + 1 != len(rank): message = 'Provided incorrect number of ranks. 
Should verify len(rank) == tl.ndim(tensor)+1, but len(rank) = {} while tl.ndim(tensor) + 1 = {}'.format( len(rank), tensor_order) raise (ValueError(message)) # Make sure iter's not a tuple but a list rank = list(rank) # Initialize rank if rank[0] != 1: print( 'Provided rank[0] == {} but boundary conditions dictate rank[0] == rank[-1] == 1: setting rank[0] to 1.'.format( rank[0])) rank[0] = 1 if rank[-1] != 1: print( 'Provided rank[-1] == {} but boundary conditions dictate rank[0] == rank[-1] == 1: setting rank[-1] to 1.'.format( rank[0])) # list col_idx: column indices (right indices) for skeleton-decomposition: indicate which columns used in each core. # list row_idx: row indices (left indices) for skeleton-decomposition: indicate which rows used in each core. # Initialize indice: random selection of column indices random_seed = None rng = check_random_state(random_seed) col_idx = [None] * tensor_order for k_col_idx in range(tensor_order - 1): col_idx[k_col_idx] = [] for i in range(rank[k_col_idx + 1]): newidx = tuple([rng.randint(tensor_shape[j]) for j in range(k_col_idx + 1, tensor_order)]) while newidx in col_idx[k_col_idx]: newidx = tuple([rng.randint(tensor_shape[j]) for j in range(k_col_idx + 1, tensor_order)]) col_idx[k_col_idx].append(newidx) # Initialize the cores of tensor-train factor_old = [tl.zeros((rank[k], tensor_shape[k], rank[k + 1]), **tl.context(input_tensor)) for k in range(tensor_order)] factor_new = [tl.tensor(rng.random_sample((rank[k], tensor_shape[k], rank[k + 1])), **tl.context(input_tensor)) for k in range(tensor_order)] iter = 0 error = tl.norm(tt_to_tensor(factor_old) - tt_to_tensor(factor_new), 2) threshold = tol * tl.norm(tt_to_tensor(factor_new), 2) for iter in range(n_iter_max): if error < threshold: break factor_old = factor_new factor_new = [None for i in range(tensor_order)] ###################################### # left-to-right step left_to_right_fiberlist = [] # list row_idx: list of (tensor_order-1) of lists of left indices row_idx = [[()]] for k in range(tensor_order - 1): (next_row_idx, fibers_list) = left_right_ttcross_step(input_tensor, k, rank, row_idx, col_idx) # update row indices left_to_right_fiberlist.extend(fibers_list) row_idx.append(next_row_idx) # end left-to-right step ############################################### ############################################### # right-to-left step right_to_left_fiberlist = [] # list col_idx: list (tensor_order-1) of lists of right indices col_idx = [None] * tensor_order col_idx[-1] = [()] for k in range(tensor_order, 1, -1): (next_col_idx, fibers_list, Q_skeleton) = right_left_ttcross_step(input_tensor, k, rank, row_idx, col_idx) # update col indices right_to_left_fiberlist.extend(fibers_list) col_idx[k - 2] = next_col_idx # Compute cores try: factor_new[k - 1] = tl.transpose(Q_skeleton) factor_new[k - 1] = tl.reshape(factor_new[k - 1], (rank[k - 1], tensor_shape[k - 1], rank[k])) except: # The rank should not be larger than the input tensor's size raise (ValueError("The rank is too large compared to the size of the tensor. 
Try with small rank.")) # Add the last core idx = (slice(None, None, None),) + tuple(zip(*col_idx[0])) core = input_tensor[idx] core = tl.reshape(core, (tensor_shape[0], 1, rank[1])) core = tl.transpose(core, (1, 0, 2)) factor_new[0] = core # end right-to-left step ################################################ # check the error for while-loop error = tl.norm(tt_to_tensor(factor_old) - tt_to_tensor(factor_new), 2) threshold = tol * tl.norm(tt_to_tensor(factor_new), 2) print("It: " + str(iter) + "; error: " + str(error) + "; threshold: " + str(threshold)) # check convergence if iter >= n_iter_max: print('Maximum number of iterations reached.') if tl.norm(tt_to_tensor(factor_old) - tt_to_tensor(factor_new), 2) > tol * tl.norm(tt_to_tensor(factor_new), 2): print('Low Rank Approximation algorithm did not converge.') return factor_new
def maxvol(A):
    """Find the rxr submatrix of maximal volume in A (nxr), n >= r.

    We want to decompose matrix A as
        A = A[:,J] * (A[I,J])^-1 * A[I,:]
    This algorithm helps us find this submatrix A[I,J] from A, which has the largest
    determinant. We greedily find the row of max norm, and subtract its projection
    from the rest of the rows.

    Parameters
    ----------
    A : matrix
        The matrix in which to find the maximal-volume submatrix

    Returns
    -------
    row_idx : list of int
        the list of rows of A forming the matrix with maximal volume
    A_inv : matrix
        the inverse of the matrix with maximal volume

    References
    ----------
    S. A. Goreinov, I. V. Oseledets, D. V. Savostyanov, E. E. Tyrtyshnikov, N. L. Zamarashkin.
    How to find a good submatrix. Matrix Methods: Theory, Algorithms and Applications:
    Dedicated to the Memory of Gene Golub. 2010. 247-256.

    Ali Çivril, Malik Magdon-Ismail.
    On selecting a maximum volume sub-matrix of a matrix and related problems.
    Theoretical Computer Science. Volume 410, Issues 47-49, 6 November 2009, Pages 4801-4811.
    """
    (n, r) = tl.shape(A)

    # The indices of the rows of the submatrix
    row_idx = tl.zeros(r)

    # Rest of rows / unselected rows
    rest_of_rows = tl.tensor(list(range(n)), dtype=tl.int64)

    # Find r rows iteratively
    i = 0
    A_new = A
    while i < r:
        mask = list(range(tl.shape(A_new)[0]))
        # Compute the squared norm of each row
        rows_norms = tl.sum(A_new ** 2, axis=1)

        # If there is only one row of A left, let's just return it. MxNet is not robust about this case.
        if tl.shape(rows_norms) == ():
            row_idx[i] = rest_of_rows
            break

        # If a row is 0, we delete it.
        if any(rows_norms == 0):
            zero_idx = tl.argmin(rows_norms, axis=0)
            mask.pop(zero_idx)
            rest_of_rows = rest_of_rows[mask]
            A_new = A_new[mask, :]
            continue

        # Find the row of max norm
        max_row_idx = tl.argmax(rows_norms, axis=0)
        max_row = A[rest_of_rows[max_row_idx], :]

        # Compute the projection of max_row onto the other rows
        # the projection of a onto b is computed as: <a,b> / sqrt(|a|*|b|)
        projection = tl.dot(A_new, tl.transpose(max_row))
        normalization = tl.sqrt(rows_norms[max_row_idx] * rows_norms)
        # make sure the normalization vector has the same shape as projection (causes bugs with MxNet)
        normalization = tl.reshape(normalization, tl.shape(projection))
        projection = projection / normalization

        # Subtract the projection from A_new: b <- b - a * projection
        A_new = A_new - A_new * tl.reshape(projection, (tl.shape(A_new)[0], 1))

        # Delete the selected row
        mask.pop(max_row_idx)
        A_new = A_new[mask, :]

        # Update row_idx and rest_of_rows
        row_idx[i] = rest_of_rows[max_row_idx]
        rest_of_rows = rest_of_rows[mask]
        i = i + 1

    row_idx = tl.tensor(row_idx, dtype=tl.int64)
    inverse = tl.solve(A[row_idx, :],
                       tl.eye(tl.shape(A[row_idx, :])[0], **tl.context(A)))
    row_idx = tl.to_numpy(row_idx)

    return row_idx, inverse
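# Short sketch showing maxvol in isolation: the greedily selected rows should
# form a submatrix whose |determinant| is (typically much) larger than that of
# a random selection (assumes maxvol is in scope; numpy backend).
import numpy as np
import tensorly as tl

tl.set_backend('numpy')
rng = np.random.RandomState(0)
A = tl.tensor(rng.rand(100, 5))
row_idx, inverse = maxvol(A)
vol_selected = abs(np.linalg.det(tl.to_numpy(A)[row_idx, :]))
vol_random = abs(np.linalg.det(tl.to_numpy(A)[rng.choice(100, 5, replace=False), :]))
print(row_idx)
print(vol_selected >= vol_random)  # expected: True for the vast majority of draws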
def constrained_parafac(tensor, rank, n_iter_max=100, n_iter_max_inner=10, init='svd', svd='numpy_svd', tol_outer=1e-8, tol_inner=1e-6, random_state=None, verbose=0, return_errors=False, cvg_criterion='abs_rec_error', fixed_modes=None, non_negative=None, l1_reg=None, l2_reg=None, l2_square_reg=None, unimodality=None, normalize=None, simplex=None, normalized_sparsity=None, soft_sparsity=None, smoothness=None, monotonicity=None, hard_sparsity=None): """CANDECOMP/PARAFAC decomposition via alternating optimization of alternating direction method of multipliers (AO-ADMM): Computes a rank-`rank` decomposition of `tensor` [1]_ such that:: tensor = [|weights; factors[0], ..., factors[-1] |], where factors are either penalized or constrained according to the user-defined constraint. In order to compute the factors efficiently, the ADMM algorithm introduces an auxilliary factor which is called factor_aux in the function. Parameters ---------- tensor : ndarray rank : int Number of components. n_iter_max : int Maximum number of iteration for outer loop n_iter_max_inner : int Number of iteration for inner loop init : {'svd', 'random', cptensor}, optional Type of factor matrix initialization. See `initialize_factors`. svd : str, default is 'numpy_svd' function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS tol_outer : float, optional (Default: 1e-8) Relative reconstruction error tolerance for outer loop. The algorithm is considered to have found a local minimum when the reconstruction error is less than `tol_outer`. tol_inner : float, optional (Default: 1e-6) Absolute reconstruction error tolerance for factor update during inner loop, i.e. ADMM optimization. random_state : {None, int, np.random.RandomState} verbose : int, optional Level of verbosity return_errors : bool, optional Activate return of iteration errors non_negative : bool or dictionary This constraint is clipping negative values to '0'. If it is True non-negative constraint is applied to all modes. l1_reg : float or list or dictionary, optional l2_reg : float or list or dictionary, optional l2_square_reg : float or list or dictionary, optional unimodality : bool or dictionary, optional If it is True unimodality constraint is applied to all modes. normalize : bool or dictionary, optional This constraint divides all the values by maximum value of the input array. If it is True normalize constraint is applied to all modes. simplex : float or list or dictionary, optional normalized_sparsity : float or list or dictionary, optional soft_sparsity : float or list or dictionary, optional smoothness : float or list or dictionary, optional monotonicity : bool or dictionary, optional hard_sparsity : float or list or dictionary, optional cvg_criterion : {'abs_rec_error', 'rec_error'}, optional Stopping criterion if `tol` is not None. If 'rec_error', algorithm stops at current iteration if ``(previous rec_error - current rec_error) < tol``. If 'abs_rec_error', algorithm terminates when `|previous rec_error - current rec_error| < tol`. fixed_modes : list, default is None A list of modes for which the initial value is not modified. The last mode cannot be fixed due to error computation. Returns ------- CPTensor : (weight, factors) * weights : 1D array of shape (rank, ) * factors : List of factors of the CP decomposition element `i` is of shape ``(tensor.shape[i], rank)`` errors : list A list of reconstruction errors at each iteration of the algorithms. References ---------- .. 
[1] T.G.Kolda and B.W.Bader, "Tensor Decompositions and Applications", SIAM REVIEW, vol. 51, n. 3, pp. 455-500, 2009. .. [2] Huang, Kejun, Nicholas D. Sidiropoulos, and Athanasios P. Liavas. "A flexible and efficient algorithmic framework for constrained matrix and tensor factorization." IEEE Transactions on Signal Processing 64.19 (2016): 5052-5065. """ rank = validate_cp_rank(tl.shape(tensor), rank=rank) _, _ = validate_constraints(non_negative=non_negative, l1_reg=l1_reg, l2_reg=l2_reg, l2_square_reg=l2_square_reg, unimodality=unimodality, normalize=normalize, simplex=simplex, normalized_sparsity=normalized_sparsity, soft_sparsity=soft_sparsity, smoothness=smoothness, monotonicity=monotonicity, hard_sparsity=hard_sparsity, n_const=tl.ndim(tensor)) weights, factors = initialize_constrained_parafac(tensor, rank, init=init, svd=svd, random_state=random_state, non_negative=non_negative, l1_reg=l1_reg, l2_reg=l2_reg, l2_square_reg=l2_square_reg, unimodality=unimodality, normalize=normalize, simplex=simplex, normalized_sparsity=normalized_sparsity, soft_sparsity=soft_sparsity, smoothness=smoothness, monotonicity=monotonicity, hard_sparsity=hard_sparsity) rec_errors = [] norm_tensor = tl.norm(tensor, 2) if fixed_modes is None: fixed_modes = [] if tl.ndim(tensor) - 1 in fixed_modes: warnings.warn('You asked for fixing the last mode, which is not supported.\n ' 'The last mode will not be fixed. Consider using tl.moveaxis()') fixed_modes.remove(tl.ndim(tensor) - 1) modes_list = [mode for mode in range(tl.ndim(tensor)) if mode not in fixed_modes] # ADMM inits dual_variables = [] factors_aux = [] for i in range(len(factors)): dual_variables.append(tl.zeros(tl.shape(factors[i]))) factors_aux.append(tl.transpose(tl.zeros(tl.shape(factors[i])))) for iteration in range(n_iter_max): if verbose > 1: print("Starting iteration", iteration + 1) for mode in modes_list: if verbose > 1: print("Mode", mode, "of", tl.ndim(tensor)) pseudo_inverse = tl.tensor(np.ones((rank, rank)), **tl.context(tensor)) for i, factor in enumerate(factors): if i != mode: pseudo_inverse = pseudo_inverse * tl.dot(tl.transpose(factor), factor) mttkrp = unfolding_dot_khatri_rao(tensor, (None, factors), mode) factors[mode], factors_aux[mode], dual_variables[mode] = admm(mttkrp, pseudo_inverse, factors[mode], dual_variables[mode], n_iter_max=n_iter_max_inner, n_const=tl.ndim(tensor), order=mode, non_negative=non_negative, l1_reg=l1_reg, l2_reg=l2_reg, l2_square_reg=l2_square_reg, unimodality=unimodality, normalize=normalize, simplex=simplex, normalized_sparsity=normalized_sparsity, soft_sparsity=soft_sparsity, smoothness=smoothness, monotonicity=monotonicity, hard_sparsity=hard_sparsity, tol=tol_inner) factors_norm = cp_norm((weights, factors)) iprod = tl.sum(tl.sum(mttkrp * factors[-1], axis=0) * weights) rec_error = tl.sqrt(tl.abs(norm_tensor ** 2 + factors_norm ** 2 - 2 * iprod)) / norm_tensor rec_errors.append(rec_error) constraint_error = 0 for mode in modes_list: constraint_error += tl.norm(factors[mode] - tl.transpose(factors_aux[mode])) / tl.norm(factors[mode]) if tol_outer: if iteration >= 1: rec_error_decrease = rec_errors[-2] - rec_errors[-1] if verbose: print("iteration {}, reconstruction error: {}, decrease = {}".format(iteration, rec_error, rec_error_decrease)) if constraint_error < tol_outer: break if cvg_criterion == 'abs_rec_error': stop_flag = abs(rec_error_decrease) < tol_outer elif cvg_criterion == 'rec_error': stop_flag = rec_error_decrease < tol_outer else: raise TypeError("Unknown convergence criterion") if 
stop_flag: if verbose: print("PARAFAC converged after {} iterations".format(iteration)) break else: if verbose: print('reconstruction error={}'.format(rec_errors[-1])) cp_tensor = CPTensor((weights, factors)) if return_errors: return cp_tensor, rec_errors else: return cp_tensor
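# Short usage sketch for constrained_parafac with a non-negativity constraint on
# all modes (assumes the function is in scope; numpy backend).
import numpy as np
import tensorly as tl

tl.set_backend('numpy')
rng = np.random.RandomState(0)
tensor = tl.tensor(np.abs(rng.rand(6, 5, 4)))
cp_tensor, errors = constrained_parafac(tensor, rank=3, non_negative=True,
                                         return_errors=True)
weights, factors = cp_tensor
print([tl.shape(f) for f in factors])          # expected: [(6, 3), (5, 3), (4, 3)]
print(min(float(tl.min(f)) for f in factors))  # expected to be (approximately) non-negative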
def fista(UtM, UtU, x=None, n_iter_max=100, non_negative=True, sparsity_coef=0,
          lr=None, tol=10e-8):
    """Fast Iterative Shrinkage Thresholding Algorithm (FISTA)

    Computes an approximate (nonnegative) solution for the Ux=M linear system.

    Parameters
    ----------
    UtM : ndarray
        Pre-computed product of the transposed of U and M
    UtU : ndarray
        Pre-computed product of the transposed of U and U
    x : init
        Default: None
    n_iter_max : int
        Maximum number of iterations
        Default: 100
    non_negative : bool, default is True
        if True, the result will be non-negative
    lr : float
        learning rate
        Default : None
    sparsity_coef : float or None
    tol : float
        stopping criterion

    Returns
    -------
    x : approximate solution such that Ux = M

    Notes
    -----
    We solve the following problem :math: `1/2 ||m - Ux ||_2^2 + \\lambda |x|_1`

    Reference
    ----------
    [1] : Beck, A., & Teboulle, M. (2009). A fast iterative
          shrinkage-thresholding algorithm for linear inverse problems.
          SIAM journal on imaging sciences, 2(1), 183-202.
    """
    if sparsity_coef is None:
        sparsity_coef = 0

    if x is None:
        x = tl.zeros(tl.shape(UtM), **tl.context(UtM))
    if lr is None:
        lr = 1 / (tl.partial_svd(UtU)[1][0])
    # Parameters
    momentum_old = tl.tensor(1.0)
    norm_0 = 0.0
    x_update = tl.copy(x)

    for iteration in range(n_iter_max):
        if isinstance(UtU, list):
            x_gradient = -UtM + tl.tenalg.multi_mode_dot(
                x_update, UtU, transpose=False) + sparsity_coef
        else:
            x_gradient = -UtM + tl.dot(UtU, x_update) + sparsity_coef

        if non_negative is True:
            x_gradient = tl.where(lr * x_gradient < x_update, x_gradient, x_update / lr)

        x_new = x_update - lr * x_gradient
        momentum = (1 + tl.sqrt(1 + 4 * momentum_old ** 2)) / 2
        x_update = x_new + ((momentum_old - 1) / momentum) * (x_new - x)
        momentum_old = momentum
        x = tl.copy(x_new)
        norm = tl.norm(lr * x_gradient)
        if iteration == 1:
            norm_0 = norm
        if norm < tol * norm_0:
            break
    return x
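# Minimal FISTA usage sketch: solve a small non-negative least-squares problem
# Ux = m by passing the precomputed U^T m and U^T U (assumes fista is in scope;
# numpy backend).
import numpy as np
import tensorly as tl

tl.set_backend('numpy')
rng = np.random.RandomState(0)
U = tl.tensor(rng.rand(20, 5))
x_true = tl.tensor(np.abs(rng.rand(5, 1)))
m = tl.dot(U, x_true)
x_est = fista(tl.dot(tl.transpose(U), m), tl.dot(tl.transpose(U), U),
              n_iter_max=2000, non_negative=True)
print(tl.norm(tl.dot(U, x_est) - m))  # residual, expected to be small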
def active_set_nnls(Utm, UtU, x=None, n_iter_max=100, tol=10e-8): """ Active set algorithm for non-negative least square solution. Computes an approximate non-negative solution for Ux=m linear system. Parameters ---------- Utm : vectorized ndarray Pre-computed product of the transposed of U and m UtU : ndarray Pre-computed Kronecker product of the transposed of U and U x : init Default: None n_iter_max : int Maximum number of iteration Default: 100 tol : float Early stopping criterion Returns ------- x : ndarray Notes ----- This function solves following problem: .. math:: \\begin{equation} \\min_{x} ||Ux - m||^2 \\end{equation} According to [1], non-negativity-constrained least square estimation problem becomes: .. math:: \\begin{equation} x' = (Utm) - (UTU)\\times x \\end{equation} Reference ---------- [1] : Bro, R., & De Jong, S. (1997). A fast non‐negativity‐constrained least squares algorithm. Journal of Chemometrics: A Journal of the Chemometrics Society, 11(5), 393-401. """ if tl.get_backend() == 'tensorflow': raise ValueError( "Active set is not supported with the tensorflow backend. Consider using fista method with tensorflow." ) if x is None: x_vec = tl.zeros(tl.shape(UtU)[1], **tl.context(UtU)) else: x_vec = tl.base.tensor_to_vec(x) x_gradient = Utm - tl.dot(UtU, x_vec) passive_set = x_vec > 0 active_set = x_vec <= 0 support_vec = tl.zeros(tl.shape(x_vec), **tl.context(x_vec)) for iteration in range(n_iter_max): if iteration > 0 or tl.all(x_vec == 0): indice = tl.argmax(x_gradient) passive_set = tl.index_update(passive_set, tl.index[indice], True) active_set = tl.index_update(active_set, tl.index[indice], False) # To avoid singularity error when initial x exists try: passive_solution = tl.solve(UtU[passive_set, :][:, passive_set], Utm[passive_set]) indice_list = [] for i in range(tl.shape(support_vec)[0]): if passive_set[i]: indice_list.append(i) support_vec = tl.index_update( support_vec, tl.index[int(i)], passive_solution[len(indice_list) - 1]) else: support_vec = tl.index_update(support_vec, tl.index[int(i)], 0) # Start from zeros if solve is not achieved except: x_vec = tl.zeros(tl.shape(UtU)[1]) support_vec = tl.zeros(tl.shape(x_vec), **tl.context(x_vec)) passive_set = x_vec > 0 active_set = x_vec <= 0 if tl.any(active_set): indice = tl.argmax(x_gradient) passive_set = tl.index_update(passive_set, tl.index[indice], True) active_set = tl.index_update(active_set, tl.index[indice], False) passive_solution = tl.solve(UtU[passive_set, :][:, passive_set], Utm[passive_set]) indice_list = [] for i in range(tl.shape(support_vec)[0]): if passive_set[i]: indice_list.append(i) support_vec = tl.index_update( support_vec, tl.index[int(i)], passive_solution[len(indice_list) - 1]) else: support_vec = tl.index_update(support_vec, tl.index[int(i)], 0) # update support vector if it is necessary if tl.min(support_vec[passive_set]) <= 0: for i in range(len(passive_set)): alpha = tl.min( x_vec[passive_set][support_vec[passive_set] <= 0] / (x_vec[passive_set][support_vec[passive_set] <= 0] - support_vec[passive_set][support_vec[passive_set] <= 0])) update = alpha * (support_vec - x_vec) x_vec = x_vec + update passive_set = x_vec > 0 active_set = x_vec <= 0 passive_solution = tl.solve( UtU[passive_set, :][:, passive_set], Utm[passive_set]) indice_list = [] for i in range(tl.shape(support_vec)[0]): if passive_set[i]: indice_list.append(i) support_vec = tl.index_update( support_vec, tl.index[int(i)], passive_solution[len(indice_list) - 1]) else: support_vec = tl.index_update(support_vec, 
tl.index[int(i)], 0) if tl.any(passive_set) != True or tl.min( support_vec[passive_set]) > 0: break # set x to s x_vec = tl.clip(support_vec, 0, tl.max(support_vec)) # gradient update x_gradient = Utm - tl.dot(UtU, x_vec) if tl.any(active_set) != True or tl.max(x_gradient[active_set]) <= tol: break return x_vec
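# Sketch of active_set_nnls on the same kind of non-negative least-squares
# problem: it expects the vectorized U^T m and the matrix U^T U (assumes the
# function is in scope; numpy backend, since tensorflow is explicitly rejected).
import numpy as np
import tensorly as tl

tl.set_backend('numpy')
rng = np.random.RandomState(0)
U = tl.tensor(rng.rand(20, 5))
x_true = tl.tensor(np.abs(rng.rand(5)))
m = tl.dot(U, x_true)
x_est = active_set_nnls(tl.dot(tl.transpose(U), m), tl.dot(tl.transpose(U), U))
print(tl.norm(tl.dot(U, x_est) - m))  # residual, expected to be near zero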
def parafac2( tensor_slices, rank, n_iter_max=2000, init='random', svd='numpy_svd', normalize_factors=False, tol=1e-8, absolute_tol=1e-13, nn_modes=None, random_state=None, verbose=False, return_errors=False, n_iter_parafac=5, ): r"""PARAFAC2 decomposition [1]_ of a third order tensor via alternating least squares (ALS) Computes a rank-`rank` PARAFAC2 decomposition of the third-order tensor defined by `tensor_slices`. The decomposition is on the form :math:`(A [B_i] C)` such that the i-th frontal slice, :math:`X_i`, of :math:`X` is given by .. math:: X_i = B_i diag(a_i) C^T, where :math:`diag(a_i)` is the diagonal matrix whose nonzero entries are equal to the :math:`i`-th row of the :math:`I \times R` factor matrix :math:`A`, :math:`B_i` is a :math:`J_i \times R` factor matrix such that the cross product matrix :math:`B_{i_1}^T B_{i_1}` is constant for all :math:`i`, and :math:`C` is a :math:`K \times R` factor matrix. To compute this decomposition, we reformulate the expression for :math:`B_i` such that .. math:: B_i = P_i B, where :math:`P_i` is a :math:`J_i \times R` orthogonal matrix and :math:`B` is a :math:`R \times R` matrix. An alternative formulation of the PARAFAC2 decomposition is that the tensor element :math:`X_{ijk}` is given by .. math:: X_{ijk} = \sum_{r=1}^R A_{ir} B_{ijr} C_{kr}, with the same constraints hold for :math:`B_i` as above. Parameters ---------- tensor_slices : ndarray or list of ndarrays Either a third order tensor or a list of second order tensors that may have different number of rows. Note that the second mode factor matrices are allowed to change over the first mode, not the third mode as some other implementations use (see note below). rank : int Number of components. n_iter_max : int, optional (Default: 2000) Maximum number of iteration .. versionchanged:: 0.6.1 Previously, the default maximum number of iterations was 100. init : {'svd', 'random', CPTensor, Parafac2Tensor} Type of factor matrix initialization. See `initialize_factors`. svd : str, default is 'numpy_svd' function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS normalize_factors : bool (optional) If True, aggregate the weights of each factor in a 1D-tensor of shape (rank, ), which will contain the norms of the factors. Note that there may be some inaccuracies in the component weights. tol : float, optional (Default: 1e-8) Relative reconstruction error decrease tolerance. The algorithm is considered to have converged when :math:`\left|\| X - \hat{X}_{n-1} \|^2 - \| X - \hat{X}_{n} \|^2\right| < \epsilon \| X - \hat{X}_{n-1} \|^2`. That is, when the relative change in sum of squared error is less than the tolerance. .. versionchanged:: 0.6.1 Previously, the stopping condition was :math:`\left|\| X - \hat{X}_{n-1} \| - \| X - \hat{X}_{n} \|\right| < \epsilon`. absolute_tol : float, optional (Default: 1e-13) Absolute reconstruction error tolearnce. The algorithm is considered to have converged when :math:`\left|\| X - \hat{X}_{n-1} \|^2 - \| X - \hat{X}_{n} \|^2\right| < \epsilon_\text{abs}`. That is, when the relative sum of squared error is less than the specified tolerance. The absolute tolerance is necessary for stopping the algorithm when used on noise-free data that follows the PARAFAC2 constraint. If None, then the machine precision + 1000 will be used. nn_modes: None, 'all' or array of integers (Default: None) Used to specify which modes to impose non-negativity constraints on. 
We cannot impose non-negativity constraints on the the B-mode (mode 1) with the ALS algorithm, so if this mode is among the constrained modes, then a warning will be shown (see notes for more info). random_state : {None, int, np.random.RandomState} verbose : int, optional Level of verbosity return_errors : bool, optional Activate return of iteration errors n_iter_parafac : int, optional Number of PARAFAC iterations to perform for each PARAFAC2 iteration Returns ------- Parafac2Tensor : (weight, factors, projection_matrices) * weights : 1D array of shape (rank, ) all ones if normalize_factors is False (default), weights of the (normalized) factors otherwise * factors : List of factors of the CP decomposition element `i` is of shape (tensor.shape[i], rank) * projection_matrices : List of projection matrices used to create evolving factors. errors : list A list of reconstruction errors at each iteration of the algorithms. References ---------- .. [1] Kiers, H.A.L., ten Berge, J.M.F. and Bro, R. (1999), PARAFAC2—Part I. A direct fitting algorithm for the PARAFAC2 model. J. Chemometrics, 13: 275-294. Notes ----- This formulation of the PARAFAC2 decomposition is slightly different from the one in [1]_. The difference lies in that here, the second mode changes over the first mode, whereas in [1]_, the second mode changes over the third mode. We made this change since that means that the function accept both lists of matrices and a single nd-array as input without any reordering of the modes. Because of the reformulation above, :math:`B_i = P_i B`, the :math:`B_i` matrices cannot be constrained to be non-negative with ALS. If this mode is constrained to be non-negative, then :math:`B` will be non-negative, but not the orthogonal `P_i` matrices. Consequently, the `B_i` matrices are unlikely to be non-negative. """ weights, factors, projections = initialize_decomposition( tensor_slices, rank, init=init, svd=svd, random_state=random_state) rec_errors = [] norm_tensor = tl.sqrt( sum(tl.norm(tensor_slice, 2)**2 for tensor_slice in tensor_slices)) svd_fun = _get_svd(svd) if absolute_tol is None: absolute_tol = tl.eps(factors[0].dtype) * 1000 # If nn_modes is set, we use HALS, otherwise, we use the standard parafac implementation. if nn_modes is None: def parafac_updates(X, w, f): return parafac(X, rank, n_iter_max=n_iter_parafac, init=(w, f), svd=svd, orthogonalise=False, verbose=verbose, return_errors=False, normalize_factors=False, mask=None, random_state=random_state, tol=1e-100)[1] else: if nn_modes == 'all' or 1 in nn_modes: warn( "Mode `1` of PARAFAC2 fitted with ALS cannot be constrained to be truly non-negative. See the documentation for more info." 
) def parafac_updates(X, w, f): return non_negative_parafac_hals(X, rank, n_iter_max=n_iter_parafac, init=(w, f), svd=svd, nn_modes=nn_modes, verbose=verbose, return_errors=False, tol=1e-100)[1] projected_tensor = tl.zeros([factor.shape[0] for factor in factors], **T.context(factors[0])) for iteration in range(n_iter_max): if verbose: print("Starting iteration", iteration) factors[1] = factors[1] * T.reshape(weights, (1, -1)) weights = T.ones(weights.shape, **tl.context(tensor_slices[0])) projections = _compute_projections(tensor_slices, factors, svd_fun, out=projections) projected_tensor = _project_tensor_slices(tensor_slices, projections, out=projected_tensor) factors = parafac_updates(projected_tensor, weights, factors) if normalize_factors: new_factors = [] for factor in factors: norms = T.norm(factor, axis=0) norms = tl.where( tl.abs(norms) <= tl.eps(factor.dtype), tl.ones(tl.shape(norms), **tl.context(factors[0])), norms) weights = weights * norms new_factors.append(factor / (tl.reshape(norms, (1, -1)))) factors = new_factors if tol: rec_error = _parafac2_reconstruction_error( tensor_slices, (weights, factors, projections)) rec_error /= norm_tensor rec_errors.append(rec_error) if iteration >= 1: if verbose: print('PARAFAC2 reconstruction error={}, variation={}.'. format(rec_errors[-1], rec_errors[-2] - rec_errors[-1])) if abs(rec_errors[-2]**2 - rec_errors[-1]**2) < ( tol * rec_errors[-2]**2) or rec_errors[-1]**2 < absolute_tol: if verbose: print('converged in {} iterations.'.format(iteration)) break else: if verbose: print('PARAFAC2 reconstruction error={}'.format( rec_errors[-1])) parafac2_tensor = Parafac2Tensor((weights, factors, projections)) if return_errors: return parafac2_tensor, rec_errors else: return parafac2_tensor
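# Usage sketch for parafac2 on a list of matrices with different numbers of rows
# (assumes the function is in scope; numpy backend).
import numpy as np
import tensorly as tl

tl.set_backend('numpy')
rng = np.random.RandomState(0)
# three slices X_i of shape (J_i, K) with varying J_i
tensor_slices = [tl.tensor(rng.rand(rows, 8)) for rows in (10, 12, 15)]
parafac2_tensor, errors = parafac2(tensor_slices, rank=3, n_iter_max=200,
                                   random_state=0, return_errors=True)
weights, factors, projections = parafac2_tensor[0], parafac2_tensor[1], parafac2_tensor[2]
print([tl.shape(f) for f in factors])      # expected: [(3, 3), (3, 3), (8, 3)]
print([tl.shape(p) for p in projections])  # expected: [(10, 3), (12, 3), (15, 3)]
print(errors[-1])                          # final relative reconstruction error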
def test_clips_all_negative_tensor_correctly():
    # Regression test for bug found with the pytorch backend
    negative_valued_tensor = tl.zeros((10, 10)) - 0.1
    clipped_tensor = tl.clip(negative_valued_tensor, 0)
    assert tl.all(clipped_tensor == 0)
def parafac2(tensor_slices, rank, n_iter_max=100, init='random', svd='numpy_svd', normalize_factors=False, tol=1e-8, random_state=None, verbose=False, return_errors=False, n_iter_parafac=5): r"""PARAFAC2 decomposition [1]_ of a third order tensor via alternating least squares (ALS) Computes a rank-`rank` PARAFAC2 decomposition of the third-order tensor defined by `tensor_slices`. The decomposition is on the form :math:`(A [B_i] C)` such that the i-th frontal slice, :math:`X_i`, of :math:`X` is given by .. math:: X_i = B_i diag(a_i) C^T, where :math:`diag(a_i)` is the diagonal matrix whose nonzero entries are equal to the :math:`i`-th row of the :math:`I \times R` factor matrix :math:`A`, :math:`B_i` is a :math:`J_i \times R` factor matrix such that the cross product matrix :math:`B_{i_1}^T B_{i_1}` is constant for all :math:`i`, and :math:`C` is a :math:`K \times R` factor matrix. To compute this decomposition, we reformulate the expression for :math:`B_i` such that .. math:: B_i = P_i B, where :math:`P_i` is a :math:`J_i \times R` orthogonal matrix and :math:`B` is a :math:`R \times R` matrix. An alternative formulation of the PARAFAC2 decomposition is that the tensor element :math:`X_{ijk}` is given by .. math:: X_{ijk} = \sum_{r=1}^R A_{ir} B_{ijr} C_{kr}, with the same constraints hold for :math:`B_i` as above. Parameters ---------- tensor_slices : ndarray or list of ndarrays Either a third order tensor or a list of second order tensors that may have different number of rows. Note that the second mode factor matrices are allowed to change over the first mode, not the third mode as some other implementations use (see note below). rank : int Number of components. n_iter_max : int Maximum number of iteration init : {'svd', 'random', CPTensor, Parafac2Tensor} Type of factor matrix initialization. See `initialize_factors`. svd : str, default is 'numpy_svd' function to use to compute the SVD, acceptable values in tensorly.SVD_FUNS normalize_factors : bool (optional) If True, aggregate the weights of each factor in a 1D-tensor of shape (rank, ), which will contain the norms of the factors. Note that there may be some inaccuracies in the component weights. tol : float, optional (Default: 1e-8) Relative reconstruction error tolerance. The algorithm is considered to have found the global minimum when the reconstruction error is less than `tol`. random_state : {None, int, np.random.RandomState} verbose : int, optional Level of verbosity return_errors : bool, optional Activate return of iteration errors n_iter_parafac: int, optional Number of PARAFAC iterations to perform for each PARAFAC2 iteration Returns ------- Parafac2Tensor : (weight, factors, projection_matrices) * weights : 1D array of shape (rank, ) all ones if normalize_factors is False (default), weights of the (normalized) factors otherwise * factors : List of factors of the CP decomposition element `i` is of shape (tensor.shape[i], rank) * projection_matrices : List of projection matrices used to create evolving factors. errors : list A list of reconstruction errors at each iteration of the algorithms. References ---------- .. [1] Kiers, H.A.L., ten Berge, J.M.F. and Bro, R. (1999), PARAFAC2—Part I. A direct fitting algorithm for the PARAFAC2 model. J. Chemometrics, 13: 275-294. Notes ----- This formulation of the PARAFAC2 decomposition is slightly different from the one in [1]_. The difference lies in that here, the second mode changes over the first mode, whereas in [1]_, the second mode changes over the third mode. 
We made this change since that means that the function accept both lists of matrices and a single nd-array as input without any reordering of the modes. """ weights, factors, projections = initialize_decomposition( tensor_slices, rank, init=init, svd=svd, random_state=random_state) rec_errors = [] norm_tensor = tl.sqrt( sum(tl.norm(tensor_slice, 2)**2 for tensor_slice in tensor_slices)) svd_fun = _get_svd(svd) projected_tensor = tl.zeros([factor.shape[0] for factor in factors], **T.context(factors[0])) for iteration in range(n_iter_max): if verbose: print("Starting iteration", iteration) factors[1] = factors[1] * T.reshape(weights, (1, -1)) weights = T.ones(weights.shape, **tl.context(tensor_slices[0])) projections = _compute_projections(tensor_slices, factors, svd_fun, out=projections) projected_tensor = _project_tensor_slices(tensor_slices, projections, out=projected_tensor) _, factors = parafac(projected_tensor, rank, n_iter_max=n_iter_parafac, init=(weights, factors), svd=svd, orthogonalise=False, verbose=verbose, return_errors=False, normalize_factors=False, mask=None, random_state=random_state, tol=1e-100) if normalize_factors: new_factors = [] for factor in factors: norms = T.norm(factor, axis=0) norms = tl.where( tl.abs(norms) <= tl.eps(factor.dtype), tl.ones(tl.shape(norms), **tl.context(factors[0])), norms) weights = weights * norms new_factors.append(factor / (tl.reshape(norms, (1, -1)))) factors = new_factors if tol: rec_error = _parafac2_reconstruction_error( tensor_slices, (weights, factors, projections)) rec_error /= norm_tensor rec_errors.append(rec_error) if iteration >= 1: if verbose: print('PARAFAC2 reconstruction error={}, variation={}.'. format(rec_errors[-1], rec_errors[-2] - rec_errors[-1])) if tol and abs(rec_errors[-2] - rec_errors[-1]) < tol: if verbose: print('converged in {} iterations.'.format(iteration)) break else: if verbose: print('PARAFAC2 reconstruction error={}'.format( rec_errors[-1])) parafac2_tensor = Parafac2Tensor((weights, factors, projections)) if return_errors: return parafac2_tensor, rec_errors else: return parafac2_tensor
def gcp(X, R, type='normal', func=None, grad=None, lower=None,\ opt='lbfgsb', mask=None, maxiters=1000, \ init='random', printitn=10, state=None, factr=1e7, pgtol=1e-4, \ fsamp=None, gsamp=None, oversample=1.1, sampler='uniform', \ fsampler=None, rate=1e-3, decay=0.1, maxfails=1, epciters=1000, \ festtol=-math.inf, beta1=0.9, beta2=0.999, epsilon=1e-8): """Generalized CANDECOMP/PARAFAC (GCP) decomposition via all-at-once optimization (OPT) [1] Computes a rank-'R' decomposition of 'tensor' such that:: tensor = [|weights; factors[0], ..., factors[-1] |]. GCP-OPT allows the use of a variety of statistically motivated loss functions suited to the data held in a tensor (i.e. continuous, discrete, binary, etc) Parameters ---------- X : ndarray Tensor to factorize **COMING SOON** Sparse tensor support R : int Rank of decomposition (Number of components). type : str, Type of objective function used Options include: 'normal' or 'gaussian' - Gaussian for real-valued data (DEFAULT) 'binary' or 'bernoulli-odds' - Bernoulli w/ odds link for binary data 'bernoulli-logit' - Bernoulli w/ logit link for binary data 'count' or 'poisson' - Poisson for count data 'poisson-log' - Poisson w/ log link for count data 'rayleigh' - Rayleigh distribution for real-valued data 'gamma' - Gamma distribution for non-negative real-valued data **COMING SOON**: 'huber (DELTA) - Similar to Gaussian, for real-valued data 'negative-binomial (r)' - Negative binomial for count data 'beta (BETA)' - Beta divergence for non-negative real-valued data 'user-specified' - Customized objective function provided by user func: lambda function User specified custom objective function, eg. lambda x, m: (m-x)**2 grad: lambda function User specified custom gradient function, eg. lambda x, m: 2*(m-x) lower: 0 or -inf Lower bound for custom objective/gradient opt : str Optimization method Options include: 'lbfgsb' - Bound-constrained limited-memory BFGS 'sgd' - Stochastic gradient descent (SGD) **COMING SOON** 'adam' - Momentum-based SGD method 'adagrad' - Adaptive gradient algorithm, well suited for sparse data If 'tensor' is dense, all 4 options can be used, 'lbfgsb' by default. **COMING SOON** - Sparse format support If 'tensor' is sparse, only 'sgd', 'adam' and 'adagrad' can be used, 'adam' by default. Each method has specific parameters, see documentation mask : ndarray Specifies a mask, 0's for missing/incomplete entries, 1's elsewhere, with the same shape as 'tensor'. **COMING SOON** - Missing/incomplete data simulation. maxiters : int Maximum number of outer iterations, 1000 by default. init : {'random', 'svd', cptensor} Initialization for factor matrices, 'random' by default. Options include: 'random' - random initialization from a uniform distribution on [0,1) 'svd' - initialize the `m`th factor matrix using the `rank` left singular vectors of the `m`th unfolding of the input tensor. cptensor - initialization provided by user. NOTE: weights are pulled in the last factor and then the weights are set to "1" for the output tensor. Initializations all result in a cptensor where the weights are one. printitn : int Print every n iterations; 0 for no printing, 10 by default. state : {None, int, np.random.RandomState} Seed for reproducable random number generation factr : float (L-BFGS-B parameter) Tolerance on the change of objective values. Defaults to 1e7. pgtol : float (L-BFGS-B parameter) Projected gradient tolerance. 
Defaults to 1e-5 sampler : {uniform, stratified, semi-stratified} Type of sampling to use for stochastic gradient (SGD/ADAM/ADAGRAD). Defaults to 'uniform' for dense tensors. Defaults to 'stratified' for sparse tensors. Options include: 'uniform' - Uniform random sampling **COMING SOON** 'stratified' - Stratified sampling, targets sparse data. Zero and nonzero values sampled separately. 'semi-stratified' - Similar to stratified sampling, but is more computationally efficient (See papers referenced). gsamp : int Number of samples for stochastic gradient (SGD/ADAM/ADAGRAD parameter). Generally set to be O(sum(shape)*R). **COMING SOON** For stratified or semi-stratified, this may be two numbers: - the number of nnz samples - the number of zero samples. If only one number is specified, then this value is used for both nnzs and zeros (total number of samples is 2x specified value in this case). fsampler : {'uniform', 'stratified', custom} Type of sampling for estimating objective function (SGD/ADAM/ADAGRAD parameter). Options include: 'uniform' - Uniform random sampling **COMING SOON** 'stratified' - Stratified sampling, targets sparse data. Zero and nonzero values sampled separately. custom - User-defined sampler (lambda function). Custom option is primarily useful in reusing sampled elements across multiple tests. fsamp : int (SGD/ADAM/ADAGRAD parameter) Number of samples to estimate objective function. This should generally be somewhat large since we want this sample to generate a reliable estimate of the true function value. oversample : float (Stratified sampling parameter) Factor to oversample when implicitly sampling zeros in the sparse case. Defaults to 1.1. Only adjust for very small tensors. rate : float (SGD/ADAM parameter) Initial learning rate. Defaults to 1e-3. decay : float (SGD/ADAM parameter) Amount to decrease learning rate when progress stagnates, i.e. no change in objective function between epochs. Defaults to 0.1. maxfails : int (SGD/ADAM parameter) Number of times to decrease the learning rate. Defaults to 1, may be set to zero. epciters : int (SGD/ADAM parameter) Iterations per epoch. Defaults to 1000. festtol : float (SGD/ADAM parameter) Quit estimation of function if it goes below this level. Defaults to -inf. beta1 : float (ADAM parameter) - generally doesn't need to be changed Defaults to 0.9 beta2 : float (ADAM parameter) - generally doesn't need to be changed Defaults to 0.999 epsilon : float (ADAM parameter) - generally doesn't need to be changed Defaults to 1e-8 Returns ------- Mfin : CPTensor Canonical polyadic decomposition of input tensor X Reference --------- [1] D. Hong, T. G. Kolda, J. A. Duersch, Generalized Canonical Polyadic Tensor Decomposition, SIAM Review, 62:133-163, 2020, https://doi.org/10.1137/18M1203626 [2] T. G. Kolda, D. Hong, Stochastic Gradients for Large-Scale Tensor Decomposition. SIAM J. 
Mathematics of Data Science, 2:1066-1095, 2020, https://doi.org/10.1137/19m1266265 """ # Timer - Setup (outside optimization) start_setup0 = time.perf_counter() # Initial setup nd = tl.ndim(X) sz = tl.shape(X) tsz = X.size X_context = tl.context(X) vecsz = 0 for i in range(nd): # tsz *= sz[i] vecsz += sz[i] vecsz *= R W = mask # Random set-up if state is not None: state = tl.check_random_state(state) # capture stats(nnzs, zeros, missing) nnonnzeros = 0 X = tl.tensor_to_vec(X) for i in X: if i > 0: nnonnzeros += 1 X = tl.reshape(X, sz) nzeros = tsz - nnonnzeros nmissing = 0 if W is not None: W = tl.tensor_to_vec(W) for i in range(tl.shape(W)[0]): if W[i] > 0: nmissing += 1 # TODO: is this right?? W = tl.reshape(W, sz) # Dictionary for storing important information regarding the decomposition problem info = {} info['tsz'] = tsz info[ 'nmissing'] = 0 # TODO: revisit once missing value functionality incorporated info['nnonnzeros'] = nnonnzeros info[ 'nzeros'] = nzeros # TODO: revisit once missing value functionality incorporated # Set up function, gradient, and bounds fh, gh, lb = validate_type(type, X) info['type'] = type info['fh'] = fh info['gh'] = gh info['lb'] = lb # initialize CP-tensor and make a copy to work with so as to have the starting guess M0 = initialize_cp(X, R, init=init, random_state=state) wghts0 = tl.copy(M0[0]) fcts0 = [] for i in range(nd): f = tl.copy(M0[1][i]) fcts0.append(f) M = CPTensor((wghts0, fcts0)) # Lambda weights are assumed to be all ones throughout, check initial guess satisfies assumption if not tl.all(M[0]): print("Initialization of CP tensor has failed (lambda weight(s) != 1.") sys.exit(1) # check optimization method if validate_opt(opt): print("Choose optimization method from: {lbfgsb, sgd}") sys.exit(1) use_stoc = False if opt != 'lbfgsb': use_stoc = True info['opt'] = opt # set up for stochastic optimization (e.g. sgd, adam, adagrad) if use_stoc: # set up fsampler, gsampler ---> uniform sampling only for now # TODO : add stratified, semi-stratified and user-specified sampling options if not sampler == "uniform": print( "Only uniform sampling currently supported for stochastic optimization." 
) sys.exit(1) fsampler_type = sampler gsampler_type = sampler # setup fsampler f_samp = fsamp if f_samp == None: upper = np.maximum(math.ceil(tsz / 10), 10 ^ 6) f_samp = np.minimum(upper, tsz) # set up lambda function/function handle for uniform sampling fsampler = lambda: tl_sample_uniform(X, f_samp) fsampler_str = "{} with {} samples".format(fsampler_type, f_samp) # setup gsampler g_samp = gsamp if g_samp == None: upper = np.maximum(1000, math.ceil(10 * tsz / maxiters)) g_samp = np.minimum(upper, tsz) # setup lambda function/function handle for uniform sampling gsampler = lambda: tl_sample_uniform(X, g_samp) gsampler_str = "{} with {} samples".format(gsampler_type, g_samp) # capture the info info['fsampler'] = fsampler_str info['gsampler'] = gsampler_str info['fsamp'] = f_samp info['gsamp'] = g_samp time_setup0 = time.perf_counter() - start_setup0 # Welcome message if printitn > 0: print("GCP-OPT-{} (Generalized CP Tensor Decomposition)".format(opt)) print("------------------------------------------------") print("Tensor size:\t\t\t\t{} ({} total entries)".format(sz, tsz)) if nmissing > 0: print("Missing entries: {} ({})".format(nmissing, 100 * nmissing / tsz)) print("Generalized function type:\t{}".format(type)) print("Objective function:\t\t\t{}".format( inspect.getsource(fh).strip())) print("Gradient function:\t\t\t{}".format( inspect.getsource(gh).strip())) print("Lower bound of factors:\t\t{}".format(lb)) print("Optimization method:\t\t{}".format(opt)) if use_stoc: print("Max iterations (epochs): {}".format(maxiters)) print("Iterations per epoch: {}".format(epciters)) print("Learning rate / decay / maxfails: {} {} {}".format( rate, decay, maxfails)) print("Function Sampler: {}".format(fsampler_str)) print("Gradient Sampler: {}".format(gsampler_str)) else: print("Max iterations:\t\t\t\t{}".format(maxiters)) print("Projected gradient tol:\t\t{}\n".format(pgtol)) # Make like a zombie and start decomposing Mfin = None # L-BFGS-B optimization if opt == 'lbfgsb': # Timer - Setup (inside optimization) start_setup1 = time.perf_counter() # set up bounds for l-bfgs-b if lb = 0 bounds = None if lb == 0: lb = tl.zeros(tsz) ub = math.inf * tl.ones(tsz) fcn = lambda x: tl_gcp_fg(vec2factors(x, sz, R, X_context), X, fh, gh) m = factors2vec(M[1]) # capture params for l-bfgs-b lbfgsb_params = {} lbfgsb_params['x0'] = factors2vec(M0.factors) lbfgsb_params['printEvery'] = printitn lbfgsb_params['maxIts'] = maxiters lbfgsb_params['maxTotalIts'] = maxiters * 10 lbfgsb_params['factr'] = factr lbfgsb_params['pgtol'] = pgtol time_setup1 = time.perf_counter() - start_setup1 if printitn > 0: print("Begin main loop") # Timer - Main operation start_main = time.perf_counter() x, f, info_dict = fmin_l_bfgs_b(fcn, m, approx_grad=False, bounds=None, \ pgtol=pgtol, factr=factr, maxiter=maxiters) time_main = time.perf_counter() - start_main # capture info info['fcn'] = fcn info['lbfgsbopts'] = lbfgsb_params info['lbfgsbout'] = info_dict info['finalf'] = f if printitn > 0: print("\nFinal objective: {}".format(f)) print("Setup time: {}".format(time_setup0 + time_setup1)) print("Main loop time: {}".format(time_main)) print("Outer iterations:" ) # TODO: access this value (see manpage for fmin_l_bfgs_b) print("Total iterations: {}".format(info_dict['nit'])) print("L-BFGS-B exit message: {} ({})".format( info_dict['task'], info_dict['warnflag'])) Mfin = vec2factors(x, sz, R, X_context) # Stochastic optimization else: # Timer - Setup (inside optimization) start_setup1 = time.perf_counter() if opt == "adam" or opt == 
"adagrad": print("{} not currently supported".format(opt)) sys.exit(1) # prepare for sgd # initialize moments m = [] v = [] # Extract samples for estimating function value (i.e. call fsampler), these never change fsubs, fvals, fwgts = fsampler() # Compute initial estimated function value fest = tl_gcp_fg_est(M, fh, gh, fsubs, fvals, fwgts, True, False, False, False) # Set up loop variables nfails = 0 titers = 0 M_weights = tl.copy(M[0]) M_factors = [] for k in range(nd): M_factors.append(tl.copy(M[1][k])) Msave = CPTensor( (M_weights, M_factors)) # save a copy of the initial model msave = m vsave = v fest_prev = fest[0] # Tracing the progress in function value by epoch fest_trace = tl.zeros(maxiters + 1) step_trace = tl.zeros(maxiters + 1) time_trace = tl.zeros(maxiters + 1) fest_trace[0] = fest[0] # Print status if printitn > 0: print("Begin main loop") print("Initial f-est: {}".format(fest[0])) time_setup1 = time.perf_counter() - start_setup1 start_main = time.perf_counter() time_trace[0] = time.perf_counter() - start_setup0 # Main loop - outer iteration for nepoch in range(maxiters): step = (decay**nfails) * rate # Main loop - inner iteration for iter in range(epciters): # Tracking iterations titers = titers + 1 # Select subset for stochastic gradient (i.e. call gsampler) gsubs, gvals, gwts = gsampler() # Compute gradients for each mode Gest = tl_gcp_fg_est(M, fh, gh, gsubs, gvals, gwts, False, True, False, False) # Check for inf gradient for g in Gest[0]: g_max = tl.max(g) g_min = tl.min(g) if math.isinf(g_max) or math.isinf(g_min): print( "Infinite gradient encountered! (epoch = {}, iter = {})" .format(nepoch, iter)) # TODO : add functionality for ADAM and ADAGRAD optimization # Take gradient step for k in range(nd): M.factors[k] = M.factors[k] - step * Gest[0][k] # Estimate objective (i.e. call tl_gcp_fg_est) fest = tl_gcp_fg_est(M, fh, gh, fsubs, fvals, fwgts, True, False, False, False) # Save trace (fest & step) fest_trace[nepoch + 1] = fest[0] step_trace[nepoch + 1] = step # Check convergence condition failed_epoch = False if fest[0] > fest_prev: failed_epoch = True if failed_epoch: nfails += 1 festtol_test = False if fest[0] < festtol: festtol_test = True # Reporting if printitn > 0 and (nepoch % printitn == 0 or failed_epoch or festtol_test): print("Epoch {}: f-est = {}, step = {}".format( nepoch, fest[0], step), end='') if failed_epoch: print( ", nfails = {} (resetting to solution from last epoch)" .format(nfails)) print("") # Rectify failed epoch or save current solution if failed_epoch: M = Msave m = msave v = vsave fest[0] = fest_prev titers = titers - epciters else: Msave = CPTensor((tl.copy(M.weights), tl.copy(M.factors))) msave = m vsave = v fest_prev = fest[0] time_trace[nepoch] = time.perf_counter() - start_setup0 if (nfails > maxfails) or festtol_test: break Mfin = M time_main = time.perf_counter() - start_main # capture info info['fest_trace'] = fest_trace info['step_trace'] = step_trace info['time_trace'] = time_trace info['nepoch'] = nepoch # Report end of main loop if printitn > 0: print("End Main Loop") print("") print("Final f-east: {}".format(fest[0])) print("Setup time: {0:0.6f}".format(time_setup0 + time_setup1)) print("Main loop time: {0:0.6f}".format(time_main)) print("Total iterations: {}".format(nepoch * epciters)) # Wrap up / capture remaining info info['mainTime'] = time_main info['setupTime0'] = time_setup0 info['setupTime1'] = time_setup1 info['setupTime'] = time_setup0 + time_setup1 return Mfin