def perform(self, node, inputs, outputs): context = inputs[0][0].context # Input matrix. A = inputs[0] l, n = A.shape if l != n: raise ValueError('A must be a square matrix') lda = max(1, n) # cusolver operates on F ordered matrices, but A is expected # to be symmetric so it does not matter. # We copy A if needed if self.inplace: L = A else: L = pygpu.array(A, copy=True) # The output matrix will contain only the upper or lower # triangular factorization of A. If L is C ordered (it # probably is as it is the default in Theano) we just switch # the fill mode parameter of cusolver l_parameter = 0 if self.lower else 1 if L.flags['C_CONTIGUOUS']: l_parameter = 1 - l_parameter L_ptr = L.gpudata with context: workspace_size = cusolver.cusolverDnSpotrf_bufferSize( context.cusolver_handle, l_parameter, n, L_ptr, lda) workspace = pygpu.zeros(workspace_size, dtype='float32', context=context) dev_info = pygpu.zeros((1, ), dtype='int32', context=context) workspace_ptr = workspace.gpudata dev_info_ptr = dev_info.gpudata cusolver.cusolverDnSpotrf(context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr, workspace_size, dev_info_ptr) val_dev_info = np.asarray(dev_info)[0] if val_dev_info > 0: raise LinAlgError('Cholesky decomposition failed (is A SPD?)') # cusolver leaves the elements in the matrix outside the considered # upper or lower triangle unchanged, so we need to put zeros outside # the triangle if self.lower: tril(L) else: triu(L) outputs[0][0] = L
def perform(self, node, inputs, outputs): context = inputs[0][0].context # Input matrix. A = inputs[0] l, n = A.shape if l != n: raise ValueError('A must be a square matrix') lda = max(1, n) # cusolver operates on F ordered matrices, but A is expected # to be symmetric so it does not matter. # We copy A if needed if self.inplace: L = A else: L = pygpu.array(A, copy=True) # The output matrix will contain only the upper or lower # triangular factorization of A. If L is C ordered (it # probably is as it is the default in Theano) we just switch # the fill mode parameter of cusolver l_parameter = 0 if self.lower else 1 if L.flags['C_CONTIGUOUS']: l_parameter = 1 - l_parameter L_ptr = L.gpudata with context: workspace_size = cusolver.cusolverDnSpotrf_bufferSize( context.cusolver_handle, l_parameter, n, L_ptr, lda) workspace = pygpu.zeros(workspace_size, dtype='float32', context=context) dev_info = pygpu.zeros((1,), dtype='int32', context=context) workspace_ptr = workspace.gpudata dev_info_ptr = dev_info.gpudata cusolver.cusolverDnSpotrf( context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr, workspace_size, dev_info_ptr) val_dev_info = np.asarray(dev_info)[0] if val_dev_info > 0: raise LinAlgError('Cholesky decomposition failed (is A SPD?)') # cusolver leaves the elements in the matrix outside the considered # upper or lower triangle unchanged, so we need to put zeros outside # the triangle if self.lower: tril(L) else: triu(L) outputs[0][0] = L
def perform(self, node, inputs, outputs): context = inputs[0][0].context # Size of the matrices to invert. z = outputs[0] # Matrix. A = inputs[0] # Solution vectors. b = inputs[1] assert(len(A.shape) == 2) assert(len(b.shape) == 2) if self.trans in ['T', 'C']: trans = 1 l, n = A.shape k, m = b.shape elif self.trans == 'N': trans = 0 n, l = A.shape k, m = b.shape else: raise ValueError('Invalid value for trans') if l != n: raise ValueError('A must be a square matrix') if n != k: raise ValueError('A and b must be aligned.') lda = max(1, n) ldb = max(1, k) # We copy A and b as cusolver operates inplace b = pygpu.array(b, copy=True, order='F') if not self.inplace: A = pygpu.array(A, copy=True) A_ptr = A.gpudata b_ptr = b.gpudata # cusolver expects a F ordered matrix, but A is not explicitly # converted between C and F order, instead we switch the # "transpose" flag. if A.flags['C_CONTIGUOUS']: trans = 1 - trans if self.A_structure == 'symmetric': with context: workspace_size = cusolver.cusolverDnSpotrf_bufferSize( context.cusolver_handle, 0, n, A_ptr, lda) workspace = pygpu.zeros(workspace_size, dtype='float32', context=context) dev_info = pygpu.zeros((1,), dtype='int32', context=context) workspace_ptr = workspace.gpudata dev_info_ptr = dev_info.gpudata with context: cusolver.cusolverDnSpotrf( context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr, workspace_size, dev_info_ptr) self.check_dev_info(dev_info) cusolverDnSpotrs( context.cusolver_handle, 0, n, m, A_ptr, lda, b_ptr, ldb, dev_info_ptr) else: # general case for A with context: workspace_size = cusolver.cusolverDnSgetrf_bufferSize( context.cusolver_handle, n, n, A_ptr, lda) workspace = pygpu.zeros(workspace_size, dtype='float32', context=context) pivots = pygpu.zeros(n, dtype='int32', context=context) dev_info = pygpu.zeros((1,), dtype='int32', context=context) workspace_ptr = workspace.gpudata pivots_ptr = pivots.gpudata dev_info_ptr = dev_info.gpudata with context: cusolver.cusolverDnSgetrf( context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr, pivots_ptr, dev_info_ptr) self.check_dev_info(dev_info) cusolver.cusolverDnSgetrs( context.cusolver_handle, trans, n, m, A_ptr, lda, pivots_ptr, b_ptr, ldb, dev_info_ptr) z[0] = b
def perform(self, node, inputs, outputs): context = inputs[0][0].context # Size of the matrices to invert. z = outputs[0] # Matrix. A = inputs[0] # Solution vectors. b = inputs[1] assert (len(A.shape) == 2) assert (len(b.shape) == 2) if self.trans in ['T', 'C']: trans = 1 l, n = A.shape k, m = b.shape elif self.trans == 'N': trans = 0 n, l = A.shape k, m = b.shape else: raise ValueError('Invalid value for trans') if l != n: raise ValueError('A must be a square matrix') if n != k: raise ValueError('A and b must be aligned.') lda = max(1, n) ldb = max(1, k) # We copy A and b as cusolver operates inplace b = pygpu.array(b, copy=True, order='F') if not self.inplace: A = pygpu.array(A, copy=True) A_ptr = A.gpudata b_ptr = b.gpudata # cusolver expects a F ordered matrix, but A is not explicitly # converted between C and F order, instead we switch the # "transpose" flag. if A.flags['C_CONTIGUOUS']: trans = 1 - trans if self.A_structure == 'symmetric': with context: workspace_size = cusolver.cusolverDnSpotrf_bufferSize( context.cusolver_handle, 0, n, A_ptr, lda) workspace = pygpu.zeros(workspace_size, dtype='float32', context=context) dev_info = pygpu.zeros((1, ), dtype='int32', context=context) workspace_ptr = workspace.gpudata dev_info_ptr = dev_info.gpudata with context: cusolver.cusolverDnSpotrf(context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr, workspace_size, dev_info_ptr) self.check_dev_info(dev_info) cusolverDnSpotrs(context.cusolver_handle, 0, n, m, A_ptr, lda, b_ptr, ldb, dev_info_ptr) else: # general case for A with context: workspace_size = cusolver.cusolverDnSgetrf_bufferSize( context.cusolver_handle, n, n, A_ptr, lda) workspace = pygpu.zeros(workspace_size, dtype='float32', context=context) pivots = pygpu.zeros(n, dtype='int32', context=context) dev_info = pygpu.zeros((1, ), dtype='int32', context=context) workspace_ptr = workspace.gpudata pivots_ptr = pivots.gpudata dev_info_ptr = dev_info.gpudata with context: cusolver.cusolverDnSgetrf(context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr, pivots_ptr, dev_info_ptr) self.check_dev_info(dev_info) cusolver.cusolverDnSgetrs(context.cusolver_handle, trans, n, m, A_ptr, lda, pivots_ptr, b_ptr, ldb, dev_info_ptr) z[0] = b