def to_torch(Xtr, Ytr, Xts, Yts, **kwargs):
    from falkon.sparse.sparse_tensor import SparseTensor
    import torch
    return (SparseTensor.from_scipy(Xtr), torch.from_numpy(Ytr),
            SparseTensor.from_scipy(Xts), torch.from_numpy(Yts), {})

def test_start_zero(self):
    device = 'cpu'
    indexptr = torch.tensor([0, 1, 3, 4], dtype=torch.long, device=device)
    index = torch.tensor([1, 0, 1, 0], dtype=torch.long, device=device)
    value = torch.tensor([2, 1, 3, 4], dtype=torch.float32, device=device)
    arr = SparseTensor(indexptr=indexptr, index=index, data=value,
                       size=(3, 2), sparse_type="csr")

    arr_small = arr.narrow_rows(0, 2)
    sm_coo = arr_small.to_scipy().tocoo()
    self.assertEqual(sm_coo.row.tolist(), [0, 1, 1])
    self.assertEqual(sm_coo.col.tolist(), [1, 0, 1])
    self.assertEqual(sm_coo.data.tolist(), [2, 1, 3])
    # Narrowing from row 0 must not copy: index-pointer storage is shared.
    self.assertEqual(arr.indexptr.data_ptr(), arr_small.indexptr.data_ptr())

    arr_small = arr.narrow_rows(0, 1)
    sm_coo = arr_small.to_scipy().tocoo()
    self.assertEqual(sm_coo.row.tolist(), [0])
    self.assertEqual(sm_coo.col.tolist(), [1])
    self.assertEqual(sm_coo.data.tolist(), [2])
    self.assertEqual(arr.indexptr.data_ptr(), arr_small.indexptr.data_ptr())

def sparse_matmul(A: SparseTensor, B: SparseTensor, out: torch.Tensor) -> torch.Tensor:
    """Sparse*Sparse matrix multiplication. The output is copied into the dense `out` matrix.

    This function can be applied to CPU or CUDA tensors, but all tensors
    must be on the same device.

    Parameters
    ----------
    A : SparseTensor
        N x D, sparse matrix.
    B : SparseTensor
        D x M, sparse matrix.
    out : torch.Tensor
        Dense N x M tensor which will hold the output of the multiplication.

    Returns
    -------
    out : torch.Tensor
        The same tensor as the input `out` parameter.
    """
    if A.nnz() == 0 or B.nnz() == 0:
        out.fill_(0.0)
        return out
    if A.is_cuda:
        return _sparse_matmul_cuda(A, B, out)
    else:
        return _sparse_matmul_cpu(A, B, out)

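# Minimal usage sketch for `sparse_matmul`. The exact import path is assumed;
# as the format-check test later in this section shows, the CPU path expects a
# CSR left operand and a CSC right operand.
import numpy as np
import scipy.sparse
import torch
from falkon.sparse.sparse_tensor import SparseTensor

A = SparseTensor.from_scipy(scipy.sparse.csr_matrix(np.eye(3, dtype=np.float32)))
B = SparseTensor.from_scipy(scipy.sparse.csc_matrix(np.ones((3, 2), dtype=np.float32)))
out = torch.empty(3, 2, dtype=torch.float32)
sparse_matmul(A, B, out)                      # `out` is overwritten with the dense product
assert torch.allclose(out, torch.ones(3, 2))  # identity @ ones == ones
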
def fmm_cpu_sparse(X1: SparseTensor,
                   X2: SparseTensor,
                   kernel: 'falkon.kernels.Kernel',
                   out: Optional[torch.Tensor],
                   opt: BaseOptions) -> torch.Tensor:
    opt = _setup_opt(opt, is_cpu=True)
    ntot, dtot = X1.size()
    mtot = X2.size(0)

    if out is None:
        out = torch.empty(ntot, mtot, dtype=X1.dtype)

    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")
        blockwise_fmm_cpu_sparse(X1, X2, kernel, out, avail_mem)
    else:
        # Do the kernel computation on the spot
        out.fill_(0.0)
        ddd = kernel._prepare_sparse(X1, X2)
        kernel._apply_sparse(X1, X2.transpose_csc(), out)
        kernel._finalize(out, ddd)
    return out

def _prepare_sparse(self, X1: SparseTensor, X2: SparseTensor) -> DistKerContainer:
    sq1 = torch.empty(X1.size(0), dtype=X1.dtype, device=X1.device)
    sparse_ops.sparse_square_norm(X1, sq1)
    sq2 = torch.empty(X2.size(0), dtype=X1.dtype, device=X1.device)
    sparse_ops.sparse_square_norm(X2, sq2)
    return DistKerContainer(sq1=sq1.reshape(-1, 1), sq2=sq2.reshape(-1, 1))

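# The per-row squared norms computed above feed the standard expansion
#   ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <x, y>,
# so on top of `sq1` and `sq2` the kernel only needs the sparse inner products
# X1 @ X2^T. A dense toy check of the identity (illustrative values only):
import torch

X1 = torch.randn(5, 3)
X2 = torch.randn(4, 3)
sq1 = X1.pow(2).sum(1).reshape(-1, 1)   # plays the role of sparse_square_norm(X1, sq1)
sq2 = X2.pow(2).sum(1).reshape(-1, 1)
dists = sq1 + sq2.T - 2 * (X1 @ X2.T)   # 5 x 4 matrix of squared distances
assert torch.allclose(dists, torch.cdist(X1, X2).pow(2), atol=1e-5)
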
def fmmv_cuda_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: torch.Tensor,
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (out, 'out'))

    N = X1.size(0)
    # Create output matrix
    if out is None:
        out = create_fortran((N, v.size(1)), v.dtype, 'cpu', pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Arguments passed to each subprocess: each GPU receives a contiguous
    # block of rows of X1 and the matching slice of the output.
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                              X2=X2,
                              v=v,
                              out=out.narrow(0, block_sizes[i], bwidth),
                              kernel=kernel,
                              max_mem=g.usable_ram), g.Id))
    _start_wait_processes(sparse_fmmv, args)
    return out

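# `calc_gpu_block_sizes` partitions the N rows across the available GPUs,
# presumably in proportion to each device's speed. A toy sketch of that kind
# of partitioning (illustrative only, not the library's implementation):
def toy_block_sizes(speeds, N):
    total = sum(speeds)
    bounds = [0]
    for s in speeds[:-1]:
        bounds.append(bounds[-1] + round(N * s / total))
    bounds.append(N)
    return bounds

print(toy_block_sizes([1.0, 1.0], 10))  # [0, 5, 10] -> two equal row blocks
print(toy_block_sizes([3.0, 1.0], 10))  # [0, 8, 10] -> faster GPU gets more rows
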
def fmm_cuda_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor] = None,
                    opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((out, 'out'))

    N = X1.size(0)
    M = X2.size(0)
    if out is None:
        out = create_fortran((N, M), X1.dtype, 'cpu', pin_memory=True)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # If float32 we need to upcast to float64 to avoid numerical
    # precision errors in the kernel
    gpu_dtype = X1.dtype
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        gpu_dtype = torch.float64

    # Create the arguments passed to each subprocess
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmm(X1=X1.narrow_rows(block_sizes[i], bwidth),
                             X2=X2,
                             out=out.narrow(0, block_sizes[i], bwidth),
                             kernel=kernel,
                             gpu_dtype=gpu_dtype,
                             max_mem=g.usable_ram), g.Id))
    _start_wait_processes(_sparse_fmm, args)
    torch.cuda.empty_cache()
    return out

def test_simple_transpose(self):
    for device in ('cpu', 'cuda:0'):
        with self.subTest(device=device):
            if device == 'cuda:0' and not torch.cuda.is_available():
                self.skipTest("Cuda not available")
            indexptr = torch.tensor([0, 1, 3, 4], dtype=torch.long, device=device)
            index = torch.tensor([1, 0, 1, 0], dtype=torch.long, device=device)
            value = torch.tensor([2, 1, 3, 4], dtype=torch.float32, device=device)
            arr = SparseTensor(indexptr=indexptr, index=index, data=value,
                               size=(3, 2), sparse_type="csr")
            tr_arr = arr.transpose_csc()
            self.assertEqual((2, 3), tr_arr.shape)
            tr_mat = tr_arr.to_scipy().tocoo()
            self.assertEqual(tr_mat.row.tolist(), [1, 0, 1, 0])
            self.assertEqual(tr_mat.col.tolist(), [0, 1, 1, 2])
            self.assertEqual(tr_mat.data.tolist(), [2, 1, 3, 4])

def test_cpu_matmul(self, mat1, mat2, expected):
    out = torch.empty_like(expected)
    mat1_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat1))
    mat2_csc = SparseTensor.from_scipy(scipy.sparse.csc_matrix(mat2))
    sparse_matmul(mat1_csr, mat2_csc, out)
    torch.testing.assert_allclose(out, expected)

def fdmmv_cpu_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: Optional[torch.Tensor],
                     w: Optional[torch.Tensor],
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None):
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fMMV.")
    T = v.size(1) if v is not None else w.size(1)
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrowed X1 : n
    # ker_chunk   : n*M
    # w_blk       : n*T
    # so each row-block costs n * (M + T + 1) elements.
    n = avail_mem / (M + T + 1)
    n = int(math.floor(n))
    if n < 1:
        raise MemoryError(("Available memory %.2fGB is insufficient "
                           "for blockwise fdMMv.") %
                          (avail_mem * sizeof_dtype(dtype) / 2**30))

    # Allocate the fixed arrays
    ker_chunk = create_same_stride((n, M), out, dtype, device='cpu')
    w_blk = create_same_stride((n, T), out, dtype, device='cpu')
    # Run blockwise fdmmv
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        X1_chunk = X1.narrow_rows(i, ic)
        cur_ker_chunk = ker_chunk[:ic]
        cur_ker_chunk.fill_(0.0)
        ddd = kernel._prepare_sparse(X1_chunk, X2)
        kernel._apply_sparse(X1_chunk, X2.transpose_csc(), cur_ker_chunk)
        kernel._finalize(cur_ker_chunk, ddd)
        # Multiply by the vector v
        cur_w_blk = w_blk[:ic]  # n x T
        cur_w_blk.fill_(0.0)
        if w is not None:
            cur_w_blk.copy_(w[i:i + ic, :])
        if v is not None:
            # w_blk + ker_chunk @ v => (n x T) + (n x M) @ (M x T)
            cur_w_blk.addmm_(cur_ker_chunk, v)
        out.addmm_(cur_ker_chunk.T, cur_w_blk)
    del ker_chunk, w_blk
    return out

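# Back-of-envelope check of the block-size formula above, with illustrative
# numbers (4 GiB of usable RAM, float64, M = 10_000 centers, T = 1 target):
avail_elems = (4 * 2**30) / 8          # number of float64 elements in 4 GiB
M, T = 10_000, 1
n = int(avail_elems // (M + T + 1))    # ker_chunk (n*M) + w_blk (n*T) + narrowed X1 (n)
print(n)                               # ~53_600 rows of the kernel matrix per block
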
def test_cuda_matmul(self, mat1, mat2, expected):
    dev = torch.device("cuda:0")
    out = create_fortran(expected.shape, expected.dtype, dev)
    mat1_csr = SparseTensor.from_scipy(
        scipy.sparse.csr_matrix(mat1)).to(device=dev)
    mat2_csr = SparseTensor.from_scipy(
        scipy.sparse.csr_matrix(mat2)).to(device=dev)
    sparse_matmul(mat1_csr, mat2_csr, out)
    torch.testing.assert_allclose(out.cpu(), expected)

def fdmmv_cuda_sparse(X1: SparseTensor,
                      X2: SparseTensor,
                      v: Optional[torch.Tensor],
                      w: Optional[torch.Tensor],
                      kernel,
                      out: Optional[torch.Tensor] = None,
                      opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")

    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.95)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        out = create_C((M, T), X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # Output of each subprocess
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        cur_out_gpu = create_C((M, T), X1.dtype, f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)
        cur_w = None
        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                               X2=X2, v=v, w=cur_w,
                               out=cur_out_gpu,
                               kernel=kernel,
                               max_mem=g.usable_ram), g.Id))
    _start_wait_processes(sparse_fdmmv, args)
    if len(wrlk) > 1:
        # Sum the per-GPU partial results, accumulating on the fastest device.
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(torch.cuda.comm.reduce_add(
            wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out

def test_cpu_matmul_wrong_format(self, mat1, mat2, expected):
    out = torch.empty_like(expected)
    mat1_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat1))
    mat2_csr = SparseTensor.from_scipy(scipy.sparse.csr_matrix(mat2))
    with pytest.raises(ValueError) as exc_info:
        sparse_matmul(mat1_csr, mat2_csr, out)
    assert str(exc_info.value).startswith("B must be CSC matrix")

    mat1_csc = SparseTensor.from_scipy(scipy.sparse.csc_matrix(mat1))
    with pytest.raises(ValueError) as exc_info:
        sparse_matmul(mat1_csc, mat2_csr, out)
    assert str(exc_info.value).startswith("A must be CSR matrix")

def test_empty(self):
    device = 'cpu'
    indexptr = torch.tensor([0, 1, 1, 1, 3, 4], dtype=torch.long, device=device)
    index = torch.tensor([1, 0, 1, 0], dtype=torch.long, device=device)
    value = torch.tensor([2, 1, 3, 4], dtype=torch.float32, device=device)
    arr = SparseTensor(indexptr=indexptr, index=index, data=value,
                       size=(5, 2), sparse_type="csr")
    # Rows 1 and 2 hold no entries, so the narrowed matrix is empty.
    arr_small = arr.narrow_rows(1, 2)
    sm_coo = arr_small.to_scipy().tocoo()
    self.assertEqual(sm_coo.row.tolist(), [])
    self.assertEqual(sm_coo.col.tolist(), [])
    self.assertEqual(sm_coo.data.tolist(), [])

def test_check_sparse():
    smat = scipy.sparse.csr_matrix(
        np.array([[0, 1], [0, 1]]).astype(np.float32))
    st = SparseTensor.from_scipy(smat)
    assert [False, True] == check_sparse(torch.tensor(0), st)
    assert [] == check_sparse()

def test_check_same_dtype_equal():
    smat = scipy.sparse.csr_matrix(
        np.array([[0, 1], [0, 1]]).astype(np.float32))
    ts = [
        torch.tensor(0, dtype=torch.float32),
        SparseTensor.from_scipy(smat),
        None,
    ]
    assert check_same_dtype(*ts) is True

def fmmv_cpu_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    v: torch.Tensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor],
                    opt: BaseOptions):
    opt = _setup_opt(opt, is_cpu=True)

    dtype = X1.dtype
    ntot, dtot = X1.size()
    mtot, T = v.size()

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)
    out.fill_(0.0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrowing X1, X2 : n + m
    # Prepare          : not computable, depends on kernel
    # ker_chunk        : n*m
    # Finalize         : 0 (if it can be implemented in place, kernel-dependent)
    n, m = select_dim_over_m(maxM=mtot, maxN=ntot,
                             coef_nm=1, coef_n=1, coef_m=1, tot=avail_mem)

    ker_chunk = create_same_stride((n, m), out, dtype, device='cpu')
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        cur_out = out[i:i + ic, :]
        X1_chunk = X1.narrow_rows(i, ic)
        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow_rows(j, jc)
            cur_ker_chunk = ker_chunk[:ic, :jc]
            cur_ker_chunk.fill_(0.0)

            ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
            kernel._apply_sparse(X1_chunk, X2_chunk.transpose_csc(), cur_ker_chunk)
            kernel._finalize(cur_ker_chunk, ddd)

            # Multiply by the vector v
            cur_out.addmm_(cur_ker_chunk, v.narrow(0, j, jc))
    return out

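# The double loop above evaluates the kernel in (n x m) tiles and accumulates
# K @ v without ever materializing the full kernel matrix. A dense toy
# analogue of the same tiling pattern (Gaussian kernel with sigma = 1 as an
# illustrative choice; this is not the library's code):
import torch

def blocked_kernel_mv(X1, X2, v, n=128, m=256):
    N, M = X1.shape[0], X2.shape[0]
    out = torch.zeros(N, v.shape[1], dtype=v.dtype)
    for i in range(0, N, n):
        ic = min(n, N - i)
        for j in range(0, M, m):
            jc = min(m, M - j)
            # One (ic x jc) kernel tile, immediately consumed by the product.
            tile = torch.cdist(X1[i:i + ic], X2[j:j + jc]).pow(2).mul(-0.5).exp()
            out[i:i + ic] += tile @ v[j:j + jc]
    return out
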
def csc_mat() -> SparseTensor:
    """3 x 3 test matrix, stored in CSC format:
        1 - 4
        - - 5
        2 3 6
    """
    indexptr = torch.tensor([0, 2, 3, 6], dtype=torch.long)
    index = torch.tensor([0, 2, 2, 0, 1, 2], dtype=torch.long)
    value = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.float32)
    return SparseTensor(indexptr=indexptr, index=index, data=value,
                        size=(3, 3), sparse_type="csc")

def test_check_same_dtype_notequal():
    smat32 = scipy.sparse.csr_matrix(
        np.array([[0, 1], [0, 1]]).astype(np.float32))
    smat64 = scipy.sparse.csr_matrix(
        np.array([[0, 1], [0, 1]]).astype(np.float64))
    ts = [
        torch.tensor(0, dtype=torch.float32),
        torch.tensor(0, dtype=torch.float64),
        SparseTensor.from_scipy(smat32),
    ]
    assert check_same_dtype(*ts) is False

    ts = [
        torch.tensor(0, dtype=torch.float32),
        SparseTensor.from_scipy(smat32),
        SparseTensor.from_scipy(smat64),
    ]
    assert check_same_dtype(*ts) is False

def test_matmul_zeros(self, mat1, mat2, expected, device):
    mat1_zero_csr = SparseTensor.from_scipy(
        scipy.sparse.csr_matrix(torch.zeros_like(mat1).numpy())).to(device=device)
    mat2_csc = SparseTensor.from_scipy(
        scipy.sparse.csc_matrix(mat2.numpy())).to(device=device)
    out = torch.empty_like(expected).to(device)
    sparse_matmul(mat1_zero_csr, mat2_csc, out)
    assert torch.all(out == 0.0)

    mat1_csr = SparseTensor.from_scipy(
        scipy.sparse.csr_matrix(mat1.numpy())).to(device=device)
    mat2_zero_csc = SparseTensor.from_scipy(
        scipy.sparse.csc_matrix(torch.zeros_like(mat2).numpy())).to(device=device)
    out = torch.empty_like(expected).to(device=device)
    sparse_matmul(mat1_csr, mat2_zero_csc, out)
    assert torch.all(out == 0.0)

def gen_sparse_matrix(a, b, dtype, density=0.1, seed=0) -> SparseTensor:
    out = random_sparse(a, b, density=density, format='csr', dtype=dtype, seed=seed)
    return SparseTensor.from_scipy(out)

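# Usage sketch, assuming `random_sparse` mirrors scipy.sparse.random so that
# roughly a * b * density values end up stored:
import numpy as np

mat = gen_sparse_matrix(100, 20, np.float32, density=0.05, seed=42)
print(mat.shape, mat.nnz())  # (100, 20), ~100 stored values
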
def select(self,
           X: _tensor_type,
           Y: Union[torch.Tensor, None],
           M: int) -> Union[_tensor_type, Tuple[_tensor_type, torch.Tensor]]:
    """Select M observations from 2D tensor `X`, preserving device and memory order.

    The selection strategy is uniformly at random. To control the randomness,
    pass an appropriate numpy random generator to this class's constructor.

    Parameters
    ----------
    X
        N x D tensor containing the whole input dataset. We have that M <= N.
    Y
        Optional N x T tensor containing the input targets. If `Y` is provided,
        the same observations selected for `X` will also be selected from `Y`.
        Certain models (such as :class:`falkon.models.LogisticFalkon`) require
        centers to be extracted from both predictors and targets, while others
        (such as :class:`falkon.models.Falkon`) only require the centers from
        the predictors.
    M
        The number of observations to choose. M <= N, otherwise M is forcibly
        set to N with a warning.

    Returns
    -------
    X_M
        The randomly selected centers. They will be in a new, memory-contiguous
        tensor. All characteristics of the input tensor will be preserved.
    (X_M, Y_M)
        If `Y` is not `None` then the entries of `Y` corresponding to the
        selected centers of `X` will also be returned.
    """
    N = X.shape[0]
    if M > N:
        warnings.warn("Number of centers M greater than the "
                      "number of data-points. Setting M to %d" % (N))
        M = N
    idx = self.random_gen.choice(N, size=M, replace=False)

    if isinstance(X, SparseTensor):
        X = X.to_scipy()
        centers = X[idx, :].copy()
        Xc = SparseTensor.from_scipy(centers)
    else:
        Xc = create_same_stride((M, X.shape[1]), other=X, dtype=X.dtype,
                                device=X.device, pin_memory=False)
        # np.int64 replaces the deprecated np.long alias.
        th_idx = torch.from_numpy(idx.astype(np.int64)).to(X.device)
        torch.index_select(X, dim=0, index=th_idx, out=Xc)

    if Y is not None:
        Yc = create_same_stride((M, Y.shape[1]), other=Y, dtype=Y.dtype,
                                device=Y.device, pin_memory=False)
        th_idx = torch.from_numpy(idx.astype(np.int64)).to(Y.device)
        torch.index_select(Y, dim=0, index=th_idx, out=Yc)
        return Xc, Yc
    return Xc

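# Hedged usage sketch: the selector's class name and its random-generator
# constructor argument are assumed from context (hypothetical), not shown above.
import numpy as np
import torch

selector = UniformSelector(np.random.default_rng(0))  # hypothetical class name
X = torch.randn(1000, 10)
Y = torch.randn(1000, 1)
Xc, Yc = selector.select(X, Y, M=100)      # 100 centers from predictors and targets
Xc_only = selector.select(X, None, M=100)  # centers from the predictors only
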
def csr_mat() -> SparseTensor:
    """3 x 2 test matrix, stored in CSR format:
        - 2
        1 3
        4 -
    """
    indexptr = torch.tensor([0, 1, 3, 4], dtype=torch.long)
    index = torch.tensor([1, 0, 1, 0], dtype=torch.long)
    value = torch.tensor([2, 1, 3, 4], dtype=torch.float32)
    return SparseTensor(indexptr=indexptr, index=index, data=value,
                        size=(3, 2), sparse_type="csr")

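# How the three CSR arrays encode that matrix, as a quick self-check using
# scipy's 3-array constructor (a minimal sketch):
import numpy as np
import scipy.sparse

# Row i occupies the slice [indexptr[i], indexptr[i+1]) of index/value.
indexptr = np.array([0, 1, 3, 4])   # row 0 -> 1 entry, row 1 -> 2, row 2 -> 1
index = np.array([1, 0, 1, 0])      # column of each stored value
value = np.array([2, 1, 3, 4], dtype=np.float32)
mat = scipy.sparse.csr_matrix((value, index, indexptr), shape=(3, 2))
print(mat.toarray())
# [[0. 2.]
#  [1. 3.]
#  [4. 0.]]
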
def select(self,
           X: _tensor_type,
           Y: Union[torch.Tensor, None],
           M: int) -> Union[_tensor_type, Tuple[_tensor_type, torch.Tensor]]:
    """Select M rows from 2D array `X`, preserving the memory order of `X`."""
    N = X.size(0)
    if M > N:
        warnings.warn("Number of centers M greater than the "
                      "number of data-points. Setting M to %d" % (N))
        M = N
    idx = self.random_gen.choice(N, size=M, replace=False)

    if isinstance(X, SparseTensor):
        X = X.to_scipy()
        centers = X[idx, :].copy()
        Xc = SparseTensor.from_scipy(centers)
    else:
        Xnp = X.numpy()  # work on the numpy array
        order = 'F' if is_f_contig(X) else 'C'
        Xc_np = np.empty((M, Xnp.shape[1]), dtype=Xnp.dtype, order=order)
        Xc = torch.from_numpy(np.take(Xnp, idx, axis=0, out=Xc_np, mode='wrap'))

    if Y is not None:
        Ynp = Y.numpy()  # work on the numpy array
        order = 'F' if is_f_contig(X) else 'C'
        Yc_np = np.empty((M, Ynp.shape[1]), dtype=Ynp.dtype, order=order)
        Yc = torch.from_numpy(np.take(Ynp, idx, axis=0, out=Yc_np, mode='wrap'))
        return Xc, Yc
    return Xc

def _sparse_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem

    ntot, dtot = X1.shape
    mtot = X2.size(0)

    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # X1_chunk   : ntot + 2 * D * ntot * density
    # X2_chunk   : dtot + 2 * D * mtot * density (because it is transposed)
    # sparse_out : ntot + 2 * ntot * mtot * density (assume density = 1 here)
    # ker_gpu    : mtot * ntot
    n, m = select_dim_over_nm_v2(max_n=ntot, max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density,
                                 coef_m=2 * dtot * X2.density,
                                 rest=dtot, max_mem=avail_mem)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m), out, gpu_dtype, 'cpu',
                                         pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)

            X2_chunk = X2.narrow_rows(j, jc).to(dtype=gpu_dtype)
            X2_chunk_d = SparseTensor.from_scipy(
                X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                .index_to_int() \
                .to(device=tc_device)
            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                X1_chunk = X1.narrow_rows(i, ic).to(dtype=gpu_dtype)
                X1_chunk_d = X1_chunk.index_to_int().to(device=tc_device)
                cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                cur_g_out.fill_(0.0)

                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
                cur_g_out = kernel._apply_sparse(X1_chunk_d, X2_chunk_d, cur_g_out)
                cur_g_out = kernel._finalize(cur_g_out, ddd)
                copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j, cpu_buf)
                del ddd, X1_chunk_d, X1_chunk
            del X2_chunk, X2_chunk_d
        del g_out
    return out

def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda

    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk   : N + 2*D*N*density
    # X2_chunk   : D + 2*D*M*density (because it is transposed)
    # sparse_out : N + 2*N*M*density (assume density = 1)
    # ker_gpu    : M*N
    # mmv_gpu    : N*T
    # v_gpu      : M*T
    # Other      : GPU buffer
    n, m = select_dim_over_nm_v2(max_n=ntot, max_m=mtot,
                                 coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density + T,
                                 coef_m=2 * dtot * X2.density + T,
                                 rest=dtot, max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # First collect the necessary memory into a single flat tensor
        mem_needed = mtot * T + n * T + n * m
        flat_gpu_tn = torch.empty(size=(mem_needed,), dtype=dtype, device=ddev)
        # Extract the sub-tensors
        flat_offset = 0
        v_gpu = extract_same_stride(flat_gpu_tn, size=(mtot, T), other=v,
                                    offset=flat_offset)
        flat_offset += np.prod(v_gpu.shape)
        copy_to_device_noorder(mtot, T, v, 0, 0, v_gpu, 0, 0)
        mmv_gpu = extract_same_stride(flat_gpu_tn, size=(n, T), other=out,
                                      offset=flat_offset)
        flat_offset += np.prod(mmv_gpu.shape)
        # ker_gpu should be fortran-ordered due to the cusparse csr2dense function
        ker_gpu = extract_fortran(flat_gpu_tn, size=(n, m), offset=flat_offset)
        flat_offset += np.prod(ker_gpu.shape)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d, cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # Send the result to the CPU
            if not cuda_inputs:
                copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out

def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype

    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk   : N + 2*D*N*density
    # X2_chunk   : D + 2*D*M*density (because it is transposed)
    # sparse_out : N + 2*N*M*density (assume density = 1)
    # ker_gpu    : M*N
    # mmv_gpu    : N*T
    # v_gpu      : M*T
    # Other      : GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot, maxN=ntot, tot=avail_mem,
        coef_nm=3,
        coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T,
        rest=dtot,
    )

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        v_gpu = v.to(device=ddev)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)
        # ker_gpu should be fortran-ordered due to the cusparse csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d, cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # Send the result to the CPU
            copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out

def sparse_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, w, out = a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory needs:
    # X1_chunk   : N + 2 * D * N * density
    # X2         : D + 2 * D * M * density (because it is transposed)
    # sparse_out : N + 2 * N * M * density (assume density = 1 here)
    # ker_gpu    : M * N
    # w_gpu      : N * T
    # v_gpu      : M * T
    # out_gpu    : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
    den = 2 * D * X1.density + 2 + 3 * M + T
    sub = D + 2 * D * M * X2.density + M * T
    if v is not None:
        sub += M * T
    n = (avail_mem - sub) / den
    n = min(int(n), N)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse dfmmv")

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        w_gpu = create_same_stride((n, T), out, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        ker_gpu = create_fortran((n, M), dtype, ddev)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        X2_d = SparseTensor.from_scipy(
            X2.transpose_csc().to_scipy().tocsr(copy=False)) \
            .index_to_int() \
            .to(device=ddev)

        for i in range(0, N, n):
            ic = min(n, N - i)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)

            ker_chunk = ker_gpu[:ic]
            ker_chunk.fill_(0.0)

            # TODO: This is wasteful (X2 will be prepared many times over)
            ddd = kernel._prepare_sparse(X1_chunk, X2)
            ker_chunk = kernel._apply_sparse(X1_chunk_d, X2_d, ker_chunk)
            ker_chunk = kernel._finalize(ker_chunk, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)
            if v is not None:
                c_g_w.addmm_(ker_chunk, v_gpu)
            out_gpu.addmm_(ker_chunk.T, c_g_w)
            del ddd, X1_chunk, X1_chunk_d

        if not out.is_cuda:
            copy_to_device_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out

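# The sizing above solves the linear budget n * den + sub <= avail_mem for n.
# A numeric check with illustrative values (float64, 4 GiB usable, v given):
D, M, T, density = 100, 5_000, 1, 0.1
avail_mem = (4 * 2**30) / 8                 # float64 elements in 4 GiB
den = 2 * D * density + 2 + 3 * M + T
sub = D + 2 * D * M * density + 2 * M * T   # v is not None, so M*T counted twice
n = int((avail_mem - sub) / den)
print(n)  # maximum rows of X1 processed per iteration (~35_700 here)
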
def mkl_export_sparse(self, mkl_mat: sparse_matrix_t, dtype: torch.dtype,
                      output_type: str = "csr") -> SparseTensor:
    """Create a :class:`SparseTensor` from a MKL sparse matrix holder.

    Note that not all possible MKL sparse matrices are supported (for example
    if 1-based indexing is used, or for non floating-point data types), but
    those created with :meth:`mkl_create_sparse_from_scipy` and
    :meth:`mkl_create_sparse` are.

    Parameters
    ----------
    mkl_mat
        The MKL sparse matrix holder.
    dtype
        The data-type of the matrix. This must match the data-type of the data
        stored in the MKL matrix (no type conversion is performed), otherwise
        garbage data or memory corruption could occur.
    output_type
        Whether the matrix should be interpreted as CSR (pass ``"csr"``) or CSC
        (pass ``"csc"``). This should match the MKL matrix, otherwise a
        transposed output may be produced.

    Returns
    -------
    The :class:`SparseTensor` object, sharing the same data arrays as the MKL
    matrix.

    Notes
    -----
    Depending on the integer type of the linked MKL version, the indices of the
    matrix may be copied. In any case the output tensor will use
    :class:`torch.int64` indices.
    """
    indptrb = ctypes.POINTER(self.MKL_INT)()
    indptren = ctypes.POINTER(self.MKL_INT)()
    indices = ctypes.POINTER(self.MKL_INT)()

    ordering = ctypes.c_int()
    nrows = self.MKL_INT()
    ncols = self.MKL_INT()

    # Normalize once so later comparisons cannot miss upper-case input.
    output_type = output_type.lower()
    if output_type == "csr":
        if dtype == torch.float64:
            fn = self.libmkl.mkl_sparse_d_export_csr
            ctype = ctypes.c_double
        elif dtype == torch.float32:
            fn = self.libmkl.mkl_sparse_s_export_csr
            ctype = ctypes.c_float
        else:
            raise TypeError("Data type %s not valid to export" % (dtype))
    elif output_type == "csc":
        if dtype == torch.float64:
            fn = self.libmkl.mkl_sparse_d_export_csc
            ctype = ctypes.c_double
        elif dtype == torch.float32:
            fn = self.libmkl.mkl_sparse_s_export_csc
            ctype = ctypes.c_float
        else:
            raise TypeError("Data type %s not valid to export" % (dtype))
    else:
        raise ValueError("Output type %s not valid" % (output_type))

    data_ptr = ctypes.POINTER(ctype)()
    ret_val = fn(mkl_mat,
                 ctypes.byref(ordering),
                 ctypes.byref(nrows),
                 ctypes.byref(ncols),
                 ctypes.byref(indptrb),
                 ctypes.byref(indptren),
                 ctypes.byref(indices),
                 ctypes.byref(data_ptr))
    Mkl.mkl_check_return_val(ret_val, fn)
    if ordering.value != 0:
        raise ValueError("1-based indexing (F-style) is not supported")

    ncols = ncols.value
    nrows = nrows.value

    # Get the index dimension
    index_dim = nrows if output_type == "csr" else ncols
    # Construct a numpy array and add 0 to the first position for
    # scipy.sparse's 3-array indexing
    indptrb = as_array(indptrb, shape=(index_dim,))
    indptren = as_array(indptren, shape=(index_dim,))
    indptren = np.insert(indptren, 0, indptrb[0])
    nnz = indptren[-1] - indptrb[0]

    # Construct numpy arrays from the data pointer and from the indices pointer
    data = np.array(as_array(data_ptr, shape=(nnz,)), copy=True)
    indices = np.array(as_array(indices, shape=(nnz,)), copy=True)

    return SparseTensor(indexptr=torch.from_numpy(indptren).to(torch.long),
                        index=torch.from_numpy(indices).to(torch.long),
                        data=torch.from_numpy(data),
                        size=(nrows, ncols),
                        sparse_type=output_type)

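# Hedged round-trip sketch: `mkl` stands for an instance of the wrapper class
# owning this method, and `mkl_create_sparse_from_scipy` is the companion
# constructor mentioned in the docstring above.
import numpy as np
import scipy.sparse
import torch

smat = scipy.sparse.random(10, 5, density=0.2, format='csr', dtype=np.float64)
handle = mkl.mkl_create_sparse_from_scipy(smat)
st = mkl.mkl_export_sparse(handle, torch.float64, output_type="csr")
assert st.shape == (10, 5)
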