def test_dim_over_nm_v2_zero(self, avail_mem):
    tot_n = 400_000
    tot_m = 2_000
    n, m = select_dim_over_nm_v2(tot_n, tot_m, coef_nm=0, coef_n=0, coef_m=0, rest=0,
                                 max_mem=avail_mem)
    created = 0
    do_check(created, avail_mem, n, tot_n, m, tot_m)

    n, m = select_dim_over_nm_v2(tot_n, tot_m, coef_nm=1.3, coef_n=0, coef_m=0, rest=9890,
                                 max_mem=avail_mem)
    created = 1.3 * n * m + 9890
    do_check(created, avail_mem, n, tot_n, m, tot_m)

    n, m = select_dim_over_nm_v2(tot_n, tot_m, coef_nm=0, coef_n=2.0, coef_m=0, rest=9890,
                                 max_mem=avail_mem)
    created = 2 * n + 9890
    do_check(created, avail_mem, n, tot_n, m, tot_m)
def test_dim_over_nm_v2_notenough(self, avail_mem):
    tot_n = 40_000
    tot_m = 2_000
    tot_d = 30_720
    tot_t = 10
    with pytest.raises(MemoryError):
        select_dim_over_nm_v2(tot_n, tot_m, coef_nm=1.0, coef_n=tot_d + tot_t,
                              coef_m=tot_d + tot_t, rest=0, max_mem=avail_mem)
    with pytest.raises(MemoryError):
        select_dim_over_nm_v2(tot_n, tot_m, coef_nm=0.1, coef_n=0, coef_m=0, rest=12312,
                              max_mem=avail_mem)
def test_dim_over_nm_v2(self, avail_mem):
    tot_n = 40_000
    tot_m = 2_000
    tot_d = 30_720
    tot_t = 10
    n, m = select_dim_over_nm_v2(tot_n, tot_m, coef_nm=1.0, coef_n=tot_d + tot_t,
                                 coef_m=tot_d + tot_t, rest=0, max_mem=avail_mem)
    created = n * m + n * tot_t + n * tot_d + m * tot_d + m * tot_t
    do_check(created, avail_mem, n, tot_n, m, tot_m)
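# --- Illustrative sketch (not part of the test suite) ----------------------
# do_check is defined elsewhere in the test module. As a rough guide to what
# the calls above presumably assert, these are the invariants expected from
# select_dim_over_nm_v2: the estimated allocation fits the budget and the
# selected block sizes never exceed the full problem dimensions.
def do_check_sketch(created, avail_mem, n, tot_n, m, tot_m):
    assert created <= avail_mem   # estimated memory stays within the budget
    assert 0 < n <= tot_n         # row block is positive and within bounds
    assert 0 < m <= tot_m         # column block is positive and within bounds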
def fmmv_cpu_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    v: torch.Tensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor],
                    opt: BaseOptions):
    opt = _setup_opt(opt, is_cpu=True)

    dtype = X1.dtype
    ntot, dtot = X1.size()
    mtot, T = v.size()

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)
    out.fill_(0.0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Narrowing X1, X2: n + m
    # Prepare - not computable, depends on kernel
    # ker_chunk : n*m
    # finalize : 0 (if can be implemented in place, kernel-dependent)
    n, m = select_dim_over_nm_v2(max_n=ntot, max_m=mtot, coef_nm=1, coef_n=1, coef_m=1,
                                 rest=0, max_mem=avail_mem)

    ker_chunk = create_same_stride((n, m), out, dtype, device='cpu')
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        cur_out = out[i:i + ic, :]
        X1_chunk = X1.narrow_rows(i, ic)
        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow_rows(j, jc)
            cur_ker_chunk = ker_chunk[:ic, :jc]
            cur_ker_chunk.fill_(0.0)

            ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
            kernel._apply_sparse(X1_chunk, X2_chunk.transpose_csc(), cur_ker_chunk)
            kernel._finalize(cur_ker_chunk, ddd)

            # Multiply by the vector v
            cur_out.addmm_(cur_ker_chunk, v.narrow(0, j, jc))
    return out
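# --- Illustrative sketch (not part of the library) --------------------------
# The double loop in fmmv_cpu_sparse visits the output in (n x m) tiles,
# shrinking the last tile in each dimension via min(...). This standalone
# example (arbitrary sizes) shows that the traversal covers every output
# element exactly once.
def _tile_coverage_example():
    ntot, mtot, n, m = 10, 7, 4, 3
    tiles = []
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)       # last row block may be smaller than n
        for j in range(0, mtot, m):
            jc = min(m, mtot - j)   # last column block may be smaller than m
            tiles.append((i, j, ic, jc))
    assert sum(ic * jc for _, _, ic, jc in tiles) == ntot * mtot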
def _sparse_fmm(proc_idx, queue, device_id):
    a: ArgsFmm = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem

    ntot, dtot = X1.shape
    mtot = X2.size(0)

    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # X1_chunk   : ntot + 2 * D * ntot * density
    # X2_chunk   : dtot + 2 * D * mtot * density (because is transposed)
    # sparse_out : ntot + 2 * ntot * mtot * density (assume density=1 here)
    # ker_gpu    : mtot * ntot
    n, m = select_dim_over_nm_v2(max_n=ntot, max_m=mtot, coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density,
                                 coef_m=2 * dtot * X2.density,
                                 rest=dtot, max_mem=avail_mem)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            cpu_buf = create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)

            X2_chunk = X2.narrow_rows(j, jc).to(dtype=gpu_dtype)
            X2_chunk_d = SparseTensor.from_scipy(
                X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                .index_to_int() \
                .to(device=tc_device)

            for i in range(0, ntot, n):
                ic = min(n, ntot - i)

                X1_chunk = X1.narrow_rows(i, ic).to(dtype=gpu_dtype)
                X1_chunk_d = X1_chunk.index_to_int().to(device=tc_device)

                cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                cur_g_out.fill_(0.0)

                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
                cur_g_out = kernel._apply_sparse(X1_chunk_d, X2_chunk_d, cur_g_out)
                cur_g_out = kernel._finalize(cur_g_out, ddd)
                copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j, cpu_buf)
                del ddd, X1_chunk_d, X1_chunk
            del X2_chunk, X2_chunk_d
        del g_out
    return out
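# --- Illustrative sketch (not part of the library) --------------------------
# The coefficients passed to select_dim_over_nm_v2 in _sparse_fmm encode the
# per-block memory model from the comment above. This hypothetical helper
# spells out the implied estimate, in elements of gpu_dtype, for one (n, m)
# block with data dimension d and per-matrix densities.
def _sparse_fmm_block_elements(n, m, d, density1, density2):
    x1_chunk = n + 2 * d * n * density1   # row pointers + indices/values of the X1 block
    x2_chunk = d + 2 * d * m * density2   # transposed X2 block (CSC -> CSR)
    sparse_out = n + 2 * n * m            # sparse output buffer, worst case density = 1
    ker_gpu = n * m                       # dense kernel block
    return x1_chunk + x2_chunk + sparse_out + ker_gpu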
def sparse_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()

    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda

    ntot, dtot = X1.shape
    mtot, T = v.size()

    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk   : N + 2*D*N*density
    # X2_chunk   : D + 2*D*M*density (because is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu    : M*N
    # mmv_gpu    : N*T
    # v_gpu      : M*T
    # Other: GPU buffer
    n, m = select_dim_over_nm_v2(max_n=ntot, max_m=mtot, coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density + T,
                                 coef_m=2 * dtot * X2.density + T,
                                 rest=dtot, max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # First collect necessary memory
        mem_needed = mtot * T + n * T + n * m
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed,), dtype=dtype, device=ddev)
        # Extract the sub-tensors
        flat_offset = 0
        v_gpu = extract_same_stride(flat_gpu_tn, size=(mtot, T), other=v, offset=flat_offset)
        flat_offset += np.prod(v_gpu.shape)
        copy_to_device_noorder(mtot, T, v, 0, 0, v_gpu, 0, 0)
        mmv_gpu = extract_same_stride(flat_gpu_tn, size=(n, T), other=out, offset=flat_offset)
        flat_offset += np.prod(mmv_gpu.shape)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = extract_fortran(flat_gpu_tn, size=(n, m), offset=flat_offset)
        flat_offset += np.prod(ker_gpu.shape)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            cur_mmv_gpu = mmv_gpu[:ic]  # n x T
            cur_mmv_gpu.fill_(0.0)

            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)

                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)

                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)

                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d, cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)

                # Multiply by the vector v
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                del ddd, X2_chunk, X2_chunk_d

            # send result to CPU
            if not cuda_inputs:
                copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
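# --- Illustrative sketch (not part of the library) --------------------------
# extract_same_stride / extract_fortran carve sub-tensors out of a single flat
# allocation while a running offset is advanced by each sub-tensor's size.
# This simplified, row-major-only sketch shows the same offset bookkeeping with
# plain PyTorch views (the real helpers also match strides and support
# column-major layout).
def _flat_buffer_example():
    import torch

    def take_view(flat, rows, cols, offset):
        view = flat.narrow(0, offset, rows * cols).view(rows, cols)
        return view, offset + rows * cols

    mtot, T, n, m = 100, 4, 32, 16
    flat = torch.empty(mtot * T + n * T + n * m)
    offset = 0
    v_buf, offset = take_view(flat, mtot, T, offset)   # M x T
    mmv_buf, offset = take_view(flat, n, T, offset)    # n x T
    ker_buf, offset = take_view(flat, n, m, offset)    # n x m
    assert offset == flat.numel()                      # buffer fully partitioned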
def distk_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda

    # GPU memory usage:
    # X1s : n x D
    # X2s : m x D
    # vs  : m x T
    # nm  : n x m
    # out : n x T
    # -----------
    # total: n*m + n * (D + T) + m * (D + T) = R
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, m = select_dim_over_nm_v2(max_n=N, max_m=M, coef_nm=1, coef_n=D + T, coef_m=D + T,
                                 rest=0, max_mem=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        mem_needed = n * m
        if not cuda_inputs:
            mem_needed += n * T + n * D + m * D + m * T
        flat_gpu_tn = torch.empty(size=(mem_needed,), dtype=dtype, device=ddev)

        flat_offset = 0
        nm_gpu = extract_same_stride(flat_gpu_tn, size=(n, m), other=X1, offset=flat_offset)
        flat_offset += np.prod(nm_gpu.shape)
        if not cuda_inputs:
            out_gpu = extract_same_stride(flat_gpu_tn, size=(n, T), other=out, offset=flat_offset)
            flat_offset += np.prod(out_gpu.shape)
            X1s_gpu = extract_same_stride(flat_gpu_tn, size=(n, D), other=X1, offset=flat_offset)
            flat_offset += np.prod(X1s_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn, size=(m, D), other=X2, offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
            vs_gpu = extract_same_stride(flat_gpu_tn, size=(m, T), other=v, offset=flat_offset)
            flat_offset += np.prod(vs_gpu.shape)

        for i in range(0, N, n):
            nb = min(n, N - i)
            if cuda_inputs:
                cur_X1s_gpu = X1.narrow(0, i, nb)  # n x D
            else:
                cur_X1s_gpu = copy_to_device_noorder(nb, D, X1, i, 0, X1s_gpu, 0, 0)
            sq1 = torch.norm(cur_X1s_gpu, p=2, dim=1, keepdim=True).pow_(2)
            if cuda_inputs:
                cur_out_gpu = out.narrow(0, i, nb)  # n x T
            else:
                cur_out_gpu = out_gpu.narrow(0, 0, nb)  # n x T
            cur_out_gpu.fill_(0.0)

            for j in range(0, M, m):
                mb = min(m, M - j)
                if cuda_inputs:
                    cur_X2s_gpu = X2.narrow(0, j, mb)  # m x D
                    cur_vs_gpu = v.narrow(0, j, mb)  # m x T
                else:
                    cur_X2s_gpu = copy_to_device_noorder(mb, D, X2, j, 0, X2s_gpu, 0, 0)  # m x D
                    cur_vs_gpu = copy_to_device_noorder(mb, T, v, j, 0, vs_gpu, 0, 0)  # m x T
                cur_nm_gpu = nm_gpu[:nb, :mb]  # n x m

                sq2 = torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True).pow_(2)
                torch.mm(cur_X1s_gpu, cur_X2s_gpu.T, out=cur_nm_gpu)

                cur_nm_gpu.mul_(-2.0)
                cur_nm_gpu.add_(sq1)
                cur_nm_gpu.add_(sq2.T)
                cur_nm_gpu.clamp_min_(0)
                kernel._transform(cur_nm_gpu)

                # Multiply by the vector v
                cur_out_gpu.addmm_(cur_nm_gpu, cur_vs_gpu)  # n x T

            if not cuda_inputs:
                # send result to CPU
                copy_to_host_noorder(nb, T, out_gpu, 0, 0, out, i, 0)
    return out
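# --- Illustrative sketch (not part of the library) --------------------------
# The inner loop of distk_fmmv builds pairwise squared Euclidean distances via
# the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y before handing the
# block to kernel._transform. This standalone CPU check (arbitrary shapes)
# verifies that the same in-place sequence reproduces torch.cdist(...)**2.
def _squared_distance_expansion_check():
    import torch

    X1b = torch.randn(5, 3)
    X2b = torch.randn(4, 3)
    sq1 = torch.norm(X1b, p=2, dim=1, keepdim=True).pow_(2)   # n x 1
    sq2 = torch.norm(X2b, p=2, dim=1, keepdim=True).pow_(2)   # m x 1
    nm = X1b @ X2b.T
    nm.mul_(-2.0).add_(sq1).add_(sq2.T).clamp_min_(0)
    assert torch.allclose(nm, torch.cdist(X1b, X2b).pow(2), atol=1e-5)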