def test_dim_over_nd_zero(self, avail_mem):
    tot_d = 1231
    tot_n = 3700000
    n, d = select_dim_over_nd(max_n=tot_n, max_d=tot_d, coef_nd=0, coef_n=0, coef_d=0,
                              rest=0, max_mem=avail_mem)
    created = 0
    do_check(created, avail_mem, n, tot_n, d, tot_d)

    n, d = select_dim_over_nd(max_n=tot_n, max_d=tot_d, coef_nd=0, coef_n=34, coef_d=20,
                              rest=0, max_mem=avail_mem)
    created = n * 34 + d * 20
    do_check(created, avail_mem, n, tot_n, d, tot_d)

    n, d = select_dim_over_nd(max_n=tot_n, max_d=tot_d, coef_nd=0, coef_n=324, coef_d=0,
                              rest=0, max_mem=avail_mem)
    created = n * 324
    do_check(created, avail_mem, n, tot_n, d, tot_d)

def fmmv_cpu(X1, X2, v, kernel, out, opt):
    """Blockwise kernel-vector product.

    This function computes ``kernel(X1, X2) @ v`` in a blockwise fashion, to avoid having
    the whole N*M kernel matrix in memory at once. Note that while the principle is that
    of a matrix-vector product, `v` can have more than one column.

    Parameters
    ----------
    X1
        [N, D] array
    X2
        [M, D] array
    v
        [M, T] array
    kernel
        Class representing the desired kernel function
    out : torch.Tensor or None
        [N, T] array for storing the kernel-vector product output.
        If None, it will be allocated within the function.
    opt
        Basic options dictionary, used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)

    ntot, dtot = X1.size(0), X1.size(1)
    M, T = v.size()
    dtype = v.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only necessary memory allocation is that for the temporary kernel
    # block `temp_out` of size n*M.
    extra_mem = kernel.extra_mem()
    n, d = select_dim_over_nd(max_n=ntot, max_d=dtot,
                              coef_nd=extra_mem.get('nd', 0),
                              coef_n=M + extra_mem.get('n', 0) + extra_mem.get('nm', 0) * M,
                              coef_d=extra_mem.get('d', 0) + extra_mem.get('md', 0) * M,
                              rest=extra_mem.get('m', 0),
                              max_mem=avail_mem)

    # Run batched matrix multiplication
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)

        ddd = kernel._prepare(X1.narrow(0, i, ic), X2)  # , v=v)
        temp_out = torch.zeros(ic, M, dtype=dtype)
        for k in range(0, dtot, d):
            kc = min(d, dtot - k)
            X1d = X1[i: i + ic, k: k + kc]
            X2d = X2[:, k: k + kc]
            kernel._apply(X1d, X2d.T, temp_out)

        # temp_out = fnc(X1*X2', X1, X2)
        kernel._finalize(temp_out, ddd)

        torch.mm(temp_out, v, out=out[i: i + ic, :])
    return out

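
# Illustrative sketch (not part of the library): for small inputs, the blockwise result of
# `fmmv_cpu` should match the dense computation ``kernel(X1, X2) @ v``. The helper below is
# an assumption for illustration only; `kernel_fn` stands in for a callable that returns the
# full [N, M] kernel matrix as a tensor (the library's kernel classes work differently).
def _dense_fmmv_reference(X1, X2, v, kernel_fn):
    K = kernel_fn(X1, X2)  # N x M, fully materialized (only feasible for small N and M)
    return K @ v           # N x T kernel-vector product
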
def test_dim_over_nd_notenough(self, avail_mem):
    tot_d = 1231
    tot_n = 3700000
    with pytest.raises(MemoryError):
        select_dim_over_nd(max_n=tot_n, max_d=tot_d, coef_nd=1.2, coef_n=1.3, coef_d=1.4,
                           rest=98765, max_mem=avail_mem)
    with pytest.raises(MemoryError):
        select_dim_over_nd(max_n=tot_n, max_d=tot_d, coef_nd=avail_mem, coef_n=0, coef_d=0,
                           rest=1, max_mem=avail_mem)
    with pytest.raises(MemoryError):
        select_dim_over_nd(max_n=tot_n, max_d=tot_d, coef_nd=0, coef_n=0.01, coef_d=0,
                           rest=98765, max_mem=avail_mem)

def test_dim_over_nd(self, avail_mem):
    tot_d = 1231
    tot_n = 3700000
    n, d = select_dim_over_nd(max_n=tot_n, max_d=tot_d, coef_nd=1.2, coef_n=1.3, coef_d=1.4,
                              rest=98765, max_mem=avail_mem)
    created = n * d * 1.2 + n * 1.3 + d * 1.4 + 98765
    do_check(created, avail_mem, n, tot_n, d, tot_d)

    n, d = select_dim_over_nd(max_n=tot_n, max_d=tot_d, coef_nd=20, coef_n=1.3, coef_d=1.4,
                              rest=987650, max_mem=avail_mem)
    created = n * d * 20 + n * 1.3 + d * 1.4 + 987650
    do_check(created, avail_mem, n, tot_n, d, tot_d)

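
# A minimal sketch of the property these tests exercise. The real `do_check` helper is
# defined elsewhere in the test suite; the version below is an assumption for illustration:
# the block sizes returned by `select_dim_over_nd` should keep the modelled allocation
# ``coef_nd*n*d + coef_n*n + coef_d*d + rest`` within `max_mem`, while never exceeding the
# full problem sizes.
def _do_check_sketch(created, avail_mem, n, tot_n, d, tot_d):
    assert created <= avail_mem, "modelled memory use exceeds the available budget"
    assert 0 < n <= tot_n, "block size along N must be positive and no larger than N"
    assert 0 < d <= tot_d, "block size along D must be positive and no larger than D"
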
def fdmmv_cpu(X1, X2, v, w, kernel, out, opt):
    """Calculate a double kernel-vector product.

    This function computes the following quantity:
    ``kernel(X1, X2).T @ (kernel(X1, X2) @ v + w)``
    where one of `v` or `w` may be None.

    All arrays passed to this function must be 2-dimensional, although
    the second dimension can be unitary.

    The expression is not computed directly. We separate the computation
    into smaller blocks so as to reduce the total memory consumption (the
    large N*M kernel matrix is never wholly stored in RAM).

    Parameters
    ----------
    X1
        [N, D] array
    X2
        [M, D] array
    v : torch.Tensor or None
        [M, T] array. Note that at least one of `v` or `w` must be specified.
    w : torch.Tensor or None
        [N, T] array. Note that at least one of `v` or `w` must be specified.
    kernel
        Class representing the desired kernel function
    out : torch.Tensor or None
        [M, T] array for storing the kernel-vector product output.
        If None, it will be allocated within the function.
    opt
        Basic options dictionary, used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fMMV.")

    T = v.shape[1] if v is not None else w.shape[1]
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only necessary temporary matrices are: `temp_out` of size n*M and
    # `w_blk` of size n*T.
    extra_mem = kernel.extra_mem()
    n, d = select_dim_over_nd(max_n=ntot, max_d=dtot,
                              coef_nd=extra_mem.get('nd', 0),
                              coef_n=M + T + extra_mem.get('n', 0) + extra_mem.get('nm', 0) * M,
                              coef_d=extra_mem.get('d', 0) + extra_mem.get('md', 0) * M,
                              rest=extra_mem.get('m', 0),
                              max_mem=avail_mem)

    # Run batched matrix computation
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)

        ddd = kernel._prepare(X1[i: i + ic, :], X2)
        temp_out = torch.zeros(ic, M, dtype=dtype)
        for k in range(0, dtot, d):
            kc = min(d, dtot - k)
            X1d = X1[i: i + ic, k: k + kc]
            X2d = X2[:, k: k + kc]
            kernel._apply(X1d, X2d.T, temp_out)
        kernel._finalize(temp_out, ddd)  # fnc(X1*X2', X1, X2) [n x M]

        w_blk = torch.zeros(ic, T, dtype=dtype)  # n x T
        if w is not None:
            w_blk.copy_(w[i: i + ic, :])
        if v is not None:
            # w_blk + temp_out @ v => (n x T) + (n x M) @ (M x T)
            w_blk.addmm_(temp_out, v)

        out.add_(torch.mm(temp_out.T, w_blk))
    return out

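
# Illustrative sketch (not part of the library): the quantity computed blockwise by
# `fdmmv_cpu` is, for a dense [N, M] kernel matrix K, ``K.T @ (K @ v + w)`` with either
# `v` or `w` allowed to be None. `kernel_fn` is a hypothetical callable returning the
# dense kernel matrix; it stands in for the library's kernel classes.
def _dense_fdmmv_reference(X1, X2, v, w, kernel_fn):
    K = kernel_fn(X1, X2)                       # N x M, fully materialized
    inner = K @ v if v is not None else None    # N x T, or None when v is None
    if w is not None:
        inner = w if inner is None else inner + w
    return K.T @ inner                          # M x T
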
def distk_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.shape[1] if v is not None else w.shape[1]
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_nd(max_n=N, max_d=D,
                              coef_nd=1,
                              coef_n=M + T + 1,
                              coef_d=M,
                              rest=rest_coef + M,
                              max_mem=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream(ddev)
    s2 = tcd.Stream(ddev)

    with tcd.device(ddev), tcd.stream(s1):
        # First collect necessary memory
        mem_needed = n * M + n * T + n + M
        if not cuda_inputs:
            mem_needed += n * d + M * d
            if v is not None:
                mem_needed += M * T
        if not out.is_cuda:
            mem_needed += M * T
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed,), dtype=dtype, device=ddev)

        # Extract the sub-tensors
        flat_offset = 0
        if v is not None:
            if not cuda_inputs:
                v_gpu = extract_same_stride(flat_gpu_tn, size=(M, T), other=v, offset=flat_offset)
                flat_offset += np.prod(v_gpu.shape)
                copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
            else:
                v_gpu = v
        K_gpu = extract_same_stride(flat_gpu_tn, size=(n, M), other=X1, offset=flat_offset)
        flat_offset += np.prod(K_gpu.shape)
        Kv_gpu = extract_same_stride(flat_gpu_tn, size=(n, T), other=X1, offset=flat_offset)
        flat_offset += np.prod(Kv_gpu.shape)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = extract_same_stride(flat_gpu_tn, size=(M, T), other=out, offset=flat_offset)
            flat_offset += np.prod(out_gpu.shape)
        out_gpu.fill_(0.0)
        if not cuda_inputs:
            X1ss_gpu = extract_same_stride(flat_gpu_tn, size=(n, d), other=X1, offset=flat_offset)
            flat_offset += np.prod(X1ss_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn, size=(M, d), other=X2, offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
        sq1_gpu = extract_same_stride(flat_gpu_tn, size=(n,), other=X1, offset=flat_offset)
        flat_offset += np.prod(sq1_gpu.shape)
        sq2_gpu = extract_same_stride(flat_gpu_tn, size=(M,), other=X1, offset=flat_offset)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu[:nb]  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Need the add_(sq2_gpu.T) op to be complete to avoid overwriting sq2_gpu
                s1.synchronize()
                # Parallelize the two matrix transfers
                with tcd.stream(s2):
                    if cuda_inputs:
                        cur_X2s_gpu = X2[:, j: j + db]
                    else:
                        cur_X2s_gpu = copy_to_device_noorder(M, db, X2, 0, j, X2s_gpu, 0, 0, s=s2)
                    torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)
                if cuda_inputs:
                    cur_X1ss_gpu = X1[i: i + nb, j: j + db]
                else:
                    cur_X1ss_gpu = copy_to_device_noorder(nb, db, X1, i, j, X1ss_gpu, 0, 0, s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True, out=sq1_gpu).pow_(2)
                s2.synchronize()  # need cur_X2s_gpu and sq2_gpu to be available

                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu, mat2=cur_X2s_gpu.T, alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)

            cur_K_gpu.clamp_min_(0)
            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                cur_Kv_gpu = copy_to_device_noorder(nb, T, w, i, 0, Kv_gpu, 0, 0, s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply the transposed kernel block with the Kv result
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
        s1.synchronize()
    return out

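
# Illustrative sketch (not part of the library): `distk_fdmmv` accumulates the squared
# Euclidean distance block via the expansion ||x1 - x2||^2 = ||x1||^2 + ||x2||^2 - 2*x1.x2,
# summed over feature (column) blocks of width d. A plain CPU torch version of that
# accumulation, assuming X1 and X2 are 2D float tensors, could look like this:
def _sq_dist_by_feature_blocks(X1, X2, d):
    import torch
    N, D = X1.shape
    sqdist = torch.zeros(N, X2.shape[0], dtype=X1.dtype)
    for j in range(0, D, d):
        X1d, X2d = X1[:, j: j + d], X2[:, j: j + d]
        sqdist.addmm_(X1d, X2d.T, alpha=-2.0)            # -2 * X1d @ X2d.T
        sqdist += X1d.pow(2).sum(dim=1, keepdim=True)    # per-block ||x1||^2, broadcast over columns
        sqdist += X2d.pow(2).sum(dim=1, keepdim=True).T  # per-block ||x2||^2, broadcast over rows
    # clamp guards against small negative values caused by floating-point round-off
    return sqdist.clamp_min_(0)
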
def generic_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda
    N, D = X1.size()
    M = X2.shape[0]
    if v is None:
        T = w.shape[1]
    else:
        T = v.shape[1]

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1d  : n x d
    # X2d  : M x d
    # Kv   : n x T
    # out2 : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we leave avail_mem as it is
    #        for 32-bit data-types some copy fails. In such cases we need
    #        to free up some more memory and then everything runs fine.
    if sizeof_dtype(dtype) == 4:
        avail_mem /= 2
    rest_coef = 2 * M * T if v is not None else M * T
    extra_mem = kernel.extra_mem()
    n, d = select_dim_over_nd(
        max_n=N, max_d=D,
        coef_nd=1 + extra_mem.get('nd', 0),
        coef_n=M + T + 1 + extra_mem.get('n', 0) + extra_mem.get('nm', 0) * M,
        coef_d=M + extra_mem.get('d', 0) + extra_mem.get('md', 0) * M,
        rest=rest_coef + M + extra_mem.get('m', 0),
        max_mem=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))

    with tcd.device(ddev):
        # First collect necessary memory
        mem_needed = n * M + n * T
        if not cuda_inputs:
            mem_needed += n * d + M * d + M * T
            if v is not None:
                mem_needed += M * T
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed,), dtype=dtype, device=ddev)

        # Extract the sub-tensors
        flat_offset = 0
        ker_gpu = extract_same_stride(flat_gpu_tn, size=(n, M), other=out, offset=flat_offset)
        flat_offset += np.prod(ker_gpu.shape)
        w_gpu = extract_same_stride(flat_gpu_tn, size=(n, T), other=out, offset=flat_offset)
        flat_offset += np.prod(w_gpu.shape)
        if not cuda_inputs:
            X1s_gpu = extract_same_stride(flat_gpu_tn, size=(n, d), other=X1, offset=flat_offset)
            flat_offset += np.prod(X1s_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn, size=(M, d), other=X2, offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
            out_gpu = extract_same_stride(flat_gpu_tn, size=(M, T), other=out, offset=flat_offset)
            flat_offset += np.prod(out_gpu.shape)
            if v is not None:
                v_gpu = extract_same_stride(flat_gpu_tn, size=(M, T), other=v, offset=flat_offset)
                flat_offset += np.prod(v_gpu.shape)
                copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        else:
            out_gpu = out
            if v is not None:
                v_gpu = v
        out_gpu.fill_(0.0)

        # Algorithm start
        for i in range(0, N, n):
            ic = min(n, N - i)

            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, D, d):
                kc = min(d, D - k)
                if cuda_inputs:
                    c_g_X1s = X1[i: i + ic, k: k + kc]
                    c_g_X2s = X2[:, k: k + kc]
                else:
                    c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0, 0)
                    c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0, 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)

            if v is not None:
                c_g_w.addmm_(c_g_ker, v_gpu)
            out_gpu.addmm_(c_g_ker.T, c_g_w)

        if not cuda_inputs:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out

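
# Illustrative sketch (not part of the library): the GPU routines above allocate a single
# flat buffer and carve sub-tensors out of it at increasing offsets, instead of issuing many
# small device allocations. `extract_same_stride` is the library's helper; the plain-torch
# helper below shows the same idea, restricted (by assumption) to contiguous row-major views.
def _carve_contiguous(flat, shapes):
    views, offset = [], 0
    for shape in shapes:
        numel = 1
        for s in shape:
            numel *= s
        views.append(flat.narrow(0, offset, numel).view(shape))  # a view, no extra allocation
        offset += numel
    return views
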
def generic_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda
    ntot, dtot = X1.size()
    M, T = v.size()

    # GPU Memory Usage:
    # ker_gpu : n*M
    # v_gpu   : M*T
    # X1s_gpu : n*d
    # X2s_gpu : M*d
    # mmv_gpu : n*T
    # ----------
    # total   : n*d + n*(M+T) + d*M + M*T
    avail_mem = max_mem / sizeof_dtype(dtype)
    extra_mem = kernel.extra_mem()
    n, d = select_dim_over_nd(
        max_n=ntot, max_d=dtot,
        coef_nd=1 + extra_mem.get('nd', 0),
        coef_n=M + T + extra_mem.get('n', 0) + extra_mem.get('nm', 0) * M,
        coef_d=M + extra_mem.get('d', 0) + extra_mem.get('md', 0) * M,
        rest=M * T + extra_mem.get('m', 0),
        max_mem=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))

    with tcd.device(ddev):
        # First collect necessary memory
        mem_needed = n * M
        if not cuda_inputs:
            mem_needed += M * T + n * d + M * d + n * T
        # Create flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed,), dtype=dtype, device=ddev)

        # Extract the sub-tensors
        flat_offset = 0
        ker_gpu = extract_same_stride(flat_gpu_tn, size=(n, M), other=X1, offset=flat_offset)
        flat_offset += np.prod(ker_gpu.shape)
        if not cuda_inputs:
            X1s_gpu = extract_same_stride(flat_gpu_tn, size=(n, d), other=X1, offset=flat_offset)
            flat_offset += np.prod(X1s_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn, size=(M, d), other=X2, offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
            mmv_gpu = extract_same_stride(flat_gpu_tn, size=(n, T), other=out, offset=flat_offset)
            flat_offset += np.prod(mmv_gpu.shape)
            v_gpu = extract_same_stride(flat_gpu_tn, size=(M, T), other=v, offset=flat_offset)
            flat_offset += np.prod(v_gpu.shape)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        else:
            v_gpu = v

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, dtot, d):
                kc = min(d, dtot - k)
                if cuda_inputs:
                    c_g_X1s = X1[i: i + ic, k: k + kc]
                    c_g_X2s = X2[:, k: k + kc]
                else:
                    c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0, 0)
                    c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0, 0)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)

            # Multiply by the vector v
            if cuda_inputs:
                c_g_mmv = out[i: i + ic, :]
            else:
                c_g_mmv = mmv_gpu[:ic, :]
            torch.mm(c_g_ker, v_gpu, out=c_g_mmv)  # n x T

            # Copy the block result back to host
            if not cuda_inputs:
                copy_to_host_noorder(ic, T, c_g_mmv, 0, 0, out, i, 0)
    return out

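
# Illustrative sketch (not part of the library): ignoring the kernel's `extra_mem` terms,
# the memory model passed to `select_dim_over_nd` in `generic_fmmv` corresponds to the
# buffers listed in its comment. A hypothetical helper making that accounting explicit
# (element counts, not bytes):
def _fmmv_gpu_elements(n, d, M, T):
    return (n * M      # ker_gpu: kernel block
            + n * d    # X1s_gpu: X1 block
            + M * d    # X2s_gpu: X2 block
            + n * T    # mmv_gpu: per-block result
            + M * T)   # v_gpu: right-hand side
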