def test_no_overwrite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
    """Run ``gpu_lauum`` with ``overwrite=False`` and check the input is preserved.

    For both ``upper=True`` and ``upper=False`` the result is compared with
    ``expected_upper`` / ``expected_lower`` (relative tolerance taken from
    ``self.rtol[dtype]``), and the input matrix is then compared against a
    second matrix obtained from ``get_mat`` with the same ``order``/``dtype``.
    """
    reference = get_mat(order=order, dtype=dtype)
    mat = get_mat(order=order, dtype=dtype, device=device)

    # For cuda inputs we must add to the available GPU memory the amount used
    # by the input matrix, since overwrite=False and a full copy must be
    # performed.
    if device.startswith("cuda"):
        input_bytes = mat.shape[0] ** 2 * sizeof_dtype(mat.dtype)
        mgpu_slack = self.basic_opt.max_gpu_mem + input_bytes
    else:
        mgpu_slack = 0

    cases = (
        (True, expected_upper),
        (False, expected_lower),
    )
    for upper, expected in cases:
        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            result = gpu_lauum(mat, upper=upper, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected, result.cpu().numpy(), rtol=self.rtol[dtype])
        # The input must still match the reference matrix.
        np.testing.assert_allclose(reference, mat.cpu())
def test_write_opposite(self, dtype, order, get_mat, expected_lower, expected_upper):
    """Check ``gpu_lauum`` with ``overwrite=True`` and ``write_opposite=True``.

    For each triangle the test asserts that:

    * the strict triangle on the ``upper`` side of the output (diagonal
      excluded) equals the same strict triangle of the input, and
    * the opposite triangle of the output (diagonal included) equals the
      transpose of the expected result's triangle.
    """
    original = get_mat(order=order, dtype=dtype).numpy()
    tolerance = self.rtol[dtype]

    # Each case: (upper flag, extractor for the preserved side, extractor for
    # the written side, diagonal offset making the preserved side strict,
    # expected result).
    cases = (
        (True, np.triu, np.tril, 1, expected_upper),
        (False, np.tril, np.triu, -1, expected_lower),
    )
    for upper, kept_side, written_side, strict_k, expected in cases:
        # Work on a fresh copy since the call is made with overwrite=True.
        work = torch.from_numpy(original.copy(order="K"))
        with memory_checker(self.basic_opt) as new_opt:
            result = gpu_lauum(
                work, upper=upper, overwrite=True, write_opposite=True, opt=new_opt)
        result_np = result.numpy()

        np.testing.assert_allclose(
            kept_side(original, k=strict_k), kept_side(result_np, k=strict_k), rtol=tolerance)
        np.testing.assert_allclose(
            written_side(result_np), kept_side(expected).T, rtol=tolerance)
def test_overwrite(self, dtype, order, get_mat, expected_lower, expected_upper):
    """Check the output of ``gpu_lauum`` when called with ``overwrite=True``.

    A fresh copy of the matrix from ``get_mat`` is built for each of the
    ``upper=True`` and ``upper=False`` runs; the returned tensor is compared
    with ``expected_upper`` / ``expected_lower`` using ``self.rtol[dtype]``.
    """
    for upper, expected in ((True, expected_upper), (False, expected_lower)):
        # New input for every run: the call is allowed to overwrite it.
        mat = get_mat(order=order, dtype=dtype).numpy().copy(order="K")
        with memory_checker(self.basic_opt) as new_opt:
            result = gpu_lauum(
                torch.from_numpy(mat), upper=upper, overwrite=True, opt=new_opt)
        np.testing.assert_allclose(expected, result.numpy(), rtol=self.rtol[dtype])
def run_potrf_test(np_data, dtype, order, opt, start_cuda, upper, clean, overwrite):
    """Compare ``gpu_cholesky`` against the LAPACK routine chosen for ``dtype``.

    Besides numerical agreement, the test asserts that the returned tensor has
    the same strides as the input and, when ``overwrite`` is true, the same
    data pointer.

    Args:
        np_data: Input array; copied with the requested memory ``order`` and
            cast to ``dtype``.
        dtype: Data type used for the computation and to pick the LAPACK
            function and tolerance via ``choose_on_dtype``.
        order: Memory order passed to ``np.copy``.
        opt: Options object handed to ``memory_checker``.
        start_cuda: If true the input tensor is moved with ``.cuda()`` first.
        upper, clean, overwrite: Flags forwarded to ``gpu_cholesky`` and, in
            converted form, to the LAPACK function.
    """
    # Convert pd_data to the appropriate form
    data = np.copy(np_data, order=order).astype(dtype, copy=False)
    lapack_fn, rtol = choose_on_dtype(dtype)

    # The tensor wraps a copy, so `data` stays available for the LAPACK call.
    tensor = torch.from_numpy(data.copy(order="K"))
    tensor = tensor.cuda() if start_cuda else tensor

    stride_before = tensor.stride()
    ptr_before = tensor.data_ptr()

    with memory_checker(opt) as new_opt:
        C_gpu = gpu_cholesky(
            tensor, upper=upper, clean=clean, overwrite=overwrite, opt=new_opt)

    assert stride_before == C_gpu.stride(), "gpu_potrf modified matrix stride."
    if overwrite:
        assert ptr_before == C_gpu.data_ptr(), "Data-pointer changed although overwrite is True."

    lapack_result = lapack_fn(
        data, lower=int(not upper), clean=int(clean), overwrite_a=int(overwrite))
    C_cpu = lapack_result[0]
    np.testing.assert_allclose(C_cpu, C_gpu.cpu().numpy(), rtol=rtol, verbose=True)
def _run_test(fn, exp, tensors, out, rtol, opt):
    """Call ``fn`` under ``memory_checker`` and validate its result.

    Args:
        fn: Callable invoked as ``fn(*tensors, out=out, opt=new_opt)``.
        exp: Expected values for the accuracy check.
        tensors: Positional arguments unpacked into ``fn``.
        out: Optional output tensor; when given, the result must share its
            data pointer.
        rtol: Relative tolerance for ``np.testing.assert_allclose``.
        opt: Options object handed to ``memory_checker``.
    """
    with memory_checker(opt) as new_opt:
        result = fn(*tensors, out=out, opt=new_opt)

    # Check 1. Accuracy
    np.testing.assert_allclose(exp, result, rtol=rtol)

    # Check 2. Output pointers (only meaningful when an output was supplied)
    if out is None:
        return
    assert out.data_ptr() == result.data_ptr(), "Output data tensor was not used"
def test_write_opposite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
    """Check ``gpu_lauum`` with ``write_opposite=True`` on the given device.

    The ``upper=True`` run uses ``overwrite=False`` and gives the memory
    checker extra room equal to the size of the input matrix; the
    ``upper=False`` run uses ``overwrite=True`` with no extra room.

    In both runs the strict triangle on the ``upper`` side of the output must
    equal that of the reference matrix, and the opposite triangle (diagonal
    included) must equal the transposed triangle of the expected result.
    """
    reference = get_mat(order=order, dtype=dtype)
    mat = get_mat(order=order, dtype=dtype, device=device)
    tolerance = self.rtol[dtype]

    # --- upper=True, overwrite=False ---
    # Extra memory allowance: number of bytes in the (square) input matrix.
    input_bytes = mat.shape[0] ** 2 * sizeof_dtype(mat.dtype)
    with memory_checker(self.basic_opt, extra_mem=input_bytes) as new_opt:
        upper_out = gpu_lauum(
            mat, upper=True, overwrite=False, write_opposite=True, opt=new_opt)
    upper_np = upper_out.cpu().numpy()
    np.testing.assert_allclose(
        np.triu(reference, k=1), np.triu(upper_np, k=1), rtol=tolerance)
    np.testing.assert_allclose(
        np.tril(upper_np), np.triu(expected_upper).T, rtol=tolerance)

    # --- upper=False, overwrite=True ---
    mat = get_mat(order=order, dtype=dtype, device=device)
    with memory_checker(self.basic_opt) as new_opt:
        lower_out = gpu_lauum(
            mat, upper=False, overwrite=True, write_opposite=True, opt=new_opt)
        torch.cuda.synchronize()
    lower_np = lower_out.cpu().numpy()
    np.testing.assert_allclose(
        np.tril(reference, k=-1), np.tril(lower_np, k=-1), rtol=tolerance)
    np.testing.assert_allclose(
        np.triu(lower_np), np.tril(expected_lower).T, rtol=tolerance)
def _run_fmmv_test(fn, exp, tensors, out, rtol, opt):
    """Call ``fn`` under ``memory_checker`` with a relaxed CPU memory limit.

    Args:
        fn: Callable invoked as ``fn(*tensors, out=out, opt=new_opt)``.
        exp: Expected values for the accuracy check.
        tensors: Positional arguments unpacked into ``fn``.
        out: Optional output tensor; when given, the result must share its
            data pointer.
        rtol: Relative tolerance for ``np.testing.assert_allclose``.
        opt: Options dataclass; a copy with an adjusted ``max_cpu_mem`` is
            what actually reaches ``memory_checker``.
    """
    # TODO: On some systems (nest but not sperone), checking memory
    # usage for CPU functions fails miserably due to inconsistent
    # memory numbers being reported at random. We simply replace CPU
    # with a high number to avoid checking.
    if opt.use_cpu:
        extra_mem = 10 * 2**30
    else:
        extra_mem = 0
    relaxed_opt = dataclasses.replace(opt, max_cpu_mem=opt.max_cpu_mem + extra_mem)

    with memory_checker(relaxed_opt) as new_opt:
        result = fn(*tensors, out=out, opt=new_opt)

    # Check 1. Accuracy
    np.testing.assert_allclose(exp, result.cpu(), rtol=rtol)

    # Check 2. Output pointers (only meaningful when an output was supplied)
    if out is None:
        return
    assert out.data_ptr() == result.data_ptr(), "Output data tensor was not used"
def _run_fmm_test(k_class, k_exp, A, B, out, dtype, rtol, opt):
    """Call ``k_class(A, B, out=..., opt=...)`` and compare it with ``k_exp``.

    Any of ``A``, ``B`` and ``out`` given as a NumPy array is cast to ``dtype``
    and wrapped as a torch tensor first; other values (including ``out=None``)
    are passed through unchanged.

    Args:
        k_class: Callable under test.
        k_exp: Expected result.
        A, B: Input matrices (NumPy arrays or already-converted objects).
        out: Optional output buffer; when given, the result must share its
            data pointer.
        dtype: NumPy dtype used when converting array inputs.
        rtol: Relative tolerance for ``np.testing.assert_allclose``.
        opt: Options object handed to ``memory_checker``.
    """
    def _to_tensor(value):
        # Only NumPy arrays are converted; None and tensors are left as-is.
        if isinstance(value, np.ndarray):
            return torch.from_numpy(value.astype(dtype, copy=False))
        return value

    A, B, out = _to_tensor(A), _to_tensor(B), _to_tensor(out)

    with memory_checker(opt) as new_opt:
        result = k_class(A, B, out=out, opt=new_opt)

    np.testing.assert_allclose(k_exp, result, rtol=rtol)

    # Check output pointers
    if out is None:
        return
    assert out.data_ptr() == result.data_ptr(), "Output data tensor was not used"