def main():
    """Profile the GPU ``gauss_spline`` against ``scipy.signal.gauss_spline``.

    The number of repeated GPU passes is read from the first command-line
    argument. One CPU run and one GPU run are each wrapped in a profiler
    range and their results compared; the GPU run is then repeated inside
    its own profiler range once per requested pass.

    Raises:
        IndexError: if no command-line argument is given.
        ValueError: if the argument is not an integer.
        AssertionError: if the CPU and GPU results disagree.
    """
    # Number of repeated GPU passes, taken from the command line.
    num_passes = int(sys.argv[1])

    # Second argument passed to both gauss_spline implementations.
    spline_n = np.random.randint(0, 1234)

    sample_count = 2**16
    host_sig, device_sig = rand_data_gen_gpu(sample_count)

    # Reference result from SciPy.
    with prof.time_range("scipy_gauss_spline", 0):
        host_result = signal.gauss_spline(host_sig, spline_n)

    # Single GPU run whose output feeds the comparison below.
    with prof.time_range("cupy_gauss_spline", 1):
        device_result = gauss_spline(device_sig, spline_n)

    # Third positional argument of assert_allclose is the relative tolerance.
    np.testing.assert_allclose(
        host_result, cp.asnumpy(device_result), 1e-3
    )

    # Repeat the GPU run so the profiler sees several passes.
    for _ in range(num_passes):
        with prof.time_range("cupy_gauss_spline_loop", 2):
            device_result = gauss_spline(device_sig, spline_n)
            # NOTE(review): the source layout was collapsed onto one line;
            # the synchronize is kept inside the timed range -- confirm.
            cp.cuda.runtime.deviceSynchronize()
def main():
    """Profile ``cupy_signal`` against the EWK ``signal`` implementation.

    The number of repeated passes is read from the first command-line
    argument. A random complex signal is built on the host and copied to
    the GPU, each implementation is run once inside a profiler range, the
    four outputs are compared pairwise, and then each implementation is
    run repeatedly inside its own profiler range.

    Raises:
        IndexError: if no command-line argument is given.
        ValueError: if the argument is not an integer.
        AssertionError: if any pair of outputs disagrees.
    """
    num_passes = int(sys.argv[1])
    sample_count = 2**16

    # Real part is drawn first, imaginary part second.
    real_part = np.random.rand(sample_count)
    imag_part = np.random.rand(sample_count)
    cpu_sig = real_part + 1.0j * imag_part
    gpu_sig = cp.array(cpu_sig)

    # Baseline implementation.
    with prof.time_range("CuPy signal", 0):
        amp, phase, real, imag = cupy_signal(gpu_sig)

    # EWK implementation.
    with prof.time_range("EWK signal", 1):
        amp_EWK, phase_EWK, real_EWK, imag_EWK = signal(gpu_sig)

    # Each baseline output must match its EWK counterpart (rtol = 1e-3).
    baseline_outputs = (amp, phase, real, imag)
    ewk_outputs = (amp_EWK, phase_EWK, real_EWK, imag_EWK)
    for baseline_out, ewk_out in zip(baseline_outputs, ewk_outputs):
        cp.testing.assert_allclose(baseline_out, ewk_out, 1e-3)

    # NOTE(review): the source layout was collapsed onto one line; in both
    # loops the synchronize is kept inside the timed range -- confirm.

    # Repeated passes of the baseline.
    for _ in range(num_passes):
        with prof.time_range("cupy_signal_avg", 2):
            amp, phase, real, imag = cupy_signal(gpu_sig)
            cp.cuda.runtime.deviceSynchronize()

    # Repeated passes of the EWK version.
    for _ in range(num_passes):
        with prof.time_range("ewk_signal_avg", 3):
            amp_EWK, phase_EWK, real_EWK, imag_EWK = signal(gpu_sig)
            cp.cuda.runtime.deviceSynchronize()
def main():
    """Profile ``scipy.signal.gauss_spline`` on the CPU only.

    The number of repeated passes is read from the first command-line
    argument. The function runs once inside a profiler range and then once
    per requested pass inside a second range; both ranges use colour id 0.

    Raises:
        IndexError: if no command-line argument is given.
        ValueError: if the argument is not an integer.
    """
    num_passes = int(sys.argv[1])

    # Second argument passed to gauss_spline.
    spline_n = np.random.randint(0, 1234)

    sample_count = 2**16
    host_sig = rand_data_gen(sample_count)

    # Single baseline run.
    with prof.time_range("scipy_gauss_spline", 0):
        host_result = signal.gauss_spline(host_sig, spline_n)

    # Repeat the run so the profiler sees several passes.
    for _ in range(num_passes):
        with prof.time_range("scipy_gauss_spline_loop", 0):
            host_result = signal.gauss_spline(host_sig, spline_n)
def test_time_range(self):
    """``prof.time_range`` pushes one range with its colour id and pops it."""
    # Replace the NVTX entry points with mocks for the duration of the test.
    with mock.patch('cupy.cuda.nvtx.RangePush') as push, \
            mock.patch('cupy.cuda.nvtx.RangePop') as pop:
        with prof.time_range('test:time_range', color_id=-1):
            pass
        # Exactly one push carrying the message and colour id, one bare pop.
        push.assert_called_once_with('test:time_range', -1)
        pop.assert_called_once_with()
def update(self, lossfun=None, *args, **kwds):
    """Run the wrapped optimizer's ``update`` inside an 'iteration' range.

    Args:
        lossfun: Forwarded unchanged to ``self.actual_optimizer.update``.
        *args: Extra positional arguments forwarded to the wrapped update.
        **kwds: Extra keyword arguments forwarded to the wrapped update.

    Returns:
        Whatever ``self.actual_optimizer.update`` returns.
    """
    # Always bind iter_sync. Previously it was assigned only when
    # self._sync was truthy, so a non-syncing optimizer hit
    # UnboundLocalError on the time_range call below.
    iter_sync = bool(self._sync)
    with prof.time_range('iteration', sync=iter_sync,
                         argb_color=_itr_argb_color):
        ret = self.actual_optimizer.update(lossfun, *args, **kwds)
    return ret
def test_time_range_with_ARGB(self):
    """An ``argb_color`` range goes through ``RangePushC`` and is popped."""
    range_name = 'test:time_range_with_ARGB'
    # Replace the NVTX entry points with mocks for the duration of the test.
    with mock.patch('cupy.cuda.nvtx.RangePushC') as push, \
            mock.patch('cupy.cuda.nvtx.RangePop') as pop:
        with prof.time_range(range_name, argb_color=0xFF00FF00):
            pass
        # The ARGB value must reach RangePushC unchanged.
        push.assert_called_once_with(range_name, 0xFF00FF00)
        pop.assert_called_once_with()
def test_time_range_with_ARGB(self):
    """Check the ARGB-coloured variant of ``prof.time_range``.

    With ``argb_color`` given, the range is expected to be opened via
    ``RangePushC`` (called once with the message and colour) and closed
    with a single argument-less ``RangePop``.
    """
    argb = 0xFF00FF00
    pushc_patcher = mock.patch('cupy.cuda.nvtx.RangePushC')
    pop_patcher = mock.patch('cupy.cuda.nvtx.RangePop')
    with pushc_patcher as push, pop_patcher as pop:
        with prof.time_range('test:time_range_with_ARGB', argb_color=argb):
            pass
        push.assert_called_once_with('test:time_range_with_ARGB', argb)
        pop.assert_called_once_with()
def test_time_range_err(self):
    """The range is still popped when the body of ``time_range`` raises."""
    range_name = 'test:time_range_error'
    with mock.patch('cupy.cuda.nvtx.RangePush') as push, \
            mock.patch('cupy.cuda.nvtx.RangePop') as pop:
        # The exception is swallowed on purpose: the test only cares that
        # push and pop were each called once despite the error.
        try:
            with prof.time_range(range_name, -1):
                raise Exception()
        except Exception:
            pass
        push.assert_called_once_with(range_name, -1)
        pop.assert_called_once_with()
def backward(self, *args, **kwargs):
    """Run the wrapped variable's ``backward`` inside profiler ranges.

    The call is wrapped in a 'model.backward' range and a
    ``FwdBwdProfileMarkHook``. Each of the two is asked to synchronize only
    when ``self._sync`` is truthy and ``self._sync_level`` reaches the
    matching ``SyncLevel`` threshold.

    Args:
        *args: Positional arguments forwarded to ``self._variable.backward``.
        **kwargs: Keyword arguments forwarded to ``self._variable.backward``.

    Returns:
        Whatever ``self._variable.backward`` returns.
    """
    if self._sync:
        # Outer range syncs from SECOND upwards, the hook only at FINEST.
        bwd_sync = (self._sync_level >= SyncLevel.SECOND)
        bwd_each_sync = (self._sync_level >= SyncLevel.FINEST)
    else:
        bwd_sync = bwd_each_sync = False

    outer_range = prof.time_range('model.backward', sync=bwd_sync,
                                  argb_color=_bwd_argb_color)
    with outer_range:
        with FwdBwdProfileMarkHook(sync=bwd_each_sync,
                                   argb_color=_bwd_argb_color):
            ret = self._variable.backward(*args, **kwargs)
    return ret
def forward_wrapper(*args, **kwargs):
    """Call the link's original forward inside profiler ranges.

    Optionally opens an 'iteration' range first, then runs
    ``link._org_forward`` inside a 'model.forward' range and a
    ``FwdBwdProfileMarkHook``, and wraps the resulting loss.

    Args:
        *args: Positional arguments forwarded to ``link._org_forward``.
        **kwargs: Keyword arguments forwarded to ``link._org_forward``.

    Returns:
        A ``_VariableWrapper`` around the loss, carrying ``sync`` and
        ``sync_level``.
    """
    # NOTE(review): this range is pushed but not popped in this function;
    # presumably it is closed elsewhere -- confirm against the caller.
    if seprately_mark_for_iter and sync_level >= SyncLevel.COARSEST:
        range_push(sync, 'iteration', _itr_argb_color)

    if sync:
        # Outer range syncs from SECOND upwards, the hook only at FINEST.
        fwd_sync = (sync_level >= SyncLevel.SECOND)
        fwd_each_sync = (sync_level >= SyncLevel.FINEST)
    else:
        fwd_sync = fwd_each_sync = False

    outer_range = prof.time_range('model.forward', sync=fwd_sync,
                                  argb_color=_fwd_argb_color)
    with outer_range:
        with FwdBwdProfileMarkHook(sync=fwd_each_sync,
                                   argb_color=_fwd_argb_color):
            loss = link._org_forward(*args, **kwargs)
    return _VariableWrapper(loss, sync, sync_level)
(libcudnn.CUDNN_PARAM_WDATA_PLACEHOLDER, ptr_ph), (libcudnn.CUDNN_PARAM_YDESC, y_desc), (libcudnn.CUDNN_PARAM_YDATA_PLACEHOLDER, ptr_ph), (libcudnn.CUDNN_PARAM_YSTATS_DESC, ysum_desc), (libcudnn.CUDNN_PARAM_YSUM_PLACEHOLDER, ptr_ph), (libcudnn.CUDNN_PARAM_YSQSUM_PLACEHOLDER, ptr_ph))) workspace_size = cudnn.make_fused_ops_plan(plan, const_pack) workspace = cupy.empty((workspace_size,), dtype=numpy.int8) # print('workspace_size: {}'.format(workspace_size)) var_pack = cudnn.create_fused_ops_variant_param_pack( ops, ((libcudnn.CUDNN_PTR_XDATA, x), (libcudnn.CUDNN_PTR_BN_EQSCALE, scale), (libcudnn.CUDNN_PTR_BN_EQBIAS, bias), (libcudnn.CUDNN_PTR_WDATA, w), (libcudnn.CUDNN_PTR_YDATA, y), (libcudnn.CUDNN_PTR_YSUM, ysum), (libcudnn.CUDNN_PTR_YSQSUM, ysqsum), (libcudnn.CUDNN_PTR_WORKSPACE, workspace), (libcudnn.CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, workspace_size))) with prof.time_range('fusedOpsExecute', color_id=1, sync=True): cudnn.fused_ops_execute(plan, var_pack) # print('per-channel ysum:\n{}'.format(ysum.reshape((y_c)))) print('per-channel ysqsum:\n{}'.format(ysqsum.reshape(y_c)))
x = x[r >= frac_points] y = A * np.cos(w * x + phi) f = np.linspace(0.01, 10, out_samps) # Use float32 if b32 passed if dtype == 'float32': x = x.astype(np.float32) y = y.astype(np.float32) f = f.astype(np.float32) d_x = cp.array(x) d_y = cp.array(y) d_f = cp.array(f) # Run baseline with scipy.signal.lombscargle with prof.time_range("scipy_lombscargle", 0): cpu_lombscargle = signal.lombscargle(x, y, f) # Run Numba version with prof.time_range("cupy_lombscargle", 1): gpu_lombscargle = lombscargle(d_x, d_y, d_f) # Copy result to host gpu_lombscargle = cp.asnumpy(gpu_lombscargle) # Compare results np.testing.assert_allclose(cpu_lombscargle, gpu_lombscargle, 1e-3) # Run multiple passes to get average for _ in range(loops): with prof.time_range("cupy_lombscargle_loop", 2):
def divide_chunks(l, n):
    """Yield consecutive slices of ``l``, each at most ``n`` items long.

    Args:
        l: A sliceable sequence that supports ``len``.
        n: Step between slice starts, which is also the slice length.

    Yields:
        ``l[start:start + n]`` for ``start`` in ``0, n, 2n, ...`` below
        ``len(l)``. The final slice may be shorter than ``n``.
    """
    total = len(l)
    for start in range(0, total, n):
        stop = start + n
        yield l[start:stop]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--imageFolder", help="Image folder location", default=[])
    # The remaining options are all integers that default to 1.
    for flag, description in (
            ("--num_gpus", "Number of GPUs"),
            ("--num_read_processes", "Number of Read Processes (DALI)"),
            ("--batch_size", "Read batch size"),
    ):
        parser.add_argument(flag, help=description, default=1, type=int)
    args = parser.parse_args()

    # Wrap the whole run in a single profiler range.
    with prof.time_range("run", 0):
        run(args)
def test_time_range(self):
    """Using ``prof.time_range`` here is expected to raise ``RuntimeError``."""
    # NOTE(review): presumably this runs in a configuration where the
    # profiler backend is unavailable -- confirm against the test setup.
    with self.assertRaises(RuntimeError), prof.time_range(''):
        pass