def run_cudnn_lstm(cpu=0, gpu=0, batch_size=1, input_size=256, hidden_size=512,
                   layers=1, seq_len=512, warmup=10, benchmark=30,
                   backward=False, skip_cpu_governor_check=False):
    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    def V(x):
        return Variable(x)  # mandatory

    input = V(torch.randn(seq_len, batch_size, input_size).cuda(gpu))
    hx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
    cx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))

    lstm = torch.nn.LSTM(input_size, hidden_size, layers).cuda(gpu)
    lstm.flatten_parameters()

    iter_timer = Bench(name='lstm_cudnn', cuda=True, warmup_iters=warmup)

    for i in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            hx_t, cx_t = lstm(input, (hx, cx))
            if backward:
                hx_t.sum().backward()

    return iter_timer

def run_sru(cpu=0, gpu=0, jit=False, use_kernel=False, backward=False,
            warmup=10, benchmark=20):
    assert not (jit and use_kernel)
    benchmark_init(0, 0, True)

    # input has length 20, batch size 32 and dimension 128
    x = Variable(torch.rand(20, 32, 128).cuda())
    input_size, hidden_size = 128, 128

    rnn = SRU(input_size, hidden_size,
              num_layers=2,          # number of stacking RNN layers
              dropout=0.00001,       # dropout applied between RNN layers
              rnn_dropout=0.0001,    # variational dropout applied on linear transformation
              use_tanh=1,            # use tanh?
              use_relu=0,            # use ReLU?
              bidirectional=False,   # bidirectional RNN?
              use_kernel=use_kernel,
              jit=jit)
    rnn.cuda()

    kernel_tag = '_kernel' if use_kernel else ''
    backward_tag = '_training' if backward else '_forward'
    jit_tag = '_jit' if jit else ''
    name = 'sru{}{}{}'.format(backward_tag, kernel_tag, jit_tag)
    iter_timer = Bench(cuda=True, name=name, warmup_iters=warmup)

    for _ in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            output, hidden = rnn(x)  # forward pass
            if backward:
                output.sum().backward()
            # output is (length, batch size, hidden size * number of directions)
            # hidden is (layers, batch size, hidden size * number of directions)

    return iter_timer

def run_qrnn(batch_size=20, input_size=128, seq_len=20, warmup=10, benchmark=10,
             hidden_size=256, num_layers=10, use_kernel=False, jit=False,
             cuda=False):
    assert not (use_kernel and jit)
    if use_kernel:
        assert cuda

    benchmark_init(0, 0, True)

    name = 'qrnn{}{}{}'.format(tag(cuda=cuda), tag(jit=jit), tag(kernel=use_kernel))
    iter_timer = Bench(name=name, cuda=cuda, warmup_iters=warmup)
    niters = warmup + benchmark
    size = (seq_len, batch_size, input_size)

    if cuda:
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    batches = [torch.rand(size, requires_grad=True, device=device)
               for _ in range(niters)]
    qrnn = QRNN(input_size, hidden_size, num_layers=num_layers, dropout=0.4,
                use_kernel=use_kernel, jit=jit).to(device)

    for X in batches:
        gc.collect()
        with iter_timer:
            output, hidden = qrnn(X)
            output.sum().backward()

    return iter_timer

def run_tensor(broadcast=True):
    benchmark_init(0, 0, False)

    d = torch.zeros(1000, 1000)
    e = torch.zeros(1)

    def time_broadcast():
        d * e

    def time_no_broadcast():
        d * d

    if broadcast:
        fn = time_broadcast
    else:
        fn = time_no_broadcast

    name = "mul_bcast" if broadcast else "mul_no_bcast"
    iter_timer = Bench(name=name, cuda=False, warmup_iters=2)

    for _ in range(20):
        with iter_timer:
            fn()

    return iter_timer

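# Hypothetical comparison sketch (not in the original source): run the broadcast
# and non-broadcast elementwise-multiply benchmarks defined above back to back.
# Both calls simply reuse run_tensor and return its Bench timer, which is assumed
# to aggregate the per-iteration timings recorded in the `with iter_timer:` block.
def compare_mul_broadcast():
    bcast_timer = run_tensor(broadcast=True)       # 1000x1000 * 1-element tensor
    no_bcast_timer = run_tensor(broadcast=False)   # 1000x1000 * 1000x1000 tensor
    return bcast_timer, no_bcast_timer
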
def run_lstm(cpu=0, gpu=0, batch_size=1, input_size=256, hidden_size=512,
             seq_len=None, warmup=10, benchmark=20, autograd=False,
             variable=False, fused=False, jit=False, backward=False,
             skip_cpu_governor_check=False):
    if jit:
        autograd = True

    if backward:
        autograd = True

    if seq_len is None:
        if backward:
            seq_len = 32
        else:
            seq_len = 512

    assert not (jit and fused)
    assert not (variable and autograd)

    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    if variable:
        V = lambda x, requires_grad=False: Variable(x, requires_grad=False)
    elif autograd:
        V = lambda x, requires_grad=False: Variable(x, requires_grad=requires_grad)
    else:
        V = lambda x, requires_grad=False: x

    input = V(torch.randn(batch_size, input_size).cuda(device=gpu))
    hx0 = V(torch.randn(batch_size, hidden_size).cuda(device=gpu), requires_grad=True)
    cx0 = V(torch.randn(batch_size, hidden_size).cuda(device=gpu), requires_grad=True)
    w_ih = V(t_def(torch.randn(4 * hidden_size, input_size)).cuda(device=gpu), requires_grad=True)
    w_hh = V(t_def(torch.randn(4 * hidden_size, hidden_size)).cuda(device=gpu), requires_grad=True)

    if fused:
        if backward:
            print("using fused_autograd_lstm")
            lstm = fused_autograd_lstm
        else:
            print("using fused_forward_lstm")
            lstm = fused_lstm
    elif jit:
        print("tracing an unfused lstm")
        lstm = wrap_hidden(torch.jit.trace(input, hx0, cx0, w_ih, w_hh)(_unfused_lstm))
    else:
        print("using unfused lstm")
        lstm = wrap_hidden(_unfused_lstm)

    name = 'lstm_cuda{}{}{}'.format(tag(autograd=autograd), tag(fused=fused), tag(jit=jit))
    iter_timer = Bench(name=name, cuda=True, warmup_iters=warmup)

    for i in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            hx, cx = hx0, cx0
            for j in range(seq_len):
                hx, cx = lstm(input, (hx, cx), w_ih, w_hh)
            if backward:
                hx.sum().backward()

    return iter_timer

def run_mlstm(cpu=0, gpu=0, batch_size=1, input_size=205, hidden_size=1900,
              embed_size=None, seq_len=20, warmup=10, benchmark=20,
              autograd=False, jit=False, backward=False,
              skip_cpu_governor_check=False):
    name = "mlstm_jit" if jit else "mlstm"
    iter_timer = Bench(name=name, cuda=True, warmup_iters=warmup)

    if embed_size is None:
        embed_size = hidden_size

    if jit or backward:
        autograd = True

    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    requires_grad = autograd
    device = torch.device(gpu)

    input = torch.randn(seq_len, batch_size, input_size,
                        requires_grad=requires_grad, device=device)
    hx = torch.randn(batch_size, hidden_size, requires_grad=requires_grad, device=device)
    cx = torch.randn(batch_size, hidden_size, requires_grad=requires_grad, device=device)
    w_xm = torch.randn(embed_size, input_size, requires_grad=requires_grad, device=device)
    w_hm = torch.randn(embed_size, hidden_size, requires_grad=requires_grad, device=device)
    w_ih = torch.randn(4 * hidden_size, input_size, requires_grad=requires_grad, device=device)
    w_mh = torch.randn(4 * hidden_size, embed_size, requires_grad=requires_grad, device=device)
    params = [input, hx, cx, w_xm, w_hm, w_ih, w_mh]

    if jit:
        mlstm = torch.jit.trace(input[0], hx, cx, w_xm, w_hm, w_ih, w_mh)(mlstm_raw)
    else:
        mlstm = mlstm_raw

    for _ in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            hx_t = hx
            cx_t = cx
            for j in range(seq_len):
                hx_t, cx_t = mlstm(input[j], hx_t, cx_t, w_xm, w_hm, w_ih, w_mh)
            if backward:
                hx_t.sum().backward()
                for param in params:
                    param.grad.zero_()

    return iter_timer
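
# Hypothetical driver (not part of the original harness): invoke a few of the
# benchmarks above with their default settings. Assumes a CUDA-capable GPU for
# the LSTM/mLSTM cases; each run_* function returns its Bench timer, which is
# assumed to hold the per-iteration timings recorded in the loops above.
if __name__ == '__main__':
    timers = [
        run_tensor(broadcast=True),
        run_tensor(broadcast=False),
        run_cudnn_lstm(backward=True),
        run_mlstm(backward=True),
    ]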