import time

import numpy as np
import chainer
from chainer import cuda, functions

# SRU, autograd and gpu_device are assumed to be defined elsewhere in this
# test module (the repository's SRU implementation and its device setting).


def benchmark_chainer_sru(batchsize, seq_length, feature_dimension, repeat=50):
    layer = SRU(feature_dimension)
    x_data = np.random.normal(
        0, 1, size=(batchsize, feature_dimension, seq_length)).astype(np.float32)
    x_data = cuda.to_gpu(x_data)
    layer.to_gpu()

    # forward
    # Note: `with A() and B():` only enters the second context manager (the
    # `and` expression evaluates to B), so the two must be comma-separated.
    with chainer.no_backprop_mode(), chainer.using_config("train", False):
        start_time = time.time()
        for i in range(repeat):
            output, cell, last_cell = layer(x_data, None)
        forward_time_mean = (time.time() - start_time) / repeat

    # backward
    with chainer.using_config("train", True):
        start_time = time.time()
        for i in range(repeat):
            output, cell, last_cell = layer(x_data, None)
            layer.cleargrads()
            functions.sum(output).backward()
        backward_time_mean = (time.time() - start_time) / repeat

    return forward_time_mean, backward_time_mean
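# A minimal driver sketch for the benchmark above, assuming a CUDA device is
# available. The parameter grid, the fixed feature dimension, and the print
# format are illustrative assumptions, not part of the original harness.
def run_sru_benchmarks():
    for batchsize in [16, 32]:
        for seq_length in [32, 64]:
            forward_mean, backward_mean = benchmark_chainer_sru(
                batchsize, seq_length, feature_dimension=128)
            print("batch={}\tseq={}\tforward={:.6f}s\tbackward={:.6f}s".format(
                batchsize, seq_length, forward_mean, backward_mean))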
def check_dropout_backward(batchsize, feature_dimension, seq_length, use_tanh):
    x_cpu_data = np.random.normal(
        0, 1, size=(batchsize, feature_dimension, seq_length * 3)).astype(
            np.float32) * 10
    x_gpu_data = cuda.to_gpu(x_cpu_data, gpu_device)
    x_cpu = chainer.Variable(x_cpu_data)
    x_gpu = chainer.Variable(x_gpu_data)

    with chainer.using_config("train", True):
        # Reference result: run the pure-Python autograd implementation on the
        # CPU over three consecutive chunks, threading the cell state through.
        layer = SRU(feature_dimension, use_tanh=use_tanh, dropout=0.5)
        mask_x = layer.generate_dropout_mask(x_cpu_data)
        output_true, cell_true, last_cell_true = autograd(
            x_cpu[..., :seq_length], layer.W, layer.B, None,
            layer.use_tanh, mask_x)
        output_true, cell_true, last_cell_true = autograd(
            x_cpu[..., seq_length:seq_length * 2], layer.W, layer.B,
            last_cell_true, layer.use_tanh, mask_x)
        output_true, cell_true, last_cell_true = autograd(
            x_cpu[..., seq_length * 2:], layer.W, layer.B,
            last_cell_true, layer.use_tanh, mask_x)

        layer.cleargrads()
        functions.sum(output_true).backward()
        b_grad_true = layer.B.grad.copy()
        w_grad_true = layer.W.grad.copy()
        x_grad_true = x_cpu.grad.copy()

        # GPU result: run the CUDA implementation over the same three chunks
        # with the same dropout mask.
        layer.to_gpu(gpu_device)
        output, cell, last_cell = layer(
            x_gpu[..., :seq_length], None, cuda.to_gpu(mask_x))
        output, cell, last_cell = layer(
            x_gpu[..., seq_length:seq_length * 2], last_cell,
            cuda.to_gpu(mask_x))
        output, cell, last_cell = layer(
            x_gpu[..., seq_length * 2:], last_cell, cuda.to_gpu(mask_x))

        layer.cleargrads()
        functions.sum(output).backward()

        # The CPU and GPU gradients should agree to within a small tolerance.
        # All operands are on the host after cuda.to_cpu, so plain NumPy is
        # used for the comparison.
        threshold = 1e-3
        b_grad_error = np.mean(abs(b_grad_true - cuda.to_cpu(layer.B.grad)))
        w_grad_error = np.mean(abs(w_grad_true - cuda.to_cpu(layer.W.grad)))
        x_grad_error = np.mean(abs(x_grad_true - cuda.to_cpu(x_gpu.grad)))
        assert b_grad_error <= threshold, b_grad_error
        assert w_grad_error <= threshold, w_grad_error
        assert x_grad_error <= threshold, x_grad_error
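# A hypothetical entry point that sweeps check_dropout_backward over a small
# grid of shapes and both activation settings; the grid values are assumptions
# for illustration. Each call raises an AssertionError if the CPU autograd and
# GPU gradients diverge beyond the tolerance.
if __name__ == "__main__":
    for use_tanh in [True, False]:
        for batchsize in [1, 8]:
            for feature_dimension in [32, 128]:
                for seq_length in [5, 50]:
                    check_dropout_backward(
                        batchsize, feature_dimension, seq_length, use_tanh)
    print("All gradient checks passed.")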