def test_biRNN_bprop(backend_default, fargs, deltas_buffer): # basic sanity check with 0 weights random inputs seq_len, input_size, hidden_size, batch_size = fargs in_shape = (input_size, seq_len) NervanaObject.be.bsz = batch_size # setup the bi-directional rnn init_glorot = GlorotUniform() birnn = BiRNN(hidden_size, activation=Rectlinclip(slope=0), init=init_glorot) birnn.configure(in_shape) birnn.prev_layer = True birnn.allocate() birnn.allocate_deltas(deltas_buffer) deltas_buffer.allocate_buffers() birnn.set_deltas(deltas_buffer) # same weight for bi-rnn backward and rnn weights birnn.W_input_b[:] = birnn.W_input_f birnn.W_recur_b[:] = birnn.W_recur_f birnn.b_b[:] = birnn.b_f birnn.dW[:] = 0 # same weight for bi-directional rnn init_glorot = GlorotUniform() rnn = Recurrent(hidden_size, activation=Rectlinclip(slope=0), init=init_glorot) rnn.configure(in_shape) rnn.prev_layer = True rnn.allocate() rnn.allocate_deltas(deltas_buffer) deltas_buffer.allocate_buffers() rnn.set_deltas(deltas_buffer) # inputs and views lr = np.random.random((input_size, seq_len * batch_size)) lr_rev = list(reversed(get_steps(lr.copy(), in_shape))) rl = con(lr_rev, axis=1) # allocate gpu buffers inp_lr = birnn.be.array(lr) inp_rl = birnn.be.array(rl) # outputs out_lr_g = birnn.fprop(inp_lr) del_lr = birnn.bprop(out_lr_g).get().copy() birnn.h_buffer[:] = 0 out_rl_g = birnn.fprop(inp_rl) del_rl = birnn.bprop(out_rl_g).get().copy() del_lr_s = get_steps(del_lr, in_shape) del_rl_s = get_steps(del_rl, in_shape) for (x, y) in zip(del_lr_s, reversed(del_rl_s)): assert np.allclose(x, y, rtol=0.0, atol=1.0e-5)
def test_biRNN_fprop(backend_default, fargs): # basic sanity check with 0 weights random inputs seq_len, input_size, hidden_size, batch_size = fargs in_shape = (input_size, seq_len) out_shape = (hidden_size, seq_len) NervanaObject.be.bsz = batch_size # setup the bi-directional rnn init_glorot = GlorotUniform() birnn = BiRNN(hidden_size, activation=Rectlinclip(slope=0), init=init_glorot) birnn.configure(in_shape) birnn.prev_layer = True birnn.allocate() birnn.set_deltas([birnn.be.iobuf(birnn.in_shape)]) # same weight nout = hidden_size birnn.W_input_b[:] = birnn.W_input_f birnn.W_recur_b[:] = birnn.W_recur_f birnn.b_b[:] = birnn.b_f birnn.dW[:] = 0 # inputs - random and flipped left-to-right inputs lr = np.random.random((input_size, seq_len * batch_size)) lr_rev = list(reversed(get_steps(lr.copy(), in_shape))) rl = con(lr_rev, axis=1) inp_lr = birnn.be.array(lr) inp_rl = birnn.be.array(rl) # outputs out_lr = birnn.fprop(inp_lr).get().copy() birnn.h_buffer[:] = 0 out_rl = birnn.fprop(inp_rl).get().copy() # views out_lr_f_s = get_steps(out_lr[:nout], out_shape) out_lr_b_s = get_steps(out_lr[nout:], out_shape) out_rl_f_s = get_steps(out_rl[:nout], out_shape) out_rl_b_s = get_steps(out_rl[nout:], out_shape) # asserts for x_f, x_b, y_f, y_b in zip(out_lr_f_s, out_lr_b_s, reversed(out_rl_f_s), reversed(out_rl_b_s)): assert np.allclose(x_f, y_b, rtol=0.0, atol=1.0e-5) assert np.allclose(x_b, y_f, rtol=0.0, atol=1.0e-5)
def test_bibn(backend_default, fargs): seq_len, input_size, hidden_size, batch_size = fargs in_shape = (input_size, seq_len) NervanaObject.be.bsz = batch_size hidden_size = min(10, hidden_size) # setup the bi-directional rnn init_glorot = GlorotUniform() birnn = BiBNRNN(hidden_size, activation=Rectlinclip(slope=0), init=init_glorot) birnn.configure(in_shape) birnn.prev_layer = True birnn.allocate() birnn.set_deltas([birnn.be.iobuf(birnn.in_shape)]) # test fprop # set the ff buffer inp_np = np.random.random(birnn.h_ff_buffer.shape) inp_be = birnn.be.array(inp_np) birnn.h_ff_buffer[:] = inp_np # compare the bn output with calling the backend bn xsum = birnn.be.zeros_like(birnn.xmean) xvar = birnn.be.zeros_like(birnn.xvar) gmean = birnn.be.zeros_like(birnn.gmean) gvar = birnn.be.zeros_like(birnn.gvar) gamma = birnn.be.ones(birnn.gamma.shape) beta = birnn.be.zeros_like(birnn.beta) grad_gamma = birnn.be.zeros_like(gamma) grad_beta = birnn.be.zeros_like(beta) out_ref = birnn.be.zeros_like(birnn.h_ff_buffer) xsum[:] = birnn.be.sum(birnn.h_ff_buffer, axis=1) birnn.be.compound_fprop_bn(birnn.h_ff_buffer, xsum, xvar, gmean, gvar, gamma, beta, out_ref, birnn.eps, birnn.rho, accumbeta=0, relu=False) # call the bibnrnn layer fprop_bn out_bn = birnn._fprop_bn(birnn.h_ff_buffer, inference=False) assert allclose_with_out(out_bn.get(), out_ref.get(), rtol=0.0, atol=1.0e-5) # test bprop err_np = np.random.random(birnn.h_ff_buffer.shape) err_be = birnn.be.array(err_np) err_out_ref = birnn.be.empty_like(err_be) birnn.be.compound_bprop_bn(err_out_ref, grad_gamma, grad_beta, err_be, inp_be, xsum, xvar, gamma, birnn.eps) err_out_bn = birnn._bprop_bn(err_be, out_bn) assert allclose_with_out(err_out_bn.get(), err_out_ref.get(), rtol=0.0, atol=2.5e-5)
# Setup the layers of the DNN # Softmax is performed in warp-ctc, so we use an Identity activation in the # final layer. gauss = Gaussian(scale=0.01) glorot = GlorotUniform() layers = [ Conv((nbands, filter_width, nfilters), init=gauss, bias=Constant(0), activation=Rectlin(), padding=dict(pad_h=0, pad_w=5), strides=dict(str_h=1, str_w=str_w)), DeepBiRNN(hidden_size, init=glorot, activation=Rectlinclip(), batch_norm=True, reset_cells=True, depth=depth), Affine(hidden_size, init=glorot, activation=Rectlinclip()), Affine(nout=nout, init=glorot, activation=Identity()) ] model = Model(layers=layers) opt = GradientDescentMomentumNesterov(learning_rate, momentum, gradient_clip_norm=gradient_clip_norm, stochastic_round=False) callbacks = Callbacks(model, eval_set=dev, **args.callback_args)
(nbands, filter_width, nfilters), init=gauss, bias=Constant(0), activation=Rectlin(), padding=dict( pad_h=0, pad_w=5), strides=dict( str_h=1, str_w=str_w)), DeepBiRNN( hidden_size, init=glorot, activation=Rectlinclip(), batch_norm=True, reset_cells=True, depth=depth), Affine( hidden_size, init=glorot, activation=Rectlinclip()), Affine( nout=nout, init=glorot, activation=Identity())] model = Model(layers=layers) opt = GradientDescentMomentum(learning_rate, momentum,