def test_wgan_cost(backend_default):
    """
    Set up a Wasserstein GANCost transform and make sure cost and errors are
    getting computed correctly.

    Arguments:
        backend_default: backend fixture used to allocate the test buffers.
    """
    be = backend_default
    cost = GANCost(func="wasserstein")
    y_data = be.iobuf(5).fill(1.)
    y_noise = be.iobuf(5).fill(2.)
    output = be.iobuf(1)
    expected = be.iobuf(1)
    delta = be.iobuf(5)

    # fprop for discriminator cost
    output[:] = cost(y_data, y_noise)
    expected[:] = be.sum(y_data - y_noise, axis=0)
    # The comparison result used to be discarded, so a mismatch could never
    # fail the test.
    # NOTE(review): assumes tensors_allclose returns a truthy value on a match,
    # like allclose_with_out below - confirm against its definition.
    assert tensors_allclose(output, expected)

    # bprop for wasserstein cost
    delta[:] = cost.bprop_data(y_data)
    assert allclose_with_out(delta.get(), 1.)
    delta[:] = cost.bprop_noise(y_noise)
    assert allclose_with_out(delta.get(), -1.)
    delta[:] = cost.bprop_generator(y_noise)
    assert allclose_with_out(delta.get(), 1.)
def test_model_get_outputs_rnn(backend_default, data):
    """
    Run an untrained, constant-initialised recurrent model over the PTB
    training iterator and sanity check the shape and structure of the outputs.

    Arguments:
        backend_default: backend fixture.
        data: path handed to the PTB dataset loader.
    """
    dataset = PTB(50, path=data)
    dataiter = dataset.train_iter

    # weight initialization
    init = Constant(0.08)

    # model initialization
    layers = [
        Recurrent(150, init, activation=Logistic()),
        Affine(len(dataiter.vocab), init, bias=init, activation=Rectlin())
    ]

    model = Model(layers=layers)
    output = model.get_outputs(dataiter)

    assert output.shape == (dataiter.ndata, dataiter.seq_length, dataiter.nclass)

    # since the init are all constant and model is un-trained:
    # along the feature dim, the values should be all the same
    assert allclose_with_out(output[0, 0], output[0, 0, 0], rtol=0, atol=1e-4)
    assert allclose_with_out(output[0, 1], output[0, 1, 0], rtol=0, atol=1e-4)

    # along the time dim, the values should be increasing
    # (np.all replaces np.alltrue, an alias that NumPy 2.0 removed)
    assert np.all(output[0, 2] > output[0, 1])
    assert np.all(output[0, 1] > output[0, 0])
def test_modified_gan_cost(backend_default):
    """
    Set up a modified GANCost transform and make sure cost and errors are
    getting computed correctly.

    Arguments:
        backend_default: backend fixture used to allocate the test buffers.
    """
    be = backend_default
    cost = GANCost(cost_type="dis", func="modified")
    y_data = be.iobuf(5).fill(1.)
    y_noise = be.iobuf(5).fill(2.)
    output = be.iobuf(1)
    expected = be.iobuf(1)
    delta = be.iobuf(5)

    # fprop for discriminator cost
    output[:] = cost(y_data, y_noise)
    expected[:] = -be.sum(be.safelog(y_data) + be.safelog(1-y_noise), axis=0)
    # The comparison result used to be discarded, so a mismatch could never
    # fail the test.
    # NOTE(review): assumes tensors_allclose returns a truthy value on a match,
    # like allclose_with_out below - confirm against its definition.
    assert tensors_allclose(output, expected)

    # bprop for modified cost
    delta[:] = cost.bprop_data(y_data)
    assert allclose_with_out(delta.get(), -1. / 1)
    delta[:] = cost.bprop_noise(y_noise)
    assert allclose_with_out(delta.get(), 1. - 2.)
    delta[:] = cost.bprop_generator(y_noise)
    assert allclose_with_out(delta.get(), -1. / 2.)
def test_biSum(backend_default, fargs, deltas_buffer):
    """
    Check that BiSum fprop equals the sum of the two halves of its input
    along the first axis, and that bprop returns the incoming error in both
    halves of the deltas.
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    input_size *= 2
    half = input_size // 2
    NervanaObject.be.bsz = batch_size

    layer = BiSum()
    layer.configure((input_size, seq_len))
    layer.prev_layer = True
    layer.allocate()
    layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    layer.set_deltas(deltas_buffer)

    # inputs
    host_in = np.random.random((input_size, seq_len * batch_size))
    dev_in = layer.be.array(host_in)

    # outputs; the fprop result is fed straight back in as the error
    dev_out = layer.fprop(dev_in)
    dev_delta = layer.bprop(dev_out)

    reference = layer.be.empty_like(dev_out)
    reference[:] = dev_in[:half] + dev_in[half:]

    assert dev_out.shape[0] * 2 == dev_in.shape[0]
    assert allclose_with_out(dev_out.get(), reference.get(), rtol=0.0, atol=1.0e-5)
    for part in (dev_delta[:half], dev_delta[half:]):
        assert allclose_with_out(part.get(), dev_out.get(), rtol=0.0, atol=1.0e-5)
def compare_helper(op, inA, inB, ng, nc, dtype):
    """
    Evaluate `op` through math_helper with numpy and with the two supplied
    backends, and require both backend results to match the numpy result.

    Arguments:
        op: operation identifier forwarded to math_helper.
        inA, inB: operands forwarded to math_helper.
        ng: first backend (its result was named nervanaGPU_result originally).
        nc: second backend (its result was named nervanaCPU_result originally).
        dtype: dtype used for the backend computation; the float32 numpy
               reference is cast to it before comparing.
    """
    numpy_result = math_helper(np, op, inA, inB, dtype=np.float32).astype(dtype)

    # The comparison results used to be discarded, which made this helper
    # unable to fail; assert them so a mismatch is reported.
    for backend in (ng, nc):
        backend_result = math_helper(backend, op, inA, inB, dtype=dtype).get()
        assert allclose_with_out(numpy_result, backend_result, rtol=0, atol=1e-5)
def test_bibn(backend_default, fargs):
    """
    Compare the BiBNRNN batch-norm helpers (_fprop_bn / _bprop_bn) against
    direct calls to the backend compound_fprop_bn / compound_bprop_bn.
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    NervanaObject.be.bsz = batch_size
    hidden_size = min(10, hidden_size)

    # setup the bi-directional rnn
    glorot = GlorotUniform()
    layer = BiBNRNN(hidden_size, activation=Rectlinclip(slope=0), init=glorot)
    layer.configure((input_size, seq_len))
    layer.prev_layer = True
    layer.allocate()
    layer.set_deltas([layer.be.iobuf(layer.in_shape)])
    be = layer.be

    # test fprop
    # set the ff buffer
    host_in = np.random.random(layer.h_ff_buffer.shape)
    dev_in = be.array(host_in)
    layer.h_ff_buffer[:] = host_in

    # compare the bn output with calling the backend bn
    xsum = be.zeros_like(layer.xmean)
    xvar = be.zeros_like(layer.xvar)
    gmean = be.zeros_like(layer.gmean)
    gvar = be.zeros_like(layer.gvar)
    gamma = be.ones(layer.gamma.shape)
    beta = be.zeros_like(layer.beta)
    grad_gamma = be.zeros_like(gamma)
    grad_beta = be.zeros_like(beta)
    fprop_ref = be.zeros_like(layer.h_ff_buffer)

    xsum[:] = be.sum(layer.h_ff_buffer, axis=1)
    be.compound_fprop_bn(
        layer.h_ff_buffer, xsum, xvar, gmean, gvar, gamma, beta, fprop_ref,
        layer.eps, layer.rho, accumbeta=0, relu=False)

    # call the bibnrnn layer fprop_bn
    fprop_out = layer._fprop_bn(layer.h_ff_buffer, inference=False)
    assert allclose_with_out(fprop_out.get(), fprop_ref.get(), rtol=0.0, atol=1.0e-5)

    # test bprop
    host_err = np.random.random(layer.h_ff_buffer.shape)
    dev_err = be.array(host_err)
    bprop_ref = be.empty_like(dev_err)
    be.compound_bprop_bn(bprop_ref, grad_gamma, grad_beta, dev_err, dev_in,
                         xsum, xvar, gamma, layer.eps)

    bprop_out = layer._bprop_bn(dev_err, fprop_out)
    assert allclose_with_out(bprop_out.get(), bprop_ref.get(), rtol=0.0, atol=2.5e-5)
def test_all_rand(backend_default, allrand_args, deltas_buffer):
    """
    Linear layer with random weights and random inputs: compare fprop, the
    bprop deltas and dW against numpy matrix products, using a tolerance
    derived from est_mm_prec.
    """
    dtypeu = np.float32
    w_rng, rngmax = allrand_args
    lo, hi = 0.0, rngmax
    nin, nout, batch_size = 1024, 2048, 16
    NervanaObject.be.bsz = batch_size

    layer = Linear(nout=nout, init=Uniform(low=w_rng[0], high=w_rng[1]))

    host_in = np.random.random((nin, batch_size))
    host_in *= hi - lo
    host_in += lo
    host_in = host_in.astype(dtypeu)

    layer.configure(nin)
    layer.prev_layer = True  # Hack to force delta buffer allocation
    layer.allocate()
    layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    layer.set_deltas(deltas_buffer)

    out = layer.fprop(layer.be.array(host_in)).get()
    weights = layer.W.get()

    # the expected output using numpy
    out_exp = np.dot(weights, host_in)

    # for larger layers need to estimate numerical precision
    atol = 2 * est_mm_prec(weights, host_in, ntrials=1)
    assert allclose_with_out(out_exp, out, atol=atol, rtol=0.0), \
        '%e %e' % (np.max(np.abs(out - out_exp)), atol)

    host_err = np.random.random((nout, batch_size))
    host_err = host_err * (hi - lo) + lo
    host_err = host_err.astype(dtypeu)

    deltas = layer.bprop(layer.be.array(host_err)).get()
    dw = layer.dW.get()

    deltas_exp = np.dot(weights.T, host_err)
    atol = 2 * est_mm_prec(weights.T, host_err, ntrials=1)
    assert allclose_with_out(deltas_exp, deltas, atol=atol, rtol=0.0), \
        '%e %e' % (np.max(np.abs(deltas_exp - deltas)), atol)

    dw_exp = np.dot(host_err, host_in.T)
    atol = 2 * est_mm_prec(host_err, host_in.T, ntrials=1)
    assert allclose_with_out(dw_exp, dw, atol=atol, rtol=0.0), \
        '%e %e' % (np.max(np.abs(dw_exp - dw)), atol)
def test_recurrent_mean(backend_default, refgruargs, deltas_buffer):
    """
    Exercise RecurrentMean with all-zero, all-one, step-replicated random and
    fully random inputs. fprop is compared with the mean over the time steps
    and bprop with the incoming error divided by the sequence length.

    Arguments:
        backend_default: backend fixture.
        refgruargs: tuple (seq_len, nin, batch_size).
        deltas_buffer: shared deltas buffer fixture.
    """
    seq_len, nin, batch_size = refgruargs
    NervanaObject.be.bsz = batch_size

    in_shape = (nin, seq_len)

    layer = RecurrentMean()
    layer.configure(in_shape)
    layer.prev_layer = True
    layer.allocate()
    layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    layer.set_deltas(deltas_buffer)

    # zeros
    inp = layer.be.zeros((nin, seq_len * batch_size))
    out = layer.fprop(inp)
    err = layer.bprop(out).get()
    assert np.all(out.get() == np.zeros((nin, batch_size)))
    assert np.all(err == inp.get())

    # ones
    inp = layer.be.ones((nin, seq_len * batch_size))
    out = layer.fprop(inp)
    err = layer.bprop(out).get()
    assert np.all(out.get() == np.ones((nin, batch_size)))
    assert np.all(err == 1. / seq_len * inp.get())

    # random
    # Every time step must hold the same (nin, batch_size) block. Step i
    # occupies columns [i * batch_size, (i + 1) * batch_size) (see the full
    # random case below), so the block is tiled along axis 1. np.repeat only
    # produced that layout when batch_size == 1.
    rinp = np.random.random((nin, batch_size))
    inp = np.tile(rinp, (1, seq_len))
    inp_g = layer.be.array(inp)
    out = layer.fprop(inp_g)
    err = layer.bprop(out)
    assert allclose_with_out(out.get(), rinp)
    assert allclose_with_out(err.get(), 1. / seq_len * inp)

    # full random
    inp = np.random.random((nin, seq_len * batch_size))
    inp_g = layer.be.array(inp)
    out = layer.fprop(inp_g)
    err = layer.bprop(out)
    out_comp = np.zeros(out.shape)
    err_comp = np.zeros(inp.shape)
    for i in range(seq_len):
        out_comp[:] = out_comp + inp[:, i * batch_size:(i + 1) * batch_size]
        err_comp[:, i * batch_size:(i + 1) * batch_size] = out.get() / float(seq_len)
    out_comp[:] /= float(seq_len)
    assert allclose_with_out(out_comp, out.get())
    assert allclose_with_out(err_comp, err.get())
def test_schedule(backend_default):
    """
    Test constant rate, fixed step and various modes of programmable steps.
    """
    base_lr = 0.1

    # default scheduler has a constant learning rate
    constant = Schedule()
    for ep in range(10):
        assert constant.get_learning_rate(learning_rate=base_lr, epoch=ep) == base_lr

    # test a uniform step schedule
    step_config = 2
    change = 0.5
    stepped = Schedule(step_config=step_config, change=change)
    for ep in range(10):
        first = stepped.get_learning_rate(learning_rate=base_lr, epoch=ep)
        # test a repeated call for the same epoch
        second = stepped.get_learning_rate(learning_rate=base_lr, epoch=ep)
        target = base_lr * change**(np.floor(ep // step_config))
        assert allclose_with_out(first, target)
        assert allclose_with_out(second, target)

    # test a list step schedule
    listed = Schedule(step_config=[2, 3], change=.1)
    # epoch 2 is listed twice on purpose: a repeated call for the same epoch
    checks = [(0, .1), (1, .1), (2, .01), (2, .01), (3, .001), (4, .001)]
    for ep, target in checks:
        assert allclose_with_out(target, listed.get_learning_rate(learning_rate=.1, epoch=ep))
def test_recurrent_last(backend_default, refgruargs, deltas_buffer):
    """
    Exercise RecurrentLast with all-zero, all-one, step-replicated random and
    fully random inputs. fprop is compared with the last time step of the
    input, and bprop with an error that is placed in the last time step only.

    Arguments:
        backend_default: backend fixture.
        refgruargs: tuple (seq_len, nin, batch_size).
        deltas_buffer: shared deltas buffer fixture.
    """
    seq_len, nin, batch_size = refgruargs
    NervanaObject.be.bsz = batch_size

    in_shape = (nin, seq_len)

    layer = RecurrentLast()
    layer.configure(in_shape)
    layer.prev_layer = True
    layer.allocate()
    layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    layer.set_deltas(deltas_buffer)

    # zeros
    inp = layer.be.zeros((nin, seq_len * batch_size))
    out = layer.fprop(inp)
    err = layer.bprop(out).get()
    assert np.all(out.get() == np.zeros((nin, batch_size)))
    assert np.all(err == inp.get())

    # ones
    inp = layer.be.ones((nin, seq_len * batch_size))
    out = layer.fprop(inp)
    err = layer.bprop(out).get()
    assert np.all(out.get() == np.ones((nin, batch_size)))
    assert np.all(err[:, -batch_size:] == inp.get()[:, -batch_size:])
    assert np.all(
        err[:, :-batch_size] == np.zeros((nin, (seq_len - 1) * batch_size)))

    # random
    # The last batch_size columns are treated as the last time step, so the
    # (nin, batch_size) block is tiled along axis 1 to make every step equal
    # to rinp. np.repeat only produced that layout when batch_size == 1.
    rinp = np.random.random((nin, batch_size))
    inp = np.tile(rinp, (1, seq_len))
    inp_g = layer.be.array(inp)
    out = layer.fprop(inp_g)
    err = layer.bprop(out)
    assert allclose_with_out(out.get(), rinp)
    assert allclose_with_out(err[:, -batch_size:].get(), rinp)

    # full random
    inp = np.random.random((nin, seq_len * batch_size))
    inp_g = layer.be.array(inp)
    out = layer.fprop(inp_g)
    err = layer.bprop(out)
    out_comp = np.zeros(out.shape)
    err_comp = np.zeros(inp.shape)
    out_comp[:] = inp[:, -batch_size:]
    err_comp[:, -batch_size:] = out.get()
    # The references were built but never compared; check them the same way
    # test_recurrent_mean checks its references.
    assert allclose_with_out(out_comp, out.get())
    assert allclose_with_out(err_comp, err.get())
def test_biLSTM_fprop(backend_default, fargs):
    """
    Give the forward and backward directions of a BiLSTM the same weights,
    run fprop on a random sequence and on its time-reversed copy, and check
    that the forward output of one run matches the backward output of the
    other, step by step.
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    nout = hidden_size
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional rnn
    glorot = GlorotUniform()
    layer = BiLSTM(hidden_size, gate_activation=Logistic(), init=glorot,
                   activation=Tanh(), reset_cells=True)
    layer.configure(in_shape)
    layer.prev_layer = True
    layer.allocate()

    # same weight
    layer.W_input_b[:] = layer.W_input_f
    layer.W_recur_b[:] = layer.W_recur_f
    layer.b_b[:] = layer.b_f
    layer.dW[:] = 0

    # inputs - random and flipped left-to-right inputs
    forward_seq = np.random.random((input_size, seq_len * batch_size))
    reversed_steps = list(reversed(get_steps(forward_seq.copy(), in_shape)))
    backward_seq = con(reversed_steps, axis=1)

    dev_forward = layer.be.array(forward_seq)
    dev_backward = layer.be.array(backward_seq)

    # outputs
    out_fwd_run = layer.fprop(dev_forward).get().copy()
    layer.h_buffer[:] = 0
    out_bwd_run = layer.fprop(dev_backward).get().copy()

    # views: first nout rows are compared as "f", remaining rows as "b"
    fwd_run_f = get_steps(out_fwd_run[:nout], out_shape)
    fwd_run_b = get_steps(out_fwd_run[nout:], out_shape)
    bwd_run_f = get_steps(out_bwd_run[:nout], out_shape)
    bwd_run_b = get_steps(out_bwd_run[nout:], out_shape)

    # asserts
    paired = zip(fwd_run_f, fwd_run_b, reversed(bwd_run_f), reversed(bwd_run_b))
    for a_f, a_b, b_f, b_b in paired:
        assert allclose_with_out(a_f, b_b, rtol=0.0, atol=1.0e-5)
        assert allclose_with_out(a_b, b_f, rtol=0.0, atol=1.0e-5)
def checkSequentialMatchesBatch():
    """
    check LSTM I/O forward/backward interactions

    Runs the reference LSTM one time step at a time and over the whole
    sequence at once. The forward outputs are asserted to match; the
    agreement of the backward gradients is only displayed through
    neon_logger, not asserted.
    """
    n, b, d = (5, 3, 4)  # sequence length, batch size, hidden size
    input_size = 10
    WLSTM = LSTM.init(input_size, d)  # input size, hidden size
    X = np.random.randn(n, b, input_size)
    h0 = np.random.randn(b, d)
    c0 = np.random.randn(b, d)

    # sequential forward
    cprev = c0
    hprev = h0
    caches = [{} for t in range(n)]
    Hcat = np.zeros((n, b, d))
    for t in range(n):
        xt = X[t:t + 1]
        _, cprev, hprev, cache = LSTM.forward(xt, WLSTM, cprev, hprev)
        caches[t] = cache
        Hcat[t] = hprev

    # sanity check: perform batch forward to check that we get the same thing
    H, _, _, batch_cache = LSTM.forward(X, WLSTM, c0, h0)
    # The message was written as 'don''t', which Python reads as two adjacent
    # literals and therefore rendered as "dont".
    assert allclose_with_out(H, Hcat), "Sequential and Batch forward don't match!"

    # eval loss
    wrand = np.random.randn(*Hcat.shape)
    # loss = np.sum(Hcat * wrand)
    dH = wrand

    # get the batched version gradients
    BdX, BdWLSTM, Bdc0, Bdh0 = LSTM.backward(dH, batch_cache)

    # now perform sequential backward
    dX = np.zeros_like(X)
    dWLSTM = np.zeros_like(WLSTM)
    dc0 = np.zeros_like(c0)
    dh0 = np.zeros_like(h0)
    dcnext = None
    dhnext = None
    for t in reversed(range(n)):
        dht = dH[t].reshape(1, b, d)
        dx, dWLSTMt, dcprev, dhprev = LSTM.backward(
            dht, caches[t], dcnext, dhnext)
        dhnext = dhprev
        dcnext = dcprev

        dWLSTM += dWLSTMt  # accumulate LSTM gradient
        dX[t] = dx[0]
        if t == 0:
            dc0 = dcprev
            dh0 = dhprev

    # and make sure the gradients match
    neon_logger.display('Making sure batched version agrees with sequential version: '
                        '(should all be True)')
    neon_logger.display(np.allclose(BdX, dX))
    neon_logger.display(np.allclose(BdWLSTM, dWLSTM))
    neon_logger.display(np.allclose(Bdc0, dc0))
    neon_logger.display(np.allclose(Bdh0, dh0))
def test_padding(backend_default, poolargs):
    """
    Run a Pooling layer with padding on random inputs and compare a few
    batch entries exactly against the ref_pooling reference.
    """
    fshape, nifm, padding, stride, in_sz, batch_size = poolargs
    NervanaObject.be.bsz = batch_size

    # basic sanity check with random inputs
    inshape = (nifm, in_sz, in_sz)
    insize = np.prod(inshape)
    pool = Pooling(fshape=fshape, strides=stride, padding=padding)

    dev_in = pool.be.array(np.random.random((insize, batch_size)))
    dev_in.lshape = inshape

    pool.configure(inshape)
    pool.prev_layer = True
    pool.allocate()
    pool.set_deltas([pool.be.iobuf(inshape)])

    out = pool.fprop(dev_in).get()

    # batch entries handed to the reference: first, middle and last
    ncheck = [0, batch_size // 2, batch_size - 1]
    out_exp, check_inds = ref_pooling(dev_in, dev_in.lshape,
                                      (fshape, fshape),
                                      padding,
                                      (stride, stride),
                                      pool.be,
                                      ncheck=ncheck)

    out_shape = list(out_exp.shape[0:3]) + [batch_size]
    reshaped = out.reshape(out_shape)

    assert allclose_with_out(out_exp, reshaped[:, :, :, check_inds], atol=0.0, rtol=0.0)
def test_hdf5meansubtract(backend_default, meansubhdf):
    """
    Iterate an HDF5Iterator built from a mean-subtraction fixture and check
    every minibatch against a running integer ramp minus the iterator's mean.

    Arguments:
        backend_default: backend fixture.
        meansubhdf: pair whose first item is passed to HDF5Iterator and whose
                    second item is the mean type, 'chan_mean' or 'full_mean'.
    """
    bsz = 128
    NervanaObject.be.bsz = bsz

    datit = HDF5Iterator(meansubhdf[0])
    datit.allocate()
    # cleanup() used to be skipped whenever an assertion failed; run it
    # unconditionally so a failing check does not leave the iterator open.
    try:
        typ = meansubhdf[1]
        mn = datit.mean.get()
        assert typ in ['chan_mean', 'full_mean']

        cnt_image = 0
        max_len = datit.ndata
        MAX_CNT = max_len*datit.inp.shape[1]
        for x, t in datit:
            x_ = x.get().flatten()
            # expected values continue the ramp from the previous batch and
            # wrap around at MAX_CNT
            x_exp = (np.arange(len(x_)) + cnt_image) % MAX_CNT
            x_exp = x_exp.reshape((-1, np.prod(datit.lshape))).T
            if typ == 'chan_mean':
                x_exp = x_exp.reshape((datit.lshape[0], -1)) - mn
            elif typ == 'full_mean':
                x_exp = x_exp.reshape((-1, bsz)) - mn
            x_exp = x_exp.flatten()
            assert allclose_with_out(x_, x_exp, atol=0.0, rtol=1.0e-7)
            cnt_image += len(x_)
    finally:
        datit.cleanup()
def test_linear_ones(backend_default, basic_linargs, deltas_buffer):
    """
    Basic sanity check with all ones on the inputs and weights: each row of
    the output must equal the sum of the weights for that output, which
    confirms that the correct number of operations is being run.
    """
    nin, nout, batch_size = basic_linargs
    NervanaObject.be.bsz = batch_size
    dtypeu = np.float32

    # low == high == 1.0 is passed as the weight range
    linear = Linear(nout=nout, init=Uniform(low=1.0, high=1.0))
    dev_in = linear.be.array(dtypeu(np.ones((nin, batch_size))))

    linear.configure(nin)
    linear.prev_layer = True  # Hack to force delta buffer allocation
    linear.allocate()
    linear.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    linear.set_deltas(deltas_buffer)

    out = linear.fprop(dev_in).get()
    weights = linear.W.get()
    row_sums = np.sum(weights, 1).reshape((nout, 1))
    sums = row_sums * np.ones((1, batch_size))

    # for larger layers need to estimate numerical precision
    # atol = est_mm_prec(w, inp.get())
    assert allclose_with_out(sums, out, atol=0.0, rtol=0.0), \
        '%e' % np.max(np.abs(out - sums))
def test_softmax_big_inputs(backend_default):
    """
    Compare the Softmax transform on a 1000-unit random input with a numpy
    softmax computed with the usual max-subtraction.
    """
    np.random.seed(1)

    be = backend_default
    assert be.bsz >= 128, 'This tests needs large batch size'

    softmax = Softmax()
    Nout = 1000  # 1000 input and output units to softmax

    # random inputs, copied into a device buffer
    host_in = np.random.random((Nout, be.bsz))
    dev_buf = be.iobuf(Nout)
    dev_buf[:] = host_in

    # numpy softmax
    col_max = np.max(host_in, axis=0)
    exps = np.exp(host_in - col_max)
    expected = exps/np.sum(exps, axis=0)

    # in-place softmax on device
    dev_buf[:] = softmax(dev_buf)

    assert allclose_with_out(expected, dev_buf.get(), atol=0.0, rtol=1.0e-5)
def gradient_check(seq_len, input_size, hidden_size, batch_size,
                   threshold=1.0e-3):
    """
    For a given set of layer parameters calculate the gradients and compare
    them with the derivatives obtained with the bprop function. This is
    repeated for a range of perturbations, and the perturbation size with the
    best result is the one checked against `threshold`. This is necessary for
    32 bit computations.

    Arguments:
        seq_len, input_size, hidden_size, batch_size: layer parameters
            forwarded to gradient_calc.
        threshold: largest accepted value of max|grad_est - deltas| for the
            best perturbation (default 1.0e-3).

    Raises:
        AssertionError: if the best worst-case error is not below threshold.
    """
    min_max_err = -1.0  # minimum max error
    best_pert_mag = None  # perturbation that produced min_max_err
    neon_logger.display('Perturb mag, max grad diff')
    for pert_exp in range(-5, 0):
        # need to generate the scaling and input outside
        # having an issue with the random number generator
        # when these are generated inside the gradient_calc
        # function
        input_shape = (input_size, seq_len * batch_size)
        output_shape = (hidden_size, seq_len * batch_size)

        rand_scale = np.random.random(output_shape) * 2.0 - 1.0
        inp = np.random.randn(*input_shape)

        pert_mag = 10.0**pert_exp
        (grad_est, deltas) = gradient_calc(seq_len,
                                           input_size,
                                           hidden_size,
                                           batch_size,
                                           epsilon=pert_mag,
                                           rand_scale=rand_scale,
                                           inp_bl=inp)
        dd = np.max(np.abs(grad_est - deltas))
        neon_logger.display('%e, %e' % (pert_mag, dd))
        if min_max_err < 0.0 or dd < min_max_err:
            min_max_err = dd
            best_pert_mag = pert_mag
        # NOTE(review): the result of this zero-tolerance comparison is not
        # asserted; presumably it is called only for its diagnostic output -
        # confirm against allclose_with_out.
        allclose_with_out(grad_est, deltas, rtol=0.0, atol=0.0)
        # reset the seed so models are same in each run
        NervanaObject.be.rng_reset()

    # check that best value of worst case error is less than threshold.
    # The message used to report the last perturbation tried instead of the
    # one that actually gave the reported error.
    neon_logger.display('Worst case error %e with perturbation %e' %
                        (min_max_err, best_pert_mag))
    neon_logger.display('Threshold %e' % (threshold))
    assert min_max_err < threshold
def test_dilated_conv(backend_default, fargs_tests):
    """
    Verify that the results of faked dilation match those of actual dilation.

    Arguments:
        backend_default: backend fixture handed to run().
        fargs_tests: sequence holding (filter size, dilation, stride).
    """
    fsz = fargs_tests[0]
    dil = fargs_tests[1]
    stride = fargs_tests[2]
    be = backend_default

    o1, w1 = run(be, False, fsz, stride, 1, dil)
    o2, w2 = run(be, True, fsz, stride, 1, dil)

    # Verify that the results of faked dilation match those of actual dilation.
    assert allclose_with_out(o1, o2, atol=1e-1, rtol=4e-3)

    # The second pair is held to atol=0 on NervanaGPU and to atol=1e-1 on any
    # other backend. This used to be expressed by catching the strict
    # assertion with a broad `except Exception` and retrying. The pass/fail
    # outcome is unchanged, because a comparison that passes with atol=0 also
    # passes with the looser atol.
    w_atol = 0 if isinstance(NervanaObject.be, NervanaGPU) else 1e-1
    assert allclose_with_out(w1, w2, atol=w_atol, rtol=1e-3)
def test_dconv_rand(backend_default, rand_convargs, deltas_buffer):
    """
    Compare the fprop output of a Deconvolution layer on random inputs with
    the error buffer of the DeconvRefLayer reference.

    Arguments:
        backend_default: backend fixture.
        rand_convargs: tuple (indim, nifm, fshape, nofm, batch_size, rngmax,
                       w_rng).
        deltas_buffer: shared deltas buffer fixture.
    """
    indim, nifm, fshape, nofm, batch_size, rngmax, w_rng = rand_convargs
    if isinstance(NervanaObject.be, NervanaGPU) and NervanaObject.be.compute_capability < (5, 0):
        if nofm % 4 != 0:
            # reason passed positionally: the `msg` keyword was deprecated
            # and later removed by pytest
            pytest.skip("C dim must be a multiple of 4 for Kepler bprop kernel")

    NervanaObject.be.bsz = batch_size
    dtypeu = np.float32
    inp_rng = [0.0, rngmax]

    init_unif = Uniform(low=w_rng[0], high=w_rng[1])
    inshape = (indim, indim, nifm)
    insize = np.prod(inshape)

    # generate neon deconv layer
    # need to switch to nofm here...
    neon_layer = Deconvolution(fshape=(fshape, fshape, nofm),
                               strides=1, padding=0, init=init_unif)

    # generate reference deconv layer
    ref_layer = DeconvRefLayer(1, batch_size, identity, inshape[0], inshape[1:3],
                               (fshape, fshape), nofm, 1, dtypeu)

    # setup input in range inp_rng
    inpa = np.random.random((insize, batch_size))
    inpa *= (inp_rng[1] - inp_rng[0])
    inpa += inp_rng[0]
    inpa = inpa.astype(dtypeu)
    inp = neon_layer.be.array(inpa)
    inp.lshape = inshape

    # run fprop on neon
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_out = neon_layer.fprop(inp).get()
    neon_layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    neon_layer.set_deltas(deltas_buffer)

    # pull neon weights into ref layer weights
    ref_layer.weights = neon_layer.W.get().T
    ref_out = np.copy(ref_layer.berror)

    # estimate the numerical precision
    ref_layer.fprop(inpa.T, permute=True)
    ref_out2 = ref_layer.berror
    atol = 10 * np.max(np.abs(ref_out - ref_out2))
    assert allclose_with_out(ref_out.T, neon_out, atol=atol, rtol=0.0), \
        '%e %e' % (np.max(np.abs(ref_out.T - neon_out)), atol)

    # generate err array (not consumed further within this function)
    erra = np.random.random(neon_out.shape)
    erra *= (inp_rng[1] - inp_rng[0])
    erra += inp_rng[0]
    erra = erra.astype(dtypeu)
def test_shift_schedule(backend_default):
    """
    Test binary shift learning rate schedule
    """
    base_lr = 0.1
    schedule = ShiftSchedule(1)  # interval of one epoch
    for ep in range(10):
        # the rate is expected to halve with every epoch
        target = base_lr / (2 ** ep)
        observed = schedule.get_learning_rate(learning_rate=base_lr, epoch=ep)
        assert allclose_with_out(observed, target)
def test_exp_schedule(backend_default):
    """
    Test exponential learning rate schedule
    """
    base_lr = 0.1
    decay = 0.01
    schedule = ExpSchedule(decay)
    for ep in range(10):
        target = base_lr / (1. + decay * ep)
        observed = schedule.get_learning_rate(learning_rate=base_lr, epoch=ep)
        assert allclose_with_out(observed, target)
def test_power_schedule(backend_default):
    """
    Test the PowerSchedule class
    """
    schedule = PowerSchedule(step_config=2, change=0.5)
    # expected rate for epochs 0..7: halved after every second epoch
    expected_rates = [1.0, 1.0, 0.5, 0.5, 0.25, 0.25, 0.125, 0.125]
    for ep in range(len(expected_rates)):
        observed = schedule.get_learning_rate(learning_rate=1.0, epoch=ep)
        assert allclose_with_out(expected_rates[ep], observed)
def test_lookuptable_rand_error(backend_default, basic_linargs, deltas_buffer):
    """
    LookupTable with random word ids and random errors: fprop must return
    the embedding row of every input id, and bprop must accumulate in each
    row of dW the errors of all positions holding that id.

    Arguments:
        backend_default: backend fixture.
        basic_linargs: tuple (nin, nout, batch_size, vocab_size).
        deltas_buffer: shared deltas buffer fixture.
    """
    nin, nout, batch_size, vocab_size = basic_linargs
    NervanaObject.be.bsz = batch_size
    dtypeu = np.float32

    init_glorot = GlorotUniform()
    layer = LookupTable(
        vocab_size=vocab_size, embedding_dim=nout, init=init_glorot)
    # randint's upper bound is exclusive, so this draws ids from
    # [0, vocab_size - 1] like the deprecated np.random.random_integers did
    inp = np.random.randint(0, vocab_size, size=nin * batch_size)
    layer.configure(nin)
    layer.allocate()
    layer.prev_layer = True  # Hack to force delta buffer allocation
    layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    layer.set_deltas(deltas_buffer)

    inputs = layer.be.array(inp.reshape((nin, batch_size)))
    out = layer.fprop(inputs).get()
    W = layer.W.get()

    for i in range(nin * batch_size):
        assert np.all(W[inp[i]].T == out[:, i])

    err = dtypeu(np.random.random((nout, nin * batch_size)))
    layer.bprop(layer.be.array(err)).get()

    dw = layer.dW.get()
    unqidx, count = np.unique(inp, return_counts=True)
    dw_exp = np.zeros((1, nout))
    for wrd_id, cnt in zip(unqidx, count):
        # rebuild the expected gradient row for this id from scratch
        dw_exp[:] = 0
        cnt_exp = 0
        for i, w_id in enumerate(inp):
            if w_id == wrd_id:
                dw_exp[:] = dw_exp[:] + err[:, i]
                cnt_exp += 1
        # checked in both argument orders, as in the original test
        assert allclose_with_out(dw[wrd_id, :], dw_exp, atol=0, rtol=1e-4)
        assert allclose_with_out(dw_exp, dw[wrd_id, :], atol=0, rtol=1e-4)
        assert cnt == cnt_exp
def test_biRNN_bprop(backend_default, fargs, deltas_buffer):
    """
    Give both directions of a BiRNN the same weights, run fprop followed by
    bprop on a random sequence and on its time-reversed copy, and check that
    the resulting deltas are time-reversed copies of each other.
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional rnn
    glorot = GlorotUniform()
    bi_layer = BiRNN(hidden_size, activation=Rectlinclip(slope=0), init=glorot)
    bi_layer.configure(in_shape)
    bi_layer.prev_layer = True
    bi_layer.allocate()
    bi_layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    bi_layer.set_deltas(deltas_buffer)

    # same weight for bi-rnn backward and rnn weights
    bi_layer.W_input_b[:] = bi_layer.W_input_f
    bi_layer.W_recur_b[:] = bi_layer.W_recur_f
    bi_layer.b_b[:] = bi_layer.b_f
    bi_layer.dW[:] = 0

    # a plain recurrent layer is configured on the same deltas buffer; it is
    # set up exactly as before even though no assertion below reads it
    glorot = GlorotUniform()
    uni_layer = Recurrent(hidden_size, activation=Rectlinclip(slope=0), init=glorot)
    uni_layer.configure(in_shape)
    uni_layer.prev_layer = True
    uni_layer.allocate()
    uni_layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    uni_layer.set_deltas(deltas_buffer)

    # inputs and views
    forward_seq = np.random.random((input_size, seq_len * batch_size))
    reversed_steps = list(reversed(get_steps(forward_seq.copy(), in_shape)))
    backward_seq = con(reversed_steps, axis=1)

    # allocate gpu buffers
    dev_forward = bi_layer.be.array(forward_seq)
    dev_backward = bi_layer.be.array(backward_seq)

    # outputs
    fwd_out = bi_layer.fprop(dev_forward)
    fwd_deltas = bi_layer.bprop(fwd_out).get().copy()
    bi_layer.h_buffer[:] = 0
    bwd_out = bi_layer.fprop(dev_backward)
    bwd_deltas = bi_layer.bprop(bwd_out).get().copy()

    fwd_steps = get_steps(fwd_deltas, in_shape)
    bwd_steps = get_steps(bwd_deltas, in_shape)
    for step_fwd, step_bwd in zip(fwd_steps, reversed(bwd_steps)):
        assert allclose_with_out(step_fwd, step_bwd, rtol=0.0, atol=1.0e-5)
def test_roipooling_bprop_ref(backend_default, rois=None, inputs=None, outputs_fprop_ref=None,
                              input_errors=None):
    """
    Run the backend roipooling fprop and bprop kernels on externally supplied
    reference data. Returns immediately when no reference data is passed in.

    Arguments:
        backend_default: backend fixture.
        rois: ROI array handed to the backend kernels.
        inputs: 4-d feature maps, unpacked as (bsz, channels, height, width).
        outputs_fprop_ref: reference fprop outputs.
        input_errors: 4-d errors, unpacked as (rois_per_batch, _, roi_size, _).
    """
    if rois is None and inputs is None and outputs_fprop_ref is None and input_errors is None:
        return

    (bsz, img_fm_c, img_fm_h, img_fm_w) = inputs.shape
    (rois_per_batch, _, roi_size, _) = input_errors.shape

    outputs_fprop_ref_in = outputs_fprop_ref.reshape(rois_per_batch, -1).T
    # np.float64 is what the removed `np.float` alias (builtin float) meant
    feature_maps = inputs.reshape(bsz, -1).T.astype(np.float64, order='C')
    input_errors_in = input_errors.reshape(
        rois_per_batch, -1).T.astype(np.float64, order='C')

    # compare with GPU kernel, need to call fprop first, then bprop
    NervanaObject.be.bsz = bsz
    be = NervanaObject.be
    input_dev = be.array(feature_maps)
    rois_dev = be.array(rois)
    output_shape = (img_fm_c, roi_size, roi_size, rois_per_batch)
    outputs_dev = be.zeros(output_shape, dtype=np.float32)
    # make sure the type being int
    argmax_dev = be.zeros(output_shape, dtype=np.int32)
    input_error_dev = be.array(input_errors_in)
    output_error_dev = be.zeros(outputs_fprop_ref_in.shape)

    be.roipooling_fprop(input_dev, rois_dev, outputs_dev, argmax_dev, rois_per_batch,
                        img_fm_c, img_fm_h, img_fm_w, roi_size, roi_size, spatial_scale)

    outputs_fprop_be = outputs_dev.get().reshape(-1, rois_per_batch)

    assert allclose_with_out(
        outputs_fprop_ref_in, outputs_fprop_be, atol=1e-6, rtol=0)

    start_time = timeit()
    be.roipooling_bprop(input_error_dev, rois_dev, output_error_dev, argmax_dev,
                        rois_per_batch, img_fm_c, img_fm_h, img_fm_w, roi_size,
                        roi_size, spatial_scale)
    neon_logger.display("NervanaGPU roipooling bprop (sec): {}".format(timeit() - start_time))
    outputs_backend = output_error_dev.get()

    # NOTE(review): the bprop result is compared with the *fprop* reference;
    # kept as is, confirm that this is the intended reference.
    assert allclose_with_out(outputs_fprop_ref_in, outputs_backend, atol=1e-6, rtol=0)
def test_concat_sequence_l1_l1(backend_default, allrand_args, deltas_buffer):
    """
    Test two linear layers that are merged with a "recurrent" concat: the
    merged fprop must equal the per-stream products concatenated along
    axis 1, and each sublayer's dW must equal err . input^T.
    """
    dtypeu = np.float32
    w_rng, rngmax = allrand_args

    # Diff size input steps
    nin = 128
    steps = [32, 64]
    nout = 256
    batch_size = 16
    NervanaObject.be.bsz = batch_size
    be = NervanaObject.be

    init_unif = Uniform(low=w_rng[0], high=w_rng[1])
    layers = []
    for _ in (0, 1):
        layers.append(Sequential(Affine(nout=nout, init=init_unif)))

    inputs = []
    for step in steps:
        host_in = dtypeu(np.random.random((nin, batch_size * step)))
        inputs.append(be.array(host_in))

    merge = MergeMultistream(layers, merge="recurrent")
    assert(len(inputs) == len(layers))
    merge.configure(inputs)
    merge.allocate()
    merge.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    merge.set_deltas(deltas_buffer)

    out = merge.fprop(inputs).get()

    sublayers = [seq.layers[0] for seq in layers]
    per_stream = []
    for sub, dev_in in zip(sublayers, inputs):
        per_stream.append(np.dot(sub.W.get(), dev_in.get()))
    out_exp = np.concatenate(per_stream, axis=1)

    assert allclose_with_out(out, out_exp, atol=1e-3)

    err_lst = [dtypeu(np.random.random((nout, batch_size * step))) for step in steps]
    merge.bprop(be.array(np.concatenate(err_lst, axis=1)))

    dW_exp_lst = [np.dot(host_err, dev_in.get().T)
                  for host_err, dev_in in zip(err_lst, inputs)]

    for sub, dW_exp in zip(sublayers, dW_exp_lst):
        assert allclose_with_out(sub.dW.get(), dW_exp)
def test_step_schedule(backend_default):
    """
    Test the StepSchedule class
    """
    schedule = StepSchedule(step_config=[1, 4, 5], change=[0.1, 0.3, 0.4])
    # expected rate for epochs 0..8 when starting from a rate of 1.0
    expected_rates = [1.0, 0.1, 0.1, 0.1, 0.3, 0.4, 0.4, 0.4, 0.4]
    for ep in range(len(expected_rates)):
        observed = schedule.get_learning_rate(learning_rate=1.0, epoch=ep)
        assert allclose_with_out(expected_rates[ep], observed)
def test_hard_coded(self):
    """
    The most basic test case
    """
    be = self.be
    ones = np.ones((3, 3))
    x0 = be.array(ones * 1, name='x0', dtype=self.dtype)
    x1 = be.array(ones * 2, name='x1', dtype=self.dtype)
    x2 = be.array(ones * 3, name='x2', dtype=self.dtype)
    x3 = be.array(ones * 5, name='x3', dtype=self.dtype)

    f = x0 * x0 - x1 * x0 + x0 * x2 - x2 * x1 * x0 + x3 * x3 * x3
    ad = Autodiff(f, be)

    # hand-derived partial derivatives of f at the values above:
    #   df/dx0 = 2*x0 - x1 + x2 - x2*x1 = -3
    #   df/dx1 = -x0 - x2*x0            = -4
    #   df/dx2 = x0 - x1*x0             = -1
    #   df/dx3 = 3*x3*x3                = 75
    expected = [be.array(ones * value, dtype=self.dtype)
                for value in (-3, -4, -1, 75)]

    for variable, grad in zip((x0, x1, x2, x3), expected):
        assert allclose_with_out(ad.get_grad_asnumpyarray([variable])[0],
                                 grad.get(), atol=1e-5)
def test_roipooling_fprop_ref(backend_default, rois=None, inputs=None, outputs_ref=None):
    """
    Check the numpy reference fprop_roipooling_ref and the backend
    roipooling_fprop kernel against externally supplied reference outputs.
    Returns immediately when no reference data is passed in.

    Arguments:
        backend_default: backend fixture.
        rois: ROI array handed to the reference and to the backend kernel.
        inputs: 4-d feature maps, unpacked as (bsz, channels, height, width).
        outputs_ref: 4-d reference outputs, unpacked as
                     (rois_per_batch, _, roi_size, _).
    """
    if rois is None and inputs is None and outputs_ref is None:
        return

    (bsz, img_fm_c, img_fm_h, img_fm_w) = inputs.shape
    (rois_per_batch, _, roi_size, _) = outputs_ref.shape
    outputs_ref_in = outputs_ref.reshape(rois_per_batch, -1).T
    rois_per_image = rois_per_batch // bsz
    # np.float64 is what the removed `np.float` alias (builtin float) meant
    feature_maps = inputs.reshape(bsz, -1).T.astype(np.float64, order='C')

    # run the numpy roi fprop (function inside this test script)
    outputs_np = fprop_roipooling_ref(feature_maps, rois,
                                      img_fm_c, img_fm_h, img_fm_w,
                                      bsz, rois_per_image, roi_size, roi_size)

    assert allclose_with_out(outputs_ref_in, outputs_np, atol=1e-6, rtol=0)

    # call NervanaGPU roipooling kernel
    NervanaObject.be.bsz = bsz
    be = NervanaObject.be
    input_dev = be.array(feature_maps)
    rois_dev = be.array(rois)
    output_shape = (img_fm_c, roi_size, roi_size, rois_per_batch)
    outputs_dev = be.zeros(output_shape, dtype=np.float32)
    # make sure the type being int
    argmax_dev = be.zeros(output_shape, dtype=np.int32)

    start_time = timeit()
    be.roipooling_fprop(input_dev, rois_dev, outputs_dev, argmax_dev, rois_per_batch,
                        img_fm_c, img_fm_h, img_fm_w, roi_size, roi_size, spatial_scale)

    outputs_backend = outputs_dev.get().reshape(-1, rois_per_batch)
    neon_logger.display("Nervana backend roipooling fprop (sec): {}".format(timeit() - start_time))

    assert allclose_with_out(outputs_ref_in, outputs_backend, atol=1e-6, rtol=0)
def test_roipooling_fprop_random(backend_default, fargs):
    """
    Compare the backend roipooling_fprop kernel with the numpy reference
    fprop_roipooling_ref on a random feature map and random ROIs.

    Arguments:
        backend_default: backend fixture.
        fargs: tuple (rois_per_image, img_fm_c, img_fm_h, img_fm_w,
               roi_size, bsz).
    """
    rois_per_image, img_fm_c, img_fm_h, img_fm_w, roi_size, bsz = fargs

    # generate a random feature map and some random ROIs
    feature_maps = np.random.random(
        (img_fm_c, img_fm_h, img_fm_w, bsz)).reshape(-1, bsz)
    rois_per_batch = rois_per_image * bsz

    # first column of every ROI row: index of the image it belongs to
    rois_idx = np.vstack([i * np.ones((rois_per_image, 1)) for i in range(bsz)])

    # A random `rois` array used to be generated here and was overwritten by
    # the zeros below before it was ever read; that dead store is dropped.
    rois = np.zeros((rois_per_batch, 4))
    rois[:, 0] = np.random.random((rois_per_batch,)) * 10 / spatial_scale
    rois[:, 1] = np.random.random((rois_per_batch,)) * 25 / spatial_scale
    rois[:, 2] = (
        np.random.random((rois_per_batch,)) * 27 + (img_fm_w - 27)) / spatial_scale
    rois[:, 3] = (
        np.random.random((rois_per_batch,)) * 12 + (img_fm_h - 12)) / spatial_scale

    rois = np.hstack((rois_idx, rois))

    # run the numpy roi fprop (function inside this test script)
    outputs_np = fprop_roipooling_ref(feature_maps, rois,
                                      img_fm_c, img_fm_h, img_fm_w,
                                      bsz, rois_per_image, roi_size, roi_size)

    # call backend roipooling kernel
    NervanaObject.be.bsz = bsz
    be = NervanaObject.be
    input_dev = be.array(feature_maps)
    rois_dev = be.array(rois)
    output_shape = (img_fm_c, roi_size, roi_size, rois_per_batch)
    outputs_dev = be.zeros(output_shape)
    # make sure the type being int
    argmax_dev = be.zeros(output_shape, np.int32)

    start_time = timeit()
    be.roipooling_fprop(input_dev, rois_dev, outputs_dev, argmax_dev, rois_per_batch,
                        img_fm_c, img_fm_h, img_fm_w, roi_size, roi_size, spatial_scale)
    neon_logger.display("Nervana backend roipooling fprop (sec): {}".format(timeit() - start_time))

    outputs_be = outputs_dev.get().reshape(-1, rois_per_batch)
    assert allclose_with_out(outputs_np, outputs_be, atol=1e-6, rtol=0)
def test_model_serialize(backend_default, data):
    """
    Train a small multistream model briefly, save it with its states, reload
    it, and check that outputs, states and params survive the round trip.

    Arguments:
        backend_default: backend fixture.
        data: path handed to the MNIST dataset loader.
    """
    dataset = MNIST(path=data)
    (X_train, y_train), (X_test, y_test), nclass = dataset.load_data()
    train_set = ArrayIterator(
        [X_train, X_train], y_train, nclass=nclass, lshape=(1, 28, 28))

    init_norm = Gaussian(loc=0.0, scale=0.01)

    # initialize model
    path1 = Sequential([
        Conv((5, 5, 16), init=init_norm, bias=Constant(0), activation=Rectlin()),
        Pooling(2),
        Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin())
    ])
    path2 = Sequential([
        Affine(nout=100, init=init_norm, bias=Constant(0), activation=Rectlin()),
        Dropout(keep=0.5),
        Affine(nout=20, init=init_norm, bias=init_norm, activation=Rectlin())
    ])
    layers = [
        MergeMultistream(layers=[path1, path2], merge="stack"),
        Affine(nout=20, init=init_norm, batch_norm=True, activation=Rectlin()),
        Affine(nout=10, init=init_norm, activation=Logistic(shortcut=True))
    ]

    tmp_save = 'test_model_serialize_tmp_save.pickle'
    mlp = Model(layers=layers)
    mlp.optimizer = GradientDescentMomentum(learning_rate=0.1, momentum_coef=0.9)
    mlp.cost = GeneralizedCost(costfunc=CrossEntropyBinary())
    mlp.initialize(train_set, cost=mlp.cost)
    n_test = 3
    num_epochs = 3

    # Train model for num_epochs; each epoch stops once the batch index exceeds n_test
    for epoch in range(num_epochs):
        for i, (x, t) in enumerate(train_set):
            x = mlp.fprop(x)
            delta = mlp.cost.get_errors(x, t)
            mlp.bprop(delta)
            mlp.optimizer.optimize(mlp.layers_to_optimize, epoch=epoch)
            if i > n_test:
                break

    # Get expected outputs of the first batches and states of all layers
    outputs_exp = []
    pdicts_exp = [l.get_params_serialize() for l in mlp.layers_to_optimize]
    for i, (x, t) in enumerate(train_set):
        outputs_exp.append(mlp.fprop(x, inference=True))
        if i > n_test:
            break

    # Everything that touches the temporary file runs under try/finally so the
    # file is removed even when one of the assertions below fails.
    try:
        # Serialize model
        mlp.save_params(tmp_save, keep_states=True)

        # Load model
        mlp = Model(tmp_save)
        mlp.initialize(train_set)

        outputs = []
        pdicts = [l.get_params_serialize() for l in mlp.layers_to_optimize]
        for i, (x, t) in enumerate(train_set):
            outputs.append(mlp.fprop(x, inference=True))
            if i > n_test:
                break

        # Check outputs, states, and params are the same
        for output, output_exp in zip(outputs, outputs_exp):
            assert allclose_with_out(output.get(), output_exp.get())

        for pd, pd_exp in zip(pdicts, pdicts_exp):
            for s, s_e in zip(pd['states'], pd_exp['states']):
                if isinstance(s, list):  # this is the batch norm case
                    for _s, _s_e in zip(s, s_e):
                        assert allclose_with_out(_s, _s_e)
                else:
                    assert allclose_with_out(s, s_e)
            for p, p_e in zip(pd['params'], pd_exp['params']):
                assert type(p) is type(p_e)
                if isinstance(p, list):  # this is the batch norm case
                    for _p, _p_e in zip(p, p_e):
                        assert allclose_with_out(_p, _p_e)
                elif isinstance(p, np.ndarray):
                    assert allclose_with_out(p, p_e)
                else:
                    assert p == p_e
    finally:
        # save_params may have failed before creating the file
        if os.path.exists(tmp_save):
            os.remove(tmp_save)
def check_lstm(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Run the neon LSTM layer and the numpy reference LSTM (RefLSTM) on the same
    random input and weights, and assert that fprop and bprop results agree.

    Arguments:
        seq_len, input_size, hidden_size, batch_size: problem dimensions.
        init_func: initializer for the model params.
        inp_moms: [mean, std dev] used to shift/scale the random input.
    """
    steps_x_batch = seq_len * batch_size
    input_shape = (input_size, steps_x_batch)
    hidden_shape = (hidden_size, steps_x_batch)

    backend = NervanaObject.be
    backend.batch_size = batch_size
    backend.bsz = batch_size

    def _agree(got, want):
        # every comparison in this check uses the same absolute tolerance
        return allclose_with_out(got, want, rtol=0.0, atol=1.5e-5)

    # neon LSTM
    lstm = LSTM(hidden_size, init_func,
                activation=Tanh(), gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)

    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()

    dtree = DeltasTree()
    lstm.allocate_deltas(dtree)
    dtree.allocate_buffers()
    lstm.set_deltas(dtree)

    lstm.fprop(inpa)

    # reference numpy LSTM, loaded with the neon layer's weights and biases:
    # row 0 holds the bias, then the input weights, then the recurrent weights
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)
    first_recur_row = input_size + 1
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:first_recur_row, :] = lstm.W_input.get().T
    WLSTM[first_recur_row:] = lstm.W_recur.get().T

    # the reference works on (seq_len, batch_size, feature) tensors
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # bring the reference results back to the neon (feature, time * batch) layout
    Hout_ref = Hout_ref.reshape(steps_x_batch, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(steps_x_batch, hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(steps_x_batch, hidden_size).T

    # compare results
    neon_logger.display('====Verifying IFOG====')
    assert _agree(lstm.ifog_buffer.get(), IFOGf_ref)

    neon_logger.display('====Verifying cell states====')
    assert _agree(lstm.c_act_buffer.get(), Ct_ref)

    neon_logger.display('====Verifying hidden states====')
    assert _agree(lstm.outputs.get(), Hout_ref)

    neon_logger.display('fprop is verified')

    # now test the bprop with a random deltas tensor
    deltas = np.random.randn(*hidden_shape)
    lstm.bprop(lstm.be.array(deltas))

    # grab the gradients from the neon gradient buffers
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref, dh0_ref) = lstm_ref.backward(deltas_ref, batch_cache)

    # split the stacked reference gradient the same way the weights were stacked
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(steps_x_batch, input_size).T

    # compare results
    neon_logger.display('Making sure neon LSTM match numpy LSTM in bprop')

    neon_logger.display('====Verifying update on W_recur====')
    assert _agree(dWrecur_neon, dWrecur_ref.T)

    neon_logger.display('====Verifying update on W_input====')
    assert _agree(dWinput_neon, dWinput_ref.T)

    neon_logger.display('====Verifying update on bias====')
    assert _agree(db_neon.flatten(), db_ref)

    neon_logger.display('====Verifying output delta====')
    assert _agree(lstm.out_deltas_buffer.get(), dX_ref)

    neon_logger.display('bprop is verified')
def test_beamsearch(backend_default):
    """
    Simulated beam search on a minibatch of 2, for 4 time steps. The
    LSTM states are real but the "softmax outputs" z are hardcoded and
    not taken from the network.
    There are 6 tokens the network outputs, and they have probabilities
    like exp(1), exp(5), exp(7)

    The test asserts that the score_lists assigned by _beamsearch_step(z_list)
    are equal to the probabilities computed manually adding probabilities
    to z_list.
    """
    be = backend_default

    batch_size = 2
    be.bsz = batch_size
    time_steps = 4
    nout = 6
    num_beams = 3

    # create unused layers
    activation = Tanh()
    gate_activation = Logistic()
    init_ary = np.eye(nout)
    init = Array(init_ary)
    encoder = LSTM(nout, init, activation=activation,
                   gate_activation=gate_activation, name="Enc")
    decoder = LSTM(nout, init, activation=activation,
                   gate_activation=gate_activation, name="Dec")

    class DummyFProp():
        """
        Constructs an artificial beam search example with known correct outputs.
        This is called inside a nested loop over steps, num_life. In the first
        time step there is one life beam, after that, 3 life beams per step.
        There are 4 time steps total. Each beamsearch_step builds one list over
        num_life beams.

        At t=0, the winners for ex0 are 1, 4, 5 (indexed by their position) and
        winners for ex1 are 2,4,5. From there we continue the beam for ex0:
            12, 13, 14              6+2=8  6+3=9  6+2=8
            40, 43, 45  with scores 5+4=9  5+3=8  5+7=12  three new winners 45, 52, 55
            50, 52, 55              5+4=9  5+6=11 5+5=10

        for ex2
            1 4 5  with scores  5 4 7
        we get the three winners 1, 4, 5 and continue (just taking the
        3 in order, no sorting)
            10 12 13 14 (not unique!)  5+2=7  5+2=7  5+3=8
            41 42 43  with scores      4+6=10 4+5=9  4+7=11  winners 43 51 52
            51 52 53                   7+4=11 7+6=13 7+3=10  scores  11 11 13
        continue from the three winners 43 51 52
            431 433 434              11+10=21 11+3=14 11+9=20
            511 512 513  with scores 11+6=17  11+5=16 11+7=18  winners 431 434 520
            520 521 522              13+8=21  13+4=17 13+6=19  scores  21  20  21
        continue from three winners 431 511 513 (going along beams, the matches
        in a beam)
            4310 4312 4313 4314  21+2=23  21+2=23 21+3=24 21+10=31 (not unique!)
            4341 4342 4343  with scores  20+10=30 20+5=25 20+7=27  winners 4314 4341 5204
            5200 5202 5204               21+8=29  21+6=27 21+10=31 scores  31   30   31
        overall winners are 4314 4341 5204
        """

        def __init__(self):
            # index of the table returned by the most recent fprop call
            self.i = -1

            # One table of hard-coded log-scores per expected fprop call; each
            # row belongs to one example of the minibatch.
            log_score_tables = [
                # t=0
                # X x x <-- winners: 1, 4, 5 (for example 0)
                [[1, 6, 2, 1, 5, 5], [1, 5, 2, 2, 4, 7]],
                # t=1
                # x x x <-- give we picked 4: new winners 2,3,4
                [[1, 1, 2, 3, 2, 1], [2, 1, 2, 3, 2, 1]],
                # x x x <-- give we picked 5:
                # new winners 0,3,[5]
                # score 12
                [[4, 1, 2, 3, 1, 7], [2, 6, 5, 7, 2, 4]],
                # x X X <-- give we picked 1:
                # new winners 0,[2],[5]
                # scores 12, 11
                [[4, 1, 6, 3, 1, 5], [1, 4, 6, 3, 2, 1]],
                # t=2
                # example 0: given constructed (1, 5), score 11: 3, 4; scores 21, 20
                [[1, 1, 2, 10, 9, 1], [2, 10, 2, 3, 9, 1]],
                # example 0: given constructed (5, 5), score 12: none selected from this beam
                [[4, 1, 2, 3, 1, 7], [2, 6, 5, 7, 2, 4]],
                # example 0: given constructed (1, 2), score 12: 1; score 20
                [[4, 8, 6, 3, 1, 5], [8, 4, 6, 3, 1, 1]],
                # t=3
                # example 0: given constructed (1, 5, 4), score 20: 1, score 30
                [[1, 10, 2, 1, 1, 1], [2, 1, 2, 3, 10, 1]],
                # example 0: given constructed (1, 2, 1), score 20: 5, score 30
                [[4, 1, 2, 3, 1, 10], [2, 10, 5, 7, 2, 4]],
                # example 0: given constructed (1, 5, 3), score 21: 4, score 31
                [[4, 8, 6, 3, 10, 5], [8, 4, 6, 3, 10, 1]],
            ]

            # exponentiate, move to the backend, and transpose each table
            self.z_list = [be.array(np.exp(np.array(table))).T
                           for table in log_score_tables]

        def fprop(self, z, inference=True, init_state=None):
            # ignore the real input and hand out the next canned table
            self.i += 1
            return self.z_list[self.i]

    def final_state():
        return be.zeros_like(decoder.h[-1])

    class InObj(NervanaObject):
        def __init__(self):
            self.shape = (nout, time_steps)
            self.decoder_shape = (nout, time_steps)

    # replace the decoder's forward pass with the canned outputs
    decoder.fprop = DummyFProp().fprop

    layers = Seq2Seq([encoder, decoder], decoder_connections=[0])
    layers.decoder._recurrent[0].final_state = final_state

    in_obj = InObj()
    layers.configure(in_obj)  # made zeros because zeros have shape
    layers.allocate()
    layers.allocate_deltas(None)

    beamsearch = BeamSearch(layers)
    inputs = be.iobuf(in_obj.shape)
    beamsearch.beamsearch(inputs, num_beams=num_beams)

    # expected candidates, one row per beam
    ex0 = np.array([[1, 5, 4, 1],
                    [1, 2, 1, 5],
                    [1, 5, 3, 4]])
    ex1 = np.array([[5, 1, 4, 4],
                    [5, 1, 1, 1],
                    [5, 2, 0, 4]])

    # extract all candidates
    examples = reformat_samples(beamsearch, num_beams, batch_size)
    assert allclose_with_out(examples[0], ex0)
    assert allclose_with_out(examples[1], ex1)
def test_dconv_rand(backend_default, rand_convargs, deltas_buffer):
    """
    Compare the forward pass of the neon Deconvolution layer with the
    reference DeconvRefLayer on random inputs and uniformly initialised weights.

    Arguments:
        backend_default: backend fixture (the backend used is NervanaObject.be).
        rand_convargs: tuple unpacked as
                       (indim, nifm, fshape, nofm, batch_size, rngmax, w_rng).
        deltas_buffer: delta buffer fixture used to allocate the layer's deltas.
    """
    indim, nifm, fshape, nofm, batch_size, rngmax, w_rng = rand_convargs
    if isinstance(NervanaObject.be, NervanaGPU) and NervanaObject.be.compute_capability < (5, 0):
        if nofm % 4 != 0:
            # the reason is passed positionally: the ``msg`` keyword is no
            # longer accepted by recent pytest releases
            pytest.skip("C dim must be a multiple of 4 for Kepler bprop kernel")
    NervanaObject.be.bsz = batch_size
    dtypeu = np.float32
    inp_rng = [0.0, rngmax]
    init_unif = Uniform(low=w_rng[0], high=w_rng[1])
    inshape = (indim, indim, nifm)
    insize = np.prod(inshape)

    # generate neon deconv layer
    # need to switch to nofm here...
    neon_layer = Deconvolution(fshape=(fshape, fshape, nofm), strides=1,
                               padding=0, init=init_unif)

    # generate reference deconv layer
    ref_layer = DeconvRefLayer(1, batch_size, identity, inshape[0], inshape[1:3],
                               (fshape, fshape), nofm, 1, dtypeu)

    # setup input in range inp_rng
    inpa = np.random.random((insize, batch_size))
    inpa *= (inp_rng[1] - inp_rng[0])
    inpa += inp_rng[0]
    inpa = inpa.astype(dtypeu)
    inp = neon_layer.be.array(inpa)
    inp.lshape = inshape

    # run fprop on neon
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_out = neon_layer.fprop(inp).get()
    neon_layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    neon_layer.set_deltas(deltas_buffer)

    # pull neon weights into ref layer weights
    ref_layer.weights = neon_layer.W.get().T

    # Run the reference forward pass BEFORE snapshotting its result; without
    # this call ref_out would hold whatever berror contained beforehand and
    # the comparison below would not test anything.
    ref_layer.fprop(inpa.T)
    ref_out = np.copy(ref_layer.berror)

    # estimate the numerical precision by permuting the order of operations
    ref_layer.fprop(inpa.T, permute=True)
    ref_out2 = ref_layer.berror
    atol = 10 * np.max(np.abs(ref_out - ref_out2))
    assert allclose_with_out(ref_out.T, neon_out, atol=atol, rtol=0.0), \
        '%e %e' % (np.max(np.abs(ref_out.T - neon_out)), atol)

    # generate err array
    # NOTE(review): erra is built but not consumed in this function.
    erra = np.random.random(neon_out.shape)
    erra *= (inp_rng[1] - inp_rng[0])
    erra += inp_rng[0]
    erra = erra.astype(dtypeu)
def check_rnn(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Run the neon Recurrent layer and the numpy reference RNN (RefRecurrent) on
    the same random input and weights, and assert that the hidden states and
    the weight/bias gradients agree.

    Arguments:
        seq_len, input_size, hidden_size, batch_size: problem dimensions.
        init_func: initializer for the model params.
        inp_moms: [mean, std dev] used to shift/scale the random input
                  (only read, never modified).
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, activation=Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()
    rnn.set_deltas([rnn.be.iobuf(rnn.in_shape)])
    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     dh_ref_list, d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    neon_logger.display('====Verifying hidden states====')
    # The comparison result used to be logged only, so a forward-pass mismatch
    # could never fail the check even though "fprop is verified" was printed.
    assert allclose_with_out(rnn.outputs.get(),
                             h_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('fprop is verified')

    neon_logger.display('====Verifying update on W and b ====')
    neon_logger.display('dWxh')
    assert allclose_with_out(dWxh_neon,
                             dWxh_ref,
                             rtol=0.0,
                             atol=1.0e-5)
    neon_logger.display('dWhh')
    assert allclose_with_out(dWhh_neon,
                             dWhh_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('====Verifying update on bias====')
    neon_logger.display('db')
    assert allclose_with_out(db_neon,
                             db_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('bprop is verified')
def test_bibn(backend_default, fargs, deltas_buffer):
    """
    Check that the batch-norm helpers of BiBNRNN (_fprop_bn / _bprop_bn) give
    the same results as calling the backend's compound batch-norm kernels
    directly on the same buffers.

    Arguments:
        backend_default: backend fixture.
        fargs: tuple unpacked as (seq_len, input_size, hidden_size, batch_size).
        deltas_buffer: delta buffer fixture used to allocate the layer's deltas.
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    NervanaObject.be.bsz = batch_size
    # cap the hidden size at 10
    hidden_size = min(10, hidden_size)

    # setup the bi-directional rnn
    birnn = BiBNRNN(hidden_size,
                    activation=Rectlinclip(slope=0),
                    init=GlorotUniform())
    birnn.configure((input_size, seq_len))
    birnn.prev_layer = True
    birnn.allocate()
    birnn.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    birnn.set_deltas(deltas_buffer)

    be = birnn.be
    ff_buffer = birnn.h_ff_buffer

    # ---- fprop: fill the feed-forward buffer with random data ----
    inp_np = np.random.random(ff_buffer.shape)
    inp_be = be.array(inp_np)
    ff_buffer[:] = inp_np

    # scratch tensors for the direct backend batch-norm call
    ref_xsum = be.zeros_like(birnn.xmean)
    ref_xvar = be.zeros_like(birnn.xvar)
    ref_gmean = be.zeros_like(birnn.gmean)
    ref_gvar = be.zeros_like(birnn.gvar)
    ref_gamma = be.ones(birnn.gamma.shape)
    ref_beta = be.zeros_like(birnn.beta)
    grad_gamma = be.zeros_like(ref_gamma)
    grad_beta = be.zeros_like(ref_beta)
    out_ref = be.zeros_like(ff_buffer)

    ref_xsum[:] = be.sum(ff_buffer, axis=1)
    be.compound_fprop_bn(ff_buffer, ref_xsum, ref_xvar, ref_gmean, ref_gvar,
                         ref_gamma, ref_beta, out_ref,
                         birnn.eps, birnn.rho, False,
                         accumbeta=0, relu=False)

    # call the bibnrnn layer fprop_bn
    out_bn = birnn._fprop_bn(ff_buffer, inference=False)

    assert allclose_with_out(out_bn.get(), out_ref.get(), rtol=0.0, atol=1.0e-5)

    # ---- bprop: push random errors through both implementations ----
    err_np = np.random.random(ff_buffer.shape)
    err_be = be.array(err_np)
    err_out_ref = be.empty_like(err_be)
    be.compound_bprop_bn(err_out_ref, grad_gamma, grad_beta,
                         err_be, inp_be,
                         ref_xsum, ref_xvar, ref_gamma, birnn.eps)

    err_out_bn = birnn._bprop_bn(err_be, out_bn)

    assert allclose_with_out(err_out_bn.get(), err_out_ref.get(), rtol=0.0, atol=2.5e-5)
def test_branch_model_fork_cpu(backend_cpu64):
    """
    Build a two-path Tree model that forks at a BranchNode and compare its
    fprop outputs and bprop deltas with a reference assembled by hand from
    separately configured copies of the same layers.

    Arguments:
        backend_cpu64: backend fixture (the backend used is NervanaObject.be).
    """
    from neon.layers import BranchNode, Tree
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 32
    bnode = BranchNode()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    # Size the input from the batch size just set on the backend instead of
    # relying on a name defined outside this function being in sync with it.
    inpa = np.random.random((insize, be.bsz))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()

    neon_out_dev = neon_layer.fprop(inp)
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)

    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers

    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # configure every reference branch behind the reference trunk
    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy the trunk weights of the model under test into the reference trunk
    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        temp_deltas = DeltasTree()
        temp_deltas.proc_layer(ll)
        temp_deltas.allocate_buffers()
        ll.set_deltas(temp_deltas)

    # same for the second (forked) path
    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            temp_deltas = DeltasTree()
            temp_deltas.proc_layer(ll)
            temp_deltas.allocate_buffers()
            ll.set_deltas(temp_deltas)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    # run each inception branch and stack its output into merge_output
    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out_ref, neon_out[0], rtol=0)

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    assert allclose_with_out(neon_out_ref2, neon_out[1])

    neon_logger.display("Beginning Back prop")
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    for bb, errb in zip((b1, b2, b3), neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = alpha2 * lbranch2[0].deltas
    ref_deltas[:] = ref_deltas + b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(middle_neon_deltas, neon_ref_deltas, rtol=0)

    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    assert allclose_with_out(bottom_neon_deltas, bottom_neon_ref_deltas, rtol=0)
def test_branch_model(backend_gpu):
    """
    Build a Sequential model containing an inception block and compare its
    fprop output and the deltas at the root of the block with a reference
    assembled by hand from separately configured copies of the same layers.

    Arguments:
        backend_gpu: backend fixture (the backend used is NervanaObject.be).
    """
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 64
    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    # Size the input from the batch size just set on the backend instead of
    # relying on a name defined outside this function being in sync with it.
    inpa = np.random.random((insize, be.bsz))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    neon_logger.display(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()

    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # configure every reference branch behind the reference trunk
    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy the trunk weights of the model under test into the reference trunk
    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        temp_buff = DeltasTree()
        ll.allocate_deltas(temp_buff)
        temp_buff.allocate_buffers()
        ll.set_deltas(temp_buff)

    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            temp_buff = DeltasTree()
            ll.allocate_deltas(temp_buff)
            temp_buff.allocate_buffers()
            ll.set_deltas(temp_buff)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)

    # run each inception branch and stack its output into merge_output
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out, neon_out_ref, rtol=0)

    neon_logger.display("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)

    neon_deltas = err.get()
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(neon_deltas, neon_ref_deltas, rtol=0)
def test_conv_rand(backend_default, rand_convargs, deltas_buffer):
    """
    Compare fprop and bprop of the neon Convolution layer with the reference
    ConvLayerRef on random inputs, random errors and uniformly initialised
    weights. Tolerances are derived from the reference layer itself by
    re-running it with a permuted order of operations.

    Arguments:
        backend_default: backend fixture (the backend used is NervanaObject.be).
        rand_convargs: tuple unpacked as (indim, nifm, fshape, nofm,
                       batch_size, stride, rng_max, w_rng, pad).
        deltas_buffer: delta buffer fixture used to allocate the layer's deltas.
    """
    indim, nifm, fshape, nofm, batch_size, stride, rng_max, w_rng, pad = rand_convargs
    if isinstance(NervanaObject.be, NervanaGPU) and NervanaObject.be.compute_capability < (5, 0):
        if nifm % 4 != 0:
            # the reason is passed positionally: the ``msg`` keyword is no
            # longer accepted by recent pytest releases
            pytest.skip("C dim must be a multiple of 4 for Kepler bprop kernel")
    NervanaObject.be.bsz = batch_size
    inp_rng = [0.0, rng_max]
    dtypeu = np.float32
    init_unif = Uniform(low=w_rng[0], high=w_rng[1])
    inshape = (nifm, indim, indim)
    insize = np.prod(inshape)

    # generate neon conv layer
    neon_layer = Convolution(fshape=(fshape, fshape, nofm),
                             strides=stride, padding=pad, init=init_unif)

    # generate the reference layer
    ref_layer = ConvLayerRef(1, batch_size, identity, inshape[0], inshape[1:3],
                             (fshape, fshape), nofm, stride, dtypeu, padding=pad)

    # setup input in range inp_rng
    inpa = np.random.random((insize, batch_size))
    inpa *= inp_rng[1] - inp_rng[0]
    inpa += inp_rng[0]
    inpa = inpa.astype(dtypeu)
    inp = neon_layer.be.array(inpa)
    inp.lshape = inshape

    # run fprop on neon
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    neon_layer.set_deltas(deltas_buffer)
    neon_out = neon_layer.fprop(inp).get()

    # pull neon weights into ref layer weights
    ref_layer.weights = neon_layer.W.get().T
    ref_layer.fprop(inpa.T)
    ref_out = np.copy(ref_layer.y)

    # estimate the numerical precision by
    # permuting order of ops in ref layer
    # fprop calculation
    ref_layer.fprop(inpa.T, permute=True)
    ref_out_perm = ref_layer.y
    atol = 4 * np.max(np.abs(ref_out - ref_out_perm))

    # compare ref and neon layer fprop outputs
    # using the empirically determined atol
    assert allclose_with_out(ref_out.T, neon_out, atol=atol, rtol=1.e-4)

    # generate random deltas array
    erra = np.random.random(neon_out.shape)
    erra *= (inp_rng[1] - inp_rng[0])
    erra += inp_rng[0]
    erra = erra.astype(dtypeu)
    err = neon_layer.be.array(erra)

    # run neon bprop
    neon_deltas = neon_layer.bprop(err).get()
    neon_dW = neon_layer.dW.get()

    # run ref code bprop
    ref_layer.bprop(erra.T, 1.0)
    ref_deltas = np.copy(ref_layer.berror_nopad.T)
    ref_dW = np.copy(ref_layer.updates)

    # estimate precision using permutation
    # of operation order on ref layer code
    ref_layer.bprop(erra.T, 1.0, permute=True)
    ref_deltas_perm = ref_layer.berror_nopad.T
    ref_dW_perm = ref_layer.updates

    atol = 4 * np.max(np.abs(ref_deltas - ref_deltas_perm))
    assert allclose_with_out(ref_deltas, neon_deltas, atol=atol, rtol=1.e-4)

    atol = 4 * np.max(np.abs(ref_dW - ref_dW_perm))
    assert allclose_with_out(ref_dW.T, neon_dW, atol=atol, rtol=1.e-4)
def check_gru(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0], add_init_state=False):
    """
    Run the neon GRU layer and the numpy reference GRU (RefGRU) on the same
    random input and weights, and assert that hidden states, gate deltas and
    weight/bias gradients agree.

    Arguments:
        seq_len, input_size, hidden_size, batch_size: problem dimensions.
        init_func: initializer for the model params.
        inp_moms: [mean, std dev] used to shift/scale the random input.
        add_init_state: when True, a random initial hidden state is passed to
                        both implementations.
    """
    steps_x_batch = seq_len * batch_size
    input_shape = (input_size, steps_x_batch)
    output_shape = (hidden_size, steps_x_batch)
    slice_shape = (hidden_size, batch_size)

    backend = NervanaObject.be
    backend.batch_size = batch_size
    backend.bsz = batch_size

    def _agree(got, want):
        # every comparison in this check uses the same absolute tolerance
        return allclose_with_out(got, want, rtol=0.0, atol=1.0e-5)

    # neon GRU
    gru = GRU(hidden_size, init_func,
              activation=Tanh(), gate_activation=Logistic())

    # random input and random deltas
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inp_dev = gru.be.array(inp)
    deltas = np.random.randn(*output_shape)

    # run neon fprop
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()

    test_buffer = DeltasTree()
    gru.allocate_deltas(test_buffer)
    test_buffer.allocate_buffers()
    gru.set_deltas(test_buffer)

    if add_init_state:
        init_state = np.random.rand(*slice_shape)*inp_moms[1] + inp_moms[0]
        init_state_dev = gru.be.array(init_state)
        gru.fprop(inp_dev, init_state=init_state_dev)
    else:
        gru.fprop(inp_dev)

    # reference numpy GRU
    gru_ref = RefGRU(input_size, hidden_size)
    WGRU = gru_ref.weights

    # row ranges of the r, z and candidate blocks inside the neon tensors
    r_range = list(range(hidden_size))
    z_range = list(range(hidden_size, hidden_size * 2))
    c_range = list(range(hidden_size * 2, hidden_size * 3))

    # make ref weights and biases the same with neon model
    weight_copies = (
        (gru_ref.weights_ind_br, gru.b, r_range),
        (gru_ref.weights_ind_bz, gru.b, z_range),
        (gru_ref.weights_ind_bc, gru.b, c_range),
        (gru_ref.weights_ind_Wxr, gru.W_input, r_range),
        (gru_ref.weights_ind_Wxz, gru.W_input, z_range),
        (gru_ref.weights_ind_Wxc, gru.W_input, c_range),
        (gru_ref.weights_ind_Rhr, gru.W_recur, r_range),
        (gru_ref.weights_ind_Rhz, gru.W_recur, z_range),
        (gru_ref.weights_ind_Rhc, gru.W_recur, c_range),
    )
    for ref_index, neon_tensor, rows in weight_copies:
        WGRU[ref_index][:] = neon_tensor.get()[rows]

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    if add_init_state:
        init_state_ref = init_state.copy()
        ref_results = gru_ref.lossFun(inp_ref, deltas_ref, init_state_ref)
    else:
        ref_results = gru_ref.lossFun(inp_ref, deltas_ref)
    (dWGRU_ref, h_ref_list, dh_ref_list,
     dr_ref_list, dz_ref_list, dc_ref_list) = ref_results

    neon_logger.display('====Verifying hidden states====')
    assert _agree(gru.outputs.get(), h_ref_list)

    neon_logger.display('fprop is verified')

    # now test the bprop
    neon_logger.display('Making sure neon GRU matches numpy GRU in bprop')
    gru.bprop(gru.be.array(deltas))

    # grab the gradients from the neon gradient buffers
    dWinput_neon = gru.dW_input.get()
    dWrecur_neon = gru.dW_recur.get()
    db_neon = gru.db.get()
    drzc_neon = gru.rzhcan_delta_buffer.get()

    gate_delta_checks = (
        ('====Verifying r deltas ====', drzc_neon[r_range], dr_ref_list),
        ('====Verifying z deltas ====', drzc_neon[z_range], dz_ref_list),
        ('====Verifying hcan deltas ====', drzc_neon[c_range], dc_ref_list),
    )

    update_checks = (
        ('====Verifying update on W_input====', (
            ('dWxr', dWinput_neon[r_range], dWGRU_ref[gru_ref.dW_ind_Wxr]),
            ('dWxz', dWinput_neon[z_range], dWGRU_ref[gru_ref.dW_ind_Wxz]),
            ('dWxc', dWinput_neon[c_range], dWGRU_ref[gru_ref.dW_ind_Wxc]),
        )),
        ('====Verifying update on W_recur====', (
            ('dWrr', dWrecur_neon[r_range], dWGRU_ref[gru_ref.dW_ind_Rhr]),
            ('dWrz', dWrecur_neon[z_range], dWGRU_ref[gru_ref.dW_ind_Rhz]),
            ('dWrc', dWrecur_neon[c_range], dWGRU_ref[gru_ref.dW_ind_Rhc]),
        )),
        ('====Verifying update on bias====', (
            ('dbr', db_neon[r_range], dWGRU_ref[gru_ref.dW_ind_br]),
            ('dbz', db_neon[z_range], dWGRU_ref[gru_ref.dW_ind_bz]),
            ('dbc', db_neon[c_range], dWGRU_ref[gru_ref.dW_ind_bc]),
        )),
    )

    for banner, got, want in gate_delta_checks:
        neon_logger.display(banner)
        assert _agree(got, want)

    for banner, entries in update_checks:
        neon_logger.display(banner)
        for label, got, want in entries:
            neon_logger.display(label)
            assert _agree(got, want)

    neon_logger.display('bprop is verified')
def checkSequentialMatchesBatch():
    """
    check LSTM I/O forward/backward interactions

    Runs LSTM.forward / LSTM.backward one time step at a time and once over
    the whole sequence, and asserts that both give the same hidden states and
    the same gradients.
    """
    n, b, d = (5, 3, 4)  # sequence length, batch size, hidden size
    input_size = 10
    WLSTM = LSTM.init(input_size, d)  # input size, hidden size
    X = np.random.randn(n, b, input_size)
    h0 = np.random.randn(b, d)
    c0 = np.random.randn(b, d)

    # sequential forward
    cprev = c0
    hprev = h0
    caches = [{} for _ in range(n)]
    Hcat = np.zeros((n, b, d))
    for t in range(n):
        xt = X[t:t + 1]
        _, cprev, hprev, cache = LSTM.forward(xt, WLSTM, cprev, hprev)
        caches[t] = cache
        Hcat[t] = hprev

    # sanity check: perform batch forward to check that we get the same thing
    H, _, _, batch_cache = LSTM.forward(X, WLSTM, c0, h0)
    assert allclose_with_out(H, Hcat), "Sequential and Batch forward don't match!"

    # eval loss
    wrand = np.random.randn(*Hcat.shape)
    # loss = np.sum(Hcat * wrand)
    dH = wrand

    # get the batched version gradients
    BdX, BdWLSTM, Bdc0, Bdh0 = LSTM.backward(dH, batch_cache)

    # now perform sequential backward
    dX = np.zeros_like(X)
    dWLSTM = np.zeros_like(WLSTM)
    dc0 = np.zeros_like(c0)
    dh0 = np.zeros_like(h0)
    dcnext = None
    dhnext = None
    for t in reversed(range(n)):
        dht = dH[t].reshape(1, b, d)
        dx, dWLSTMt, dcprev, dhprev = LSTM.backward(
            dht, caches[t], dcnext, dhnext)
        dhnext = dhprev
        dcnext = dcprev

        dWLSTM += dWLSTMt  # accumulate LSTM gradient
        dX[t] = dx[0]
        if t == 0:
            dc0 = dcprev
            dh0 = dhprev

    # and make sure the gradients match
    neon_logger.display(
        'Making sure batched version agrees with sequential version: '
        '(should all be True)')
    gradient_checks = (
        ('dX', np.allclose(BdX, dX)),
        ('dWLSTM', np.allclose(BdWLSTM, dWLSTM)),
        ('dc0', np.allclose(Bdc0, dc0)),
        ('dh0', np.allclose(Bdh0, dh0)),
    )
    # log every result first so all four are visible even when one fails
    for _, matched in gradient_checks:
        neon_logger.display(matched)
    # the results used to be displayed only; a mismatch now fails the check
    for name, matched in gradient_checks:
        assert matched, 'Sequential and Batch backward differ for ' + name
def test_biRNN_fprop_rnn(backend_default, fargs, deltas_buffer):
    """
    Compare BiRNN fprop against a Recurrent layer that uses the same weights.

    The backward-direction weights of the BiRNN are copied from its forward
    direction, and the Recurrent layer receives those same forward weights.
    The BiRNN is run on a random input and on that input with its time steps
    reversed; the per-step outputs are then checked against each other and
    against the Recurrent layer output.
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # tolerance shared by every comparison below
    tol = dict(rtol=0.0, atol=1.0e-5)

    # set up the bi-directional layer
    birnn_init = GlorotUniform()
    birnn = BiRNN(hidden_size, activation=Rectlinclip(slope=0),
                  init=birnn_init)
    birnn.configure(in_shape)
    birnn.prev_layer = True
    birnn.allocate()

    # set up the uni-directional layer
    rnn_init = GlorotUniform()
    rnn = Recurrent(hidden_size, activation=Rectlinclip(slope=0),
                    init=rnn_init)
    rnn.configure(in_shape)
    rnn.prev_layer = True
    rnn.allocate()

    # BiRNN backward direction reuses the forward-direction weights ...
    birnn.W_input_b[:] = birnn.W_input_f
    birnn.W_recur_b[:] = birnn.W_recur_f
    birnn.b_b[:] = birnn.b_f
    birnn.dW[:] = 0

    # ... and so does the plain recurrent layer
    rnn.W_input[:] = birnn.W_input_f
    rnn.W_recur[:] = birnn.W_recur_f
    rnn.b[:] = birnn.b_f
    rnn.dW[:] = 0

    # random input, plus the same input with its time steps reversed
    forward_np = np.random.random((input_size, seq_len * batch_size))
    reversed_steps = list(reversed(get_steps(forward_np.copy(), in_shape)))
    reversed_np = con(reversed_steps, axis=1)

    inp_lr = birnn.be.array(forward_np)
    inp_rl = birnn.be.array(reversed_np)
    inp_rnn = rnn.be.array(forward_np)

    # run fprop; the BiRNN hidden buffer is cleared between its two runs
    out_lr = birnn.fprop(inp_lr).get().copy()
    birnn.h_buffer[:] = 0
    out_rl = birnn.fprop(inp_rl).get()
    out_rnn = rnn.fprop(inp_rnn).get().copy()

    # split the stacked BiRNN output into forward / backward halves per step
    nout = hidden_size
    lr_fwd_steps = get_steps(out_lr[:nout], out_shape)
    lr_bwd_steps = get_steps(out_lr[nout:], out_shape)
    rl_fwd_steps = get_steps(out_rl[:nout], out_shape)
    rl_bwd_steps = get_steps(out_rl[nout:], out_shape)
    rnn_steps = get_steps(out_rnn, out_shape)

    # asserts for fprop
    paired_steps = zip(rnn_steps, lr_fwd_steps, lr_bwd_steps,
                       reversed(rl_fwd_steps), reversed(rl_bwd_steps))
    for x_rnn, x_f, x_b, y_f, y_b in paired_steps:
        assert allclose_with_out(x_f, y_b, **tol)
        assert allclose_with_out(x_b, y_f, **tol)
        assert allclose_with_out(x_rnn, x_f, **tol)
        assert allclose_with_out(x_rnn, y_b, **tol)
def test_roipooling_bprop_random(backend_default, fargs):
    """
    Compare the backend ROI-pooling bprop against the numpy reference.

    Every ROI covers the full frame. Only channel 0 of the last ROI carries a
    non-zero input error, so the test also checks that the first image in the
    batch receives no gradient, that the last image does, and that the total
    error is conserved.
    """
    rois_per_image, img_fm_c, img_fm_h, img_fm_w, roi_size, bsz = fargs
    rois_per_batch = rois_per_image * bsz

    # feature map filled with consecutive integers
    feature_map_size = img_fm_c * img_fm_h * img_fm_w * bsz
    feature_maps = np.array(list(range(feature_map_size))).reshape(
        (img_fm_c, img_fm_h, img_fm_w, bsz))

    # errors are zero everywhere except channel 0 of the last ROI
    input_errors = np.zeros((img_fm_c, roi_size, roi_size, rois_per_batch))
    range_num = roi_size * roi_size
    input_errors[0, :, :, rois_per_batch - 1] = np.array(list(
        range(range_num))).reshape(input_errors[0, :, :,
                                                rois_per_batch - 1].shape)

    # first column of each ROI row is the index of the image it belongs to
    rois_idx = np.vstack(
        [i * np.ones((rois_per_image, 1)) for i in range(bsz)])

    # use full frame as ROI
    # NOTE(review): columns are presumably (xmin, ymin, xmax, ymax), so the
    # last column is derived from the feature-map height rather than the
    # width -- confirm against bprop_roipooling_ref.
    rois = np.zeros((rois_per_batch, 4))
    rois[:, 0] = np.ones((rois_per_batch, ))
    rois[:, 1] = np.ones((rois_per_batch, ))
    rois[:, 2] = np.ones((rois_per_batch, )) * img_fm_w / spatial_scale
    rois[:, 3] = np.ones((rois_per_batch, )) * img_fm_h / spatial_scale

    rois = np.hstack((rois_idx, rois))

    # run the numpy roi bprop (function inside this test script)
    outputs_np = bprop_roipooling_ref(feature_maps, rois, input_errors,
                                      img_fm_c, img_fm_h, img_fm_w, bsz,
                                      rois_per_image, roi_size, roi_size)

    # call backend roipooling kernel
    NervanaObject.be.bsz = bsz
    be = NervanaObject.be
    input_dev = be.array(feature_maps)
    rois_dev = be.array(rois)
    output_shape = (img_fm_c, roi_size, roi_size, rois_per_batch)
    outputs_dev = be.zeros(output_shape, dtype=np.float32)
    # make sure the type being int
    argmax_dev = be.zeros(output_shape, dtype=np.int32)
    input_error_dev = be.array(input_errors)
    output_error_dev = be.zeros(feature_maps.shape)

    # fprop first so that argmax_dev is populated for bprop
    be.roipooling_fprop(input_dev, rois_dev, outputs_dev, argmax_dev,
                        rois_per_batch, img_fm_c, img_fm_h, img_fm_w,
                        roi_size, roi_size, spatial_scale)

    start_time = timeit()
    be.roipooling_bprop(input_error_dev, rois_dev, output_error_dev,
                        argmax_dev, rois_per_batch, img_fm_c, img_fm_h,
                        img_fm_w, roi_size, roi_size, spatial_scale)
    neon_logger.display(
        "Nervana backend roipooling bprop (sec): {}".format(timeit() -
                                                            start_time))

    # fetch the result once and reuse it for every check
    outputs_be = output_error_dev.get()
    errors_by_image = outputs_be.reshape(img_fm_c, img_fm_h, img_fm_w, bsz)

    assert errors_by_image[:, :, :, 0].sum() == 0
    assert errors_by_image[:, :, :, -1].sum() != 0
    assert outputs_be.sum() == input_errors.sum()

    assert allclose_with_out(outputs_np, outputs_be, atol=1e-6, rtol=0)
def test_conv_layer(fargs_tests, backend_pair):
    """
    Check the conv layer fprop/bprop/update of both backends in
    ``backend_pair`` against a numpy reference computed in this test.

    ``fargs_tests`` holds five triples: (N, C, K), (D, H, W), (T, R, S),
    the padding per dimension and the strides per dimension.
    """
    dtype = np.float32
    ng, nc = backend_pair

    if ng.compute_capability < (5, 0):
        # The reason is passed positionally: the ``msg`` keyword was
        # deprecated in favour of ``reason`` in newer pytest releases, while
        # the positional form is accepted by all of them.
        pytest.skip("Test requires Maxwell or higher")

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]
    padding_d, padding_h, padding_w = fargs_tests[3]
    strides_d, strides_h, strides_w = fargs_tests[4]

    conv_ng = ng.conv_layer(dtype, N, C, K, D, H, W, T, R, S,
                            padding_d, padding_h, padding_w,
                            strides_d, strides_h, strides_w)

    conv_nc = nc.conv_layer(dtype, N, C, K, D, H, W, T, R, S,
                            padding_d, padding_h, padding_w,
                            strides_d, strides_h, strides_w)

    # both backends must agree on the layer geometry
    assert conv_nc.dimI == conv_ng.dimI
    assert conv_nc.dimF == conv_ng.dimF
    assert conv_nc.dimO == conv_ng.dimO
    assert conv_nc.M == conv_ng.M

    dimI = conv_ng.dimI
    dimF = conv_ng.dimF
    dimO = conv_ng.dimO

    # nothing to compare when the configuration yields an empty output
    if any(np.array(dimO) <= 0):
        return

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # =======GPU and CPU==========
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    start_gpu = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    end_gpu = default_timer()

    start_cpu = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    end_cpu = default_timer()

    neon_logger.display("gputime: %s, cputime %s" %
                        (end_gpu - start_gpu, end_cpu - start_cpu))

    # ======numpy===========
    # cpu output arrays
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ

    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    for m in range(M):
        mt = m * str_d - pad_d

        for p in range(P):
            pr = p * str_h - pad_h

            for q in range(Q):
                qs = q * str_w - pad_w

                idx = pixel_indices(conv_nc, mt, pr, qs)

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, cpuI[idx, :])

                cpuB[idx, :] += np.dot(cpuF, cpuE[:, m, p, q, :])

                cpuU += np.dot(cpuI[idx, :], cpuE[:, m, p, q, :].T)

    for op, ngA, ncA, cpuA in (
            ("fprop", ngO, ncO, cpuO),
            ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI)),
            ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF))):

        neon_logger.display(op)
        ncAnp = ncA.get().astype(np.float32)
        ngAnp = ngA.get().astype(np.float32)
        ncdif = cpuA - ncAnp
        ngdif = cpuA - ngAnp
        maxval = abs(cpuA).max()
        ncmaxdif = abs(ncdif).max()
        ngmaxdif = abs(ngdif).max()
        # largest absolute error relative to the largest reference value
        ncRatio = ncmaxdif / float(maxval)
        ngRatio = ngmaxdif / float(maxval)

        assert ncRatio < 1e-5
        assert ngRatio < 1e-5

        assert allclose_with_out(ncA.get(), cpuA, rtol=0, atol=1e-4)
        assert allclose_with_out(ngA.get(), cpuA, rtol=0, atol=1e-3)
def mergesum_test_config(be, modfunc, use_stride=1):
    """
    Check a module built by ``modfunc`` against two explicit reference paths.

    The module is run behind a Conv trunk inside one Sequential container.
    The two branches returned by ``module_factory_copy`` are each run behind
    their own copy of the trunk; their fprop outputs are summed and clipped
    at zero, and their bprop errors are summed, then both results are
    compared with what the single container produced.
    """
    trunk = Conv(**conv_params(3, 16))
    neon_layer = modfunc(16, use_stride)
    inshape = (16, 32, 32)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))

    neon_seq = Sequential([trunk] + neon_layer)
    neon_seq.configure(inshape)
    inp = be.array(inpa)

    neon_seq.allocate()
    neon_seq.allocate_deltas()
    neon_out = neon_seq.fprop(inp).get()

    # Now make the reference pathways:
    p1, p2 = module_factory_copy(neon_layer, modfunc, 16, use_stride)
    trunk_copy1 = Conv(**conv_params(3, 16))
    trunk_copy2 = Conv(**conv_params(3, 16))

    # give both trunk copies the parameters of the original trunk
    for trunk_copy in (trunk_copy1, trunk_copy2):
        for layer_copy, layer_ref in zip(trunk_copy, trunk):
            if layer_copy.has_params:
                layer_copy.set_params(layer_ref.get_params_serialize())

    path1 = Sequential([trunk_copy1] + p1)
    path2 = Sequential([trunk_copy2] + p2)
    for path in (path1, path2):
        path.configure(inshape)
        path.allocate()
        path.allocate_deltas()

    o1 = path1.fprop(inp)
    o2 = path2.fprop(inp)

    # convert mkl buffer to cpu for following cpu execution
    be.convert_data(o1, False)
    be.convert_data(o2, False)

    neon_out_ref = be.empty_like(o1)
    neon_out_ref[:] = be.maximum(o1 + o2, 0)

    # need to have bsum false for this test to be valid
    assert allclose_with_out(neon_out_ref.get(), neon_out, rtol=0)

    # bprop through the last two layers of the combined container
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    ebr = neon_seq.layers[-1].bprop(err)
    ebr = neon_seq.layers[-2].bprop(ebr)
    trunk_neon = ebr.get()

    # reference error: mask by where the reference output is positive
    err = be.array(erra)
    err[:] = be.greater(neon_out_ref, 0) * err

    # only the layers after the trunk take part in the reference bprop
    pstart = len(trunk)
    eb1 = err
    for layer in reversed(path1.layers[pstart:]):
        eb1 = layer.bprop(eb1)

    eb2 = err
    for layer in reversed(path2.layers[pstart:]):
        eb2 = layer.bprop(eb2)

    be.convert_data(eb1, False)
    be.convert_data(eb2, False)

    err_ref = be.empty_like(eb1)
    err_ref[:] = eb1 + eb2

    assert allclose_with_out(err_ref.get(), trunk_neon, rtol=0)
def test_conv_ones(backend_default, ones_convargs, deltas_buffer):
    """
    Run a Convolution layer with all-ones weights, inputs and errors and
    require exact agreement with ConvLayerRef for fprop, weight updates and
    deltas.
    """
    dtypeu = np.float32
    indim, nifm, fshape, nofm, batch_size, stride, pad = ones_convargs
    if (isinstance(NervanaObject.be, NervanaGPU) and
            NervanaObject.be.compute_capability < (5, 0)):
        if nifm % 4 != 0:
            # The reason is passed positionally: the ``msg`` keyword was
            # deprecated in favour of ``reason`` in newer pytest releases,
            # while the positional form is accepted by all of them.
            pytest.skip(
                "C dim must be a multiple of 4 for Kepler bprop kernel")

    NervanaObject.be.bsz = batch_size

    # weights set to one
    init_unif = Uniform(low=1.0, high=1.0)

    inshape = (nifm, indim, indim)
    insize = np.prod(inshape)

    neon_layer = Convolution(fshape=(fshape, fshape, nofm),
                             strides=stride, padding=pad, init=init_unif)
    inp = neon_layer.be.array(np.ones((insize, batch_size)))
    inp.lshape = inshape
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()

    neon_layer.allocate_deltas(deltas_buffer)
    deltas_buffer.allocate_buffers()
    neon_layer.set_deltas(deltas_buffer)

    # run fprop
    out = neon_layer.fprop(inp).get()

    # generate the reference layer
    ref_layer = ConvLayerRef(1,
                             batch_size,
                             identity,
                             inshape[0],
                             inshape[1:3],
                             (fshape, fshape),
                             nofm,
                             stride,
                             dtypeu,
                             padding=pad)

    # init weights to ones
    ref_layer.weights = np.ones(neon_layer.W.shape).T.astype(dtypeu)
    ref_layer.fprop(inp.get().T)
    out_exp = ref_layer.y.copy()
    assert allclose_with_out(out_exp.T, out, atol=0.0, rtol=0.0)

    # generate err array
    err = np.ones(out.shape).astype(np.float32)

    # run bprop
    neon_layer.bprop(neon_layer.be.array(err))
    dw = neon_layer.dW.get()

    # run bprop
    ref_layer.bprop(err.T.astype(dtypeu), 1.0)

    # expected output for updates is uniform matrix with
    # all elements == ofmsize*batch_size
    updates_exp = ref_layer.updates.T

    # check dw from neon layer
    assert allclose_with_out(dw, updates_exp, atol=0.0, rtol=0.0)

    # the deltas are more complicated since the matrices are not
    # uniform, going to use the reference code directly here
    # no tolerance here should be exact
    dd = np.abs(ref_layer.berror_nopad.T - neon_layer.deltas.get())
    max_abs_diff = np.max(dd)

    # configurations listed here are reported as expected failures when they
    # mismatch; any other mismatch fails the test
    known_mismatches = ((32, 32, 3, 32, 64, 2, 0),
                        (32, 32, 3, 16, 64, 2, 0),
                        (32, 32, 3, 64, 64, 2, 0))
    if max_abs_diff != 0.0 and ones_convargs in known_mismatches:
        pytest.xfail(reason="xfail before mkl update. issue: #1020")
    assert max_abs_diff == 0.0
def gradient_check_ref(seq_len, input_size, hidden_size, batch_size,
                       epsilon=1.0e-5, dtypeu=np.float64, threshold=1e-4):
    """
    Numerically check the input gradients of the reference LSTM itself.

    Each input element is perturbed by +/- ``epsilon``, the reference fprop
    is rerun and the central-difference estimate of the gradient is compared
    (relative tolerance ``threshold``) with the input deltas computed by the
    reference bprop.
    """
    NervanaObject.be.bsz = batch_size
    NervanaObject.be.batch_size = batch_size

    input_shape = (seq_len, input_size, batch_size)

    # the sparse input is generated and then replaced by a dense random one
    inp_bl, nz_inds = sparse_rand(input_shape,
                                  frac=1.0 / float(input_shape[1]))
    inp_bl = np.random.randn(*input_shape)

    # convert input matrix from neon to ref code format
    inp_bl = inp_bl.swapaxes(1, 2).astype(dtypeu)

    # reference LSTM; the initialised weights only supply the shape for the
    # random weights that are actually used
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size).astype(dtypeu)
    WLSTM = np.random.randn(*WLSTM.shape)

    Hout, cprev, hprev, cache = lstm_ref.forward(inp_bl, WLSTM)

    # The loss is taken to be np.sum(rand_scale * Hout), so the deltas fed
    # into bprop are rand_scale itself.
    rand_scale = np.random.random(Hout.shape) * 2.0 - 1.0
    rand_scale = dtypeu(rand_scale)

    dX_bl, dWLSTM_bl, dc0, dh0 = lstm_ref.backward(rand_scale, cache)

    grads_est = np.zeros(dX_bl.shape)
    inp_pert = inp_bl.copy()
    for flat_idx in range(inp_bl.size):
        original_value = inp_pert.flat[flat_idx]

        # fprop with the element nudged upwards ...
        inp_pert.flat[flat_idx] = original_value + epsilon
        Hout_pos = lstm_ref.forward(inp_pert, WLSTM)[0]

        # ... and downwards
        inp_pert.flat[flat_idx] = original_value - epsilon
        Hout_neg = lstm_ref.forward(inp_pert, WLSTM)[0]

        # central difference of the loss
        loss_pos = np.sum(rand_scale * Hout_pos)
        loss_neg = np.sum(rand_scale * Hout_neg)
        grads_est.flat[flat_idx] = 0.5 / float(epsilon) * (loss_pos - loss_neg)

        # restore the element before moving on
        inp_pert.flat[flat_idx] = original_value

    # gradient estimates must be within the relative threshold of the
    # deltas calculated by bprop
    assert allclose_with_out(grads_est, dX_bl, rtol=threshold, atol=0.0)
def compare_helper_cpu(op, inA, inB, nc, dtype):
    """
    Run ``op`` through ``math_helper`` with numpy and with the ``nc`` backend
    and assert that the two results agree to an absolute tolerance of 1e-5.
    """
    numpy_result = math_helper(np, op, inA, inB, dtype=np.float32).astype(dtype)
    nervanaCPU_result = math_helper(nc, op, inA, inB, dtype=dtype).get()
    # the comparison result has to be asserted, otherwise a mismatch is
    # computed but never makes the calling test fail
    assert allclose_with_out(numpy_result, nervanaCPU_result, rtol=0, atol=1e-5)
def test_gpu_pool_layer(poolargs, backend_pair_bench):
    """
    Compare the pooling fprop/bprop of the first backend in
    ``backend_pair_bench`` with the numpy pooling reference for the pooling
    op given in ``poolargs[0]``.
    """
    op = poolargs[0]
    dtype = np.float32
    ng, nc = backend_pair_bench

    # fixed layer geometry
    N, C = 32, 32
    D, H, W = 1, 32, 32
    J, T, R, S = 2, 1, 3, 3
    padding_j, padding_d, padding_h, padding_w = 0, 0, 0, 0
    strides_j, strides_d, strides_h, strides_w = 2, 1, 2, 2

    layer_args = (dtype, op, N, C, D, H, W, J, T, R, S,
                  padding_j, padding_d, padding_h, padding_w,
                  strides_j, strides_d, strides_h, strides_w)
    pool_ng = ng.pool_layer(*layer_args)
    pool_nc = nc.pool_layer(*layer_args)

    # both backends must agree on the layer dimensions
    assert pool_ng.dimI == pool_nc.dimI
    assert pool_ng.dimO == pool_nc.dimO

    dimI = pool_ng.dimI
    dimO = pool_ng.dimO

    # random inputs (rounded through float16) and random errors
    cpuI = np.random.uniform(0.0, 1.0, sliceable(dimI, 1))
    cpuI = cpuI.astype(np.float16).astype(dtype)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(dtype)

    # the extra last row is padding for the numpy reference: the smallest
    # representable value for max pooling, zero otherwise
    cpuI[-1, :] = np.finfo(dtype).min if op == "max" else 0

    # =========GPU and numpy ==========
    beI = cpuI[:-1, :].reshape(dimI)
    beE = cpuE

    ngO, ngB = run_backend_pool(ng, pool_ng, beI, beE, dtype)
    cpuO, cpuB = run_numpy_pool(op, cpuI, cpuE, dtype, pool_ng)

    comparisons = (
        ("fprop", ngO, cpuO),
        ("bprop", ngB, cpuB[:-1, :].reshape(dimI)),
    )
    for opA, ngA, cpuA in comparisons:
        neon_logger.display(opA)
        assert allclose_with_out(ngA.get(), cpuA, rtol=0, atol=1e-4)