예제 #1
0
def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon rnn instance
    rnn = Recurrent(hidden_size, Gaussian(), Tanh())
    inpa = rnn.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()
    rnn.set_deltas([rnn.be.iobuf(rnn.in_shape)])
    out_bl = rnn.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = rnn.bprop(rnn.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_pos = rnn.fprop(rnn.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_neg = rnn.fprop(rnn.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale*out_pos)
        loss_neg = np.sum(rand_scale*out_neg)
        # compute the gradient estimate
        grad = 0.5*(loss_pos-loss_neg)/epsilon

        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del rnn
    return (grads_est, deltas_neon)
예제 #2
0
def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon rnn instance
    rnn = Recurrent(hidden_size, Gaussian(), activation=Tanh())
    inpa = rnn.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()
    rnn.set_deltas([rnn.be.iobuf(rnn.in_shape)])
    out_bl = rnn.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = rnn.bprop(rnn.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_pos = rnn.fprop(rnn.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_neg = rnn.fprop(rnn.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)
        # compute the gradient estimate
        grad = 0.5 * (loss_pos - loss_neg) / epsilon

        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del rnn
    return (grads_est, deltas_neon)
예제 #3
0
def check_rnn(seq_len,
              input_size,
              hidden_size,
              batch_size,
              init_func,
              inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size,
                                   input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size,
                                         hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list, dh_ref_list,
     d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    print '====Verifying hidden states===='
    print allclose_with_out(rnn.h_buffer.get(),
                            h_ref_list,
                            rtol=0.0,
                            atol=1.0e-5)
    print 'fprop is verified'

    print '====Verifying update on W and b ===='
    print 'dWxh'
    assert allclose_with_out(dWxh_neon, dWxh_ref, rtol=0.0, atol=1.0e-5)
    print 'dWhh'
    assert allclose_with_out(dWhh_neon, dWhh_ref, rtol=0.0, atol=1.0e-5)

    print '====Verifying update on bias===='
    print 'db'
    assert allclose_with_out(db_neon, db_ref, rtol=0.0, atol=1.0e-5)

    print 'bprop is verified'

    return
예제 #4
0
def check_rnn(seq_len, input_size, hidden_size,
              batch_size, init_func, inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape)*inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     dh_ref_list, d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    print '====Verifying hidden states===='
    print allclose_with_out(rnn.h_buffer.get(),
                            h_ref_list,
                            rtol=0.0,
                            atol=1.0e-5)
    print 'fprop is verified'

    print '====Verifying update on W and b ===='
    print 'dWxh'
    assert allclose_with_out(dWxh_neon,
                             dWxh_ref,
                             rtol=0.0,
                             atol=1.0e-5)
    print 'dWhh'
    assert allclose_with_out(dWhh_neon,
                             dWhh_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print '====Verifying update on bias===='
    print 'db'
    assert allclose_with_out(db_neon,
                             db_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print 'bprop is verified'

    return