Пример #1
0
    def allocate_deltas(self):
        if getattr(self, 'global_deltas', None) is None:
            self.global_deltas = DeltasTree()
            self.layers.allocate_deltas(self.global_deltas)

            # allocate the buffers now that all the
            # nesting and max sizes have been determined
            self.global_deltas.allocate_buffers(self.be)

        # set the deltas
        self.layers.set_deltas(self.global_deltas)
Пример #2
0
def deltas_buffer():
    # empty DeltasTree object for tests that need
    # to allocate shared deltas buffers
    return DeltasTree()
Пример #3
0
def gradient_calc(seq_len,
                  input_size,
                  hidden_size,
                  batch_size,
                  add_init_state=False,
                  epsilon=None,
                  rand_scale=None,
                  inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon gru instance
    gru = GRU(hidden_size,
              init=Gaussian(),
              activation=Tanh(),
              gate_activation=Logistic())
    inpa = gru.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()

    test_buffer = DeltasTree()
    gru.allocate_deltas(test_buffer)
    test_buffer.allocate_buffers()
    gru.set_deltas(test_buffer)

    if add_init_state is True:
        slice_shape = (hidden_size, batch_size)
        ini_s = np.random.randn(*slice_shape)
        ini_s_dev = gru.be.array(ini_s.copy())
        out_bl = gru.fprop(inpa, ini_s_dev).get()
    else:
        out_bl = gru.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = gru.bprop(gru.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_gru(gru)
        gru.allocate()
        if add_init_state is True:
            ini_s_dev = gru.be.array(ini_s.copy())
            out_pos = gru.fprop(gru.be.array(inp_pert), ini_s_dev).get()
        else:
            out_pos = gru.fprop(gru.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_gru(gru)
        gru.allocate()
        if add_init_state is True:
            ini_s_dev = gru.be.array(ini_s.copy())
            out_neg = gru.fprop(gru.be.array(inp_pert), ini_s_dev).get()
        else:
            out_neg = gru.fprop(gru.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)
        # compute the gradient estimate
        grad = 0.5 / float(epsilon) * (loss_pos - loss_neg)

        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del gru
    return (grads_est, deltas_neon)
Пример #4
0
def check_gru(seq_len,
              input_size,
              hidden_size,
              batch_size,
              init_func,
              inp_moms=[0.0, 1.0],
              add_init_state=False):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    slice_shape = (hidden_size, batch_size)

    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon GRU
    gru = GRU(hidden_size,
              init_func,
              activation=Tanh(),
              gate_activation=Logistic())

    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inp_dev = gru.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # run neon fprop
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()

    test_buffer = DeltasTree()
    gru.allocate_deltas(test_buffer)
    test_buffer.allocate_buffers()
    gru.set_deltas(test_buffer)

    if add_init_state:
        init_state = np.random.rand(*slice_shape) * inp_moms[1] + inp_moms[0]
        init_state_dev = gru.be.array(init_state)
        gru.fprop(inp_dev, init_state=init_state_dev)
    else:
        gru.fprop(inp_dev)

    # reference numpy GRU
    gru_ref = RefGRU(input_size, hidden_size)
    WGRU = gru_ref.weights

    # make ref weights and biases the same with neon model
    r_range = list(range(hidden_size))
    z_range = list(range(hidden_size, hidden_size * 2))
    c_range = list(range(hidden_size * 2, hidden_size * 3))

    WGRU[gru_ref.weights_ind_br][:] = gru.b.get()[r_range]
    WGRU[gru_ref.weights_ind_bz][:] = gru.b.get()[z_range]
    WGRU[gru_ref.weights_ind_bc][:] = gru.b.get()[c_range]

    WGRU[gru_ref.weights_ind_Wxr][:] = gru.W_input.get()[r_range]
    WGRU[gru_ref.weights_ind_Wxz][:] = gru.W_input.get()[z_range]
    WGRU[gru_ref.weights_ind_Wxc][:] = gru.W_input.get()[c_range]

    WGRU[gru_ref.weights_ind_Rhr][:] = gru.W_recur.get()[r_range]
    WGRU[gru_ref.weights_ind_Rhz][:] = gru.W_recur.get()[z_range]
    WGRU[gru_ref.weights_ind_Rhc][:] = gru.W_recur.get()[c_range]

    # transpose input X and do fprop
    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size,
                                   input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size,
                                         hidden_size).swapaxes(1, 2)

    if add_init_state:
        init_state_ref = init_state.copy()
        (dWGRU_ref, h_ref_list, dh_ref_list, dr_ref_list, dz_ref_list,
         dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref, init_state_ref)
    else:
        (dWGRU_ref, h_ref_list, dh_ref_list, dr_ref_list, dz_ref_list,
         dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(gru.outputs.get(),
                             h_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('fprop is verified')

    # now test the bprop
    neon_logger.display('Making sure neon GRU matches numpy GRU in bprop')
    gru.bprop(gru.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = gru.dW_input.get()
    dWrecur_neon = gru.dW_recur.get()
    db_neon = gru.db.get()
    dWxr_neon = dWinput_neon[r_range]
    dWxz_neon = dWinput_neon[z_range]
    dWxc_neon = dWinput_neon[c_range]
    dWrr_neon = dWrecur_neon[r_range]
    dWrz_neon = dWrecur_neon[z_range]
    dWrc_neon = dWrecur_neon[c_range]
    dbr_neon = db_neon[r_range]
    dbz_neon = db_neon[z_range]
    dbc_neon = db_neon[c_range]

    drzc_neon = gru.rzhcan_delta_buffer.get()
    dr_neon = drzc_neon[r_range]
    dz_neon = drzc_neon[z_range]
    dc_neon = drzc_neon[c_range]

    dWxr_ref = dWGRU_ref[gru_ref.dW_ind_Wxr]
    dWxz_ref = dWGRU_ref[gru_ref.dW_ind_Wxz]
    dWxc_ref = dWGRU_ref[gru_ref.dW_ind_Wxc]
    dWrr_ref = dWGRU_ref[gru_ref.dW_ind_Rhr]
    dWrz_ref = dWGRU_ref[gru_ref.dW_ind_Rhz]
    dWrc_ref = dWGRU_ref[gru_ref.dW_ind_Rhc]
    dbr_ref = dWGRU_ref[gru_ref.dW_ind_br]
    dbz_ref = dWGRU_ref[gru_ref.dW_ind_bz]
    dbc_ref = dWGRU_ref[gru_ref.dW_ind_bc]

    # neon_logger.display '====Verifying hidden deltas ===='
    neon_logger.display('====Verifying r deltas ====')
    assert allclose_with_out(dr_neon, dr_ref_list, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying z deltas ====')
    assert allclose_with_out(dz_neon, dz_ref_list, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying hcan deltas ====')
    assert allclose_with_out(dc_neon, dc_ref_list, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on W_input====')
    neon_logger.display('dWxr')
    assert allclose_with_out(dWxr_neon, dWxr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWxz')
    assert allclose_with_out(dWxz_neon, dWxz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWxc')
    assert allclose_with_out(dWxc_neon, dWxc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on W_recur====')

    neon_logger.display('dWrr')
    assert allclose_with_out(dWrr_neon, dWrr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWrz')
    assert allclose_with_out(dWrz_neon, dWrz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWrc')
    assert allclose_with_out(dWrc_neon, dWrc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on bias====')
    neon_logger.display('dbr')
    assert allclose_with_out(dbr_neon, dbr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dbz')
    assert allclose_with_out(dbz_neon, dbz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dbc')
    assert allclose_with_out(dbc_neon, dbc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('bprop is verified')

    return
def test_branch_model(backend_gpu):
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 64
    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)
    neon_layer.allocate()
    neon_logger.display(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True

    neon_layer.allocate_deltas()

    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    main1_trunk = neon_layer.layers[:6]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get(), 'weight_bias': lo.weight_bias.get()}})
        ll.allocate()

        temp_buff = DeltasTree()
        ll.allocate_deltas(temp_buff)
        temp_buff.allocate_buffers()
        ll.set_deltas(temp_buff)

    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            temp_buff = DeltasTree()
            ll.allocate_deltas(temp_buff)
            temp_buff.allocate_buffers()
            ll.set_deltas(temp_buff)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[6].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)

    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out, neon_out_ref, rtol=0)

    neon_logger.display("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[6:]):
        err = ll.bprop(err)

    neon_deltas = err.get()
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[6].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas

    neon_ref_deltas = ref_deltas.get()

    assert allclose_with_out(neon_deltas, neon_ref_deltas, rtol=0)
def test_branch_model_fork_cpu(backend_cpu64):
    from neon.layers import BranchNode, Tree
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 32
    bnode = BranchNode()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()

    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()

    neon_out_dev = neon_layer.fprop(inp)
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)

    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        temp_deltas = DeltasTree()
        temp_deltas.proc_layer(ll)
        temp_deltas.allocate_buffers()
        ll.set_deltas(temp_deltas)

    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            temp_deltas = DeltasTree()
            temp_deltas.proc_layer(ll)
            temp_deltas.allocate_buffers()
            ll.set_deltas(temp_deltas)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out_ref, neon_out[0], rtol=0)

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    assert allclose_with_out(neon_out_ref2, neon_out[1])

    neon_logger.display("Beginning Back prop")
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    for bb, errb in zip((b1, b2, b3), neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = alpha2 * lbranch2[0].deltas
    ref_deltas[:] = ref_deltas + b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(middle_neon_deltas, neon_ref_deltas, rtol=0)

    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    assert allclose_with_out(bottom_neon_deltas, bottom_neon_ref_deltas, rtol=0)
Пример #7
0
def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon rnn instance
    rnn = Recurrent(hidden_size, Gaussian(), activation=Tanh())
    inpa = rnn.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()

    dtree = DeltasTree()
    rnn.allocate_deltas(dtree)
    dtree.allocate_buffers()
    rnn.set_deltas(dtree)

    out_bl = rnn.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = rnn.bprop(rnn.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_pos = rnn.fprop(rnn.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_neg = rnn.fprop(rnn.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)
        # compute the gradient estimate
        grad = 0.5 * (loss_pos - loss_neg) / epsilon

        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del rnn
    return (grads_est, deltas_neon)
Пример #8
0
def check_rnn(seq_len,
              input_size,
              hidden_size,
              batch_size,
              init_func,
              inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, activation=Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size,
                                   input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size,
                                         hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()

    dtree = DeltasTree()
    rnn.allocate_deltas(dtree)
    dtree.allocate_buffers()
    rnn.set_deltas(dtree)

    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list, dh_ref_list,
     d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(rnn.outputs.get(),
                             h_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('fprop is verified')

    neon_logger.display('====Verifying update on W and b ====')
    neon_logger.display('dWxh')
    assert allclose_with_out(dWxh_neon, dWxh_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWhh')
    assert allclose_with_out(dWhh_neon, dWhh_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on bias====')
    neon_logger.display('db')
    assert allclose_with_out(db_neon, db_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('bprop is verified')

    return
Пример #9
0
def test_branch_model(backend_gpu):
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 64
    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)
    neon_layer.allocate()
    neon_logger.display(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True

    neon_layer.allocate_deltas()

    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()

        temp_buff = DeltasTree()
        ll.allocate_deltas(temp_buff)
        temp_buff.allocate_buffers()
        ll.set_deltas(temp_buff)

    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            temp_buff = DeltasTree()
            ll.allocate_deltas(temp_buff)
            temp_buff.allocate_buffers()
            ll.set_deltas(temp_buff)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)

    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out, neon_out_ref, rtol=0)

    neon_logger.display("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)

    neon_deltas = err.get()
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas

    neon_ref_deltas = ref_deltas.get()

    assert allclose_with_out(neon_deltas, neon_ref_deltas, rtol=0)
Пример #10
0
def test_branch_model_fork_cpu(backend_cpu64):
    from neon.layers import BranchNode, Tree
    np.random.seed(0)
    be = NervanaObject.be
    be.bsz = 32
    bnode = BranchNode()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (4, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()

    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()

    neon_out_dev = neon_layer.fprop(inp)
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].deltas = be.iobuf(inshape)

    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        temp_deltas = DeltasTree()
        temp_deltas.proc_layer(ll)
        temp_deltas.allocate_buffers()
        ll.set_deltas(temp_deltas)

    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            temp_deltas = DeltasTree()
            temp_deltas.proc_layer(ll)
            temp_deltas.allocate_buffers()
            ll.set_deltas(temp_deltas)

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out_ref, neon_out[0], rtol=0)

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    assert allclose_with_out(neon_out_ref2, neon_out[1])

    neon_logger.display("Beginning Back prop")
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    for bb, errb in zip((b1, b2, b3), neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = alpha2 * lbranch2[0].deltas
    ref_deltas[:] = ref_deltas + b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(middle_neon_deltas, neon_ref_deltas, rtol=0)

    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    assert allclose_with_out(bottom_neon_deltas, bottom_neon_ref_deltas, rtol=0)
Пример #11
0
def gradient_calc(seq_len, input_size, hidden_size, batch_size, add_init_state=False,
                  epsilon=None, rand_scale=None, inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon gru instance
    gru = GRU(hidden_size, init=Gaussian(), activation=Tanh(), gate_activation=Logistic())
    inpa = gru.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()

    test_buffer = DeltasTree()
    gru.allocate_deltas(test_buffer)
    test_buffer.allocate_buffers()
    gru.set_deltas(test_buffer)

    if add_init_state is True:
        slice_shape = (hidden_size, batch_size)
        ini_s = np.random.randn(*slice_shape)
        ini_s_dev = gru.be.array(ini_s.copy())
        out_bl = gru.fprop(inpa, ini_s_dev).get()
    else:
        out_bl = gru.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = gru.bprop(gru.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_gru(gru)
        gru.allocate()
        if add_init_state is True:
            ini_s_dev = gru.be.array(ini_s.copy())
            out_pos = gru.fprop(gru.be.array(inp_pert), ini_s_dev).get()
        else:
            out_pos = gru.fprop(gru.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_gru(gru)
        gru.allocate()
        if add_init_state is True:
            ini_s_dev = gru.be.array(ini_s.copy())
            out_neg = gru.fprop(gru.be.array(inp_pert), ini_s_dev).get()
        else:
            out_neg = gru.fprop(gru.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)
        # compute the gradient estimate
        grad = 0.5 / float(epsilon) * (loss_pos - loss_neg)

        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del gru
    return (grads_est, deltas_neon)
Пример #12
0
def check_gru(seq_len, input_size, hidden_size,
              batch_size, init_func, inp_moms=[0.0, 1.0], add_init_state=False):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    slice_shape = (hidden_size, batch_size)

    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon GRU
    gru = GRU(hidden_size,
              init_func,
              activation=Tanh(),
              gate_activation=Logistic())

    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inp_dev = gru.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # run neon fprop
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()

    test_buffer = DeltasTree()
    gru.allocate_deltas(test_buffer)
    test_buffer.allocate_buffers()
    gru.set_deltas(test_buffer)

    if add_init_state:
        init_state = np.random.rand(*slice_shape)*inp_moms[1] + inp_moms[0]
        init_state_dev = gru.be.array(init_state)
        gru.fprop(inp_dev, init_state=init_state_dev)
    else:
        gru.fprop(inp_dev)

    # reference numpy GRU
    gru_ref = RefGRU(input_size, hidden_size)
    WGRU = gru_ref.weights

    # make ref weights and biases the same with neon model
    r_range = list(range(hidden_size))
    z_range = list(range(hidden_size, hidden_size * 2))
    c_range = list(range(hidden_size * 2, hidden_size * 3))

    WGRU[gru_ref.weights_ind_br][:] = gru.b.get()[r_range]
    WGRU[gru_ref.weights_ind_bz][:] = gru.b.get()[z_range]
    WGRU[gru_ref.weights_ind_bc][:] = gru.b.get()[c_range]

    WGRU[gru_ref.weights_ind_Wxr][:] = gru.W_input.get()[r_range]
    WGRU[gru_ref.weights_ind_Wxz][:] = gru.W_input.get()[z_range]
    WGRU[gru_ref.weights_ind_Wxc][:] = gru.W_input.get()[c_range]

    WGRU[gru_ref.weights_ind_Rhr][:] = gru.W_recur.get()[r_range]
    WGRU[gru_ref.weights_ind_Rhz][:] = gru.W_recur.get()[z_range]
    WGRU[gru_ref.weights_ind_Rhc][:] = gru.W_recur.get()[c_range]

    # transpose input X and do fprop
    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    if add_init_state:
        init_state_ref = init_state.copy()
        (dWGRU_ref, h_ref_list, dh_ref_list,
            dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref,
                                                                     deltas_ref,
                                                                     init_state_ref)
    else:
        (dWGRU_ref, h_ref_list, dh_ref_list,
            dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref,
                                                                     deltas_ref)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(gru.outputs.get(),
                             h_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('fprop is verified')

    # now test the bprop
    neon_logger.display('Making sure neon GRU matches numpy GRU in bprop')
    gru.bprop(gru.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = gru.dW_input.get()
    dWrecur_neon = gru.dW_recur.get()
    db_neon = gru.db.get()
    dWxr_neon = dWinput_neon[r_range]
    dWxz_neon = dWinput_neon[z_range]
    dWxc_neon = dWinput_neon[c_range]
    dWrr_neon = dWrecur_neon[r_range]
    dWrz_neon = dWrecur_neon[z_range]
    dWrc_neon = dWrecur_neon[c_range]
    dbr_neon = db_neon[r_range]
    dbz_neon = db_neon[z_range]
    dbc_neon = db_neon[c_range]

    drzc_neon = gru.rzhcan_delta_buffer.get()
    dr_neon = drzc_neon[r_range]
    dz_neon = drzc_neon[z_range]
    dc_neon = drzc_neon[c_range]

    dWxr_ref = dWGRU_ref[gru_ref.dW_ind_Wxr]
    dWxz_ref = dWGRU_ref[gru_ref.dW_ind_Wxz]
    dWxc_ref = dWGRU_ref[gru_ref.dW_ind_Wxc]
    dWrr_ref = dWGRU_ref[gru_ref.dW_ind_Rhr]
    dWrz_ref = dWGRU_ref[gru_ref.dW_ind_Rhz]
    dWrc_ref = dWGRU_ref[gru_ref.dW_ind_Rhc]
    dbr_ref = dWGRU_ref[gru_ref.dW_ind_br]
    dbz_ref = dWGRU_ref[gru_ref.dW_ind_bz]
    dbc_ref = dWGRU_ref[gru_ref.dW_ind_bc]

    # neon_logger.display '====Verifying hidden deltas ===='
    neon_logger.display('====Verifying r deltas ====')
    assert allclose_with_out(dr_neon,
                             dr_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('====Verifying z deltas ====')
    assert allclose_with_out(dz_neon,
                             dz_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('====Verifying hcan deltas ====')
    assert allclose_with_out(dc_neon,
                             dc_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('====Verifying update on W_input====')
    neon_logger.display('dWxr')
    assert allclose_with_out(dWxr_neon,
                             dWxr_ref,
                             rtol=0.0,
                             atol=1.0e-5)
    neon_logger.display('dWxz')
    assert allclose_with_out(dWxz_neon,
                             dWxz_ref,
                             rtol=0.0,
                             atol=1.0e-5)
    neon_logger.display('dWxc')
    assert allclose_with_out(dWxc_neon,
                             dWxc_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('====Verifying update on W_recur====')

    neon_logger.display('dWrr')
    assert allclose_with_out(dWrr_neon,
                             dWrr_ref,
                             rtol=0.0,
                             atol=1.0e-5)
    neon_logger.display('dWrz')
    assert allclose_with_out(dWrz_neon,
                             dWrz_ref,
                             rtol=0.0,
                             atol=1.0e-5)
    neon_logger.display('dWrc')
    assert allclose_with_out(dWrc_neon,
                             dWrc_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('====Verifying update on bias====')
    neon_logger.display('dbr')
    assert allclose_with_out(dbr_neon,
                             dbr_ref,
                             rtol=0.0,
                             atol=1.0e-5)
    neon_logger.display('dbz')
    assert allclose_with_out(dbz_neon,
                             dbz_ref,
                             rtol=0.0,
                             atol=1.0e-5)
    neon_logger.display('dbc')
    assert allclose_with_out(dbc_neon,
                             dbc_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('bprop is verified')

    return
Пример #13
0
def general_gradient_comp(layer,
                          inp,
                          epsilon=1.0e-5,
                          loss_scale=None,
                          lshape=None,
                          pert_inds=None,
                          pooling=False):
    # given a layer, test the bprop
    # using finite differences

    # run neon fprop
    layer.reset()
    inpa = layer.be.array(inp.copy())
    in_shape = lshape if lshape is not None else inpa.shape[0]
    layer.configure(in_shape)
    if layer.owns_delta:
        layer.prev_layer = True
    layer.allocate()

    dtree = DeltasTree()
    layer.allocate_deltas(dtree)
    dtree.allocate_buffers()
    layer.set_deltas(dtree)

    out = layer.fprop(inpa).get()

    out_shape = out.shape

    # scale out by random matrix...
    if loss_scale is None:
        loss_scale = np.random.random(out_shape) * 2.0 - 1.0

    # the loss function is:
    # loss_bl = np.sum(loss_scale * out)

    # run bprop, input deltas is rand_scale
    bprop_deltas = layer.bprop(layer.be.array(loss_scale.copy())).get()

    max_abs_err = -1.0
    max_rel_err = -1.0

    inp_pert = inp.copy()
    if pert_inds is None:
        pert_inds = list(range(inp.size))
    for pert_ind in pert_inds:
        save_val = inp_pert.flat[pert_ind]
        # add/subtract perturbations to input
        inp_pert.flat[pert_ind] = save_val + epsilon
        # and run fprop on perturbed input
        layer.reset()
        layer.configure(in_shape)
        layer.allocate()
        inpa = layer.be.array(inp_pert.copy())
        out_pos = layer.fprop(inpa).get().copy()

        inp_pert.flat[pert_ind] = save_val - epsilon
        inpa = layer.be.array(inp_pert.copy())
        layer.reset()
        layer.configure(in_shape)
        layer.allocate()
        out_neg = layer.fprop(inpa).get().copy()

        # calculate the loss on outputs
        loss_pos = np.sum(loss_scale * out_pos)
        loss_neg = np.sum(loss_scale * out_neg)
        grad_est = 0.5 * (loss_pos - loss_neg) / epsilon

        # reset input
        inp_pert.flat[pert_ind] = save_val

        bprop_val = bprop_deltas.flat[pert_ind]

        abs_err = abs(grad_est - bprop_val)
        if abs_err > max_abs_err:
            max_abs_err = abs_err
            max_abs_vals = [grad_est, bprop_val]

        if (abs(grad_est) + abs(bprop_val)) == 0.0:
            rel_err = 0.0
        else:
            rel_err = float(abs_err) / (abs(grad_est) + abs(bprop_val))
        if rel_err > max_rel_err:
            max_rel_err = rel_err
            max_rel_vals = [grad_est, bprop_val]

    neon_logger.display('Worst case diff %e, vals grad: %e, bprop: %e' %
                        (max_abs_err, max_abs_vals[0], max_abs_vals[1]))
    neon_logger.display('Worst case diff %e, vals grad: %e, bprop: %e' %
                        (max_rel_err, max_rel_vals[0], max_rel_vals[1]))
    return (max_abs_err, max_rel_err)
Пример #14
0
def check_rnn(seq_len, input_size, hidden_size,
              batch_size, init_func, inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, activation=Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()

    dtree = DeltasTree()
    rnn.allocate_deltas(dtree)
    dtree.allocate_buffers()
    rnn.set_deltas(dtree)

    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     dh_ref_list, d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(rnn.outputs.get(),
                             h_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('fprop is verified')

    neon_logger.display('====Verifying update on W and b ====')
    neon_logger.display('dWxh')
    assert allclose_with_out(dWxh_neon,
                             dWxh_ref,
                             rtol=0.0,
                             atol=1.0e-5)
    neon_logger.display('dWhh')
    assert allclose_with_out(dWhh_neon,
                             dWhh_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('====Verifying update on bias====')
    neon_logger.display('db')
    assert allclose_with_out(db_neon,
                             db_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('bprop is verified')

    return
Пример #15
0
def deltas_buffer_wref():
    # returns 2 empty DeltasTree object for
    # tests that need to allocate shared
    # deltas buffers for 2 models
    # (one test, one reference)
    return (DeltasTree(), DeltasTree())
Пример #16
0
def gradient_calc(seq_len,
                  input_size,
                  hidden_size,
                  batch_size,
                  epsilon=None,
                  rand_scale=None,
                  inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon rnn instance
    rnn = Recurrent(hidden_size, Gaussian(), activation=Tanh())
    inpa = rnn.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()

    dtree = DeltasTree()
    rnn.allocate_deltas(dtree)
    dtree.allocate_buffers()
    rnn.set_deltas(dtree)

    out_bl = rnn.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = rnn.bprop(rnn.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_pos = rnn.fprop(rnn.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_neg = rnn.fprop(rnn.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)
        # compute the gradient estimate
        grad = 0.5 * (loss_pos - loss_neg) / epsilon

        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del rnn
    return (grads_est, deltas_neon)
Пример #17
0
class Model(NervanaObject):
    """
    Class which stores a list of layers describing the model. Can train the layer
    weights on a dataset, evaluate on a test set and serialize the model.
    Additional functionality can be added to the fit method through callback functions.

    Arguments:
        layers: layer container, a list of layers (that will be containerized),
                or a serialized model description
        dataset (NervanaDataIterator): Data set (ignored, will be removed)
        weights_only (bool): set to True if you do not want to recreate layers
                             and states during deserialization from a serialized model
                             description.  Defaults to False.
        name (str): Model name.  Defaults to "model"
        optimizer (Optimizer): Optimizer object which defines the learning rule for updating
                               model parameters (i.e., GradientDescentMomentum, Adadelta)
    """

    def __init__(self, layers, dataset=None, weights_only=False, name="model", optimizer=None):
        super(Model, self).__init__(name)
        self.optimizer = optimizer
        self.params = None  # should be able to remove
        self.states = None  # should be able to remove
        self.epoch_index = 0
        self.finished = False
        self.initialized = False
        self.cost = None
        self.nbatches = 0
        self.ndata = 0

        if dataset is not None:
            logger.warning('dataset is a deprecated argument and will be ignored')

        if type(layers) in (ModelDescription, dict):
            # load up the model from a serialized file (dataset could be None here)
            self.deserialize(layers, load_states=(not weights_only))
        elif isinstance(layers, (str, bytes)):
            self.load_params(layers, load_states=(not weights_only))
        else:
            # Wrap the list of layers in a Sequential container if a raw list of layers
            if type(layers) in (Sequential, Tree, SingleOutputTree, Seq2Seq):
                self.layers = layers
            elif type(layers) == SkipThought:
                self.layers = layers
                if hasattr(layers, 'layer_dict'):
                    self.layer_dict = layers.layer_dict
            else:
                self.layers = Sequential(layers)
        self.layers.propagate_parallelism("Data")

    @property
    def layers_to_optimize(self):
        """
        Helper function to return the layers which will be optimized.
        """
        return self.layers.layers_to_optimize

    def set_shortcut(self):
        # infer whether bprop shortcut can be used on final activation
        # self.cost should be set to run this otherwise do nothing
        lastlayer = self.layers[-1]
        try:
            if self.cost.costfunc.__class__ is CrossEntropyBinary:
                if (lastlayer.__class__ is Activation and
                   lastlayer.transform.__class__ is Logistic):
                    lastlayer.transform.set_shortcut(True)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            # if any attributes are not set or any other exception
            # is thrown leave transform.shortcut as is (do nothing)
            pass

    def initialize(self, dataset, cost=None):
        """
        Propagate shapes through the layers to configure, then allocate space.

        Arguments:
            dataset (NervanaDataIterator): Dataset iterator to perform initialization on
            cost (Cost): Defines the function which the model is minimizing based
                         on the output of the last layer and the input labels.
        """
        if self.initialized:
            return

        # Propagate shapes through the layers to configure
        prev_input = dataset
        prev_input = self.layers.configure(prev_input)

        if cost is not None:
            cost.initialize(prev_input)
            self.cost = cost

        # Now allocate space
        self.layers.allocate()
        self.layers.allocate_deltas()
        self.initialized = True

    def allocate_deltas(self):
        if getattr(self, 'global_deltas', None) is None:
            self.global_deltas = DeltasTree()
            self.layers.allocate_deltas(self.global_deltas)

            # allocate the buffers now that all the
            # nesting and max sizes have been determined
            self.global_deltas.allocate_buffers(self.be)

        # set the deltas
        self.layers.set_deltas(self.global_deltas)

    def __str__(self):
        """
        String representation of model's layers
        """
        config_string = "Network Layers:\n" + self.layers.nested_str()
        return config_string

    def fit(self, dataset, cost, optimizer, num_epochs, callbacks):
        """
        Trains the model parameters on a dataset by minimizing the cost function through
        gradient descent and updates the layer weights according to a learning rule
        defined in optimizer.

        Arguments:
            dataset (NervanaDataIterator): An iterable of minibatches where each
                element is a (x, y) tuple where x is the input data and y are the labels.
                x is of dimension (feature_size, batch_size)
                y is of dimension (label_size, batch_size)
                Length of the iterator is num_batches which is num_data / batch_size.
            cost (Cost): Defines the function which the model is minimizing based
                         on the output of the last layer and the input labels.
            optimizer (Optimizer): Defines the learning rule for updating the model parameters.
            num_epochs: Number of times to iterate over the dataset.
            callbacks (Callbacks): Defines callbacks to run at the end of each mini-batch / epoch.
        """
        self.nbatches = dataset.nbatches
        self.ndata = dataset.ndata
        # self.set_shortcut()  # infer if bprop shortcut can be used
        self.total_cost = np.empty([1, 1], dtype=np.float32)
        self.optimizer = optimizer
        self.initialize(dataset, cost)

        callbacks.on_train_begin(num_epochs)
        while self.epoch_index < num_epochs and not self.finished:
            self.nbatches = dataset.nbatches

            callbacks.on_epoch_begin(self.epoch_index)

            self._epoch_fit(dataset, callbacks)

            callbacks.on_epoch_end(self.epoch_index)

            self.epoch_index += 1

        callbacks.on_train_end()

    def _epoch_fit(self, dataset, callbacks):
        """
        Helper function for fit which performs training on a dataset for one epoch.

        Arguments:
            dataset (NervanaDataIterator): Dataset iterator to perform fit on
        """
        epoch = self.epoch_index
        self.total_cost[:] = 0
        # iterate through minibatches of the dataset
        for mb_idx, (x, t) in enumerate(dataset):
            callbacks.on_minibatch_begin(epoch, mb_idx)
            self.be.begin(Block.minibatch, mb_idx)

            x = self.fprop(x)

            self.total_cost[:] = self.total_cost + self.cost.get_cost(x, t)

            # deltas back propagate through layers
            # for every layer in reverse except the 0th one
            delta = self.cost.get_errors(x, t)

            self.bprop(delta)
            self.optimizer.optimize(self.layers_to_optimize, epoch=epoch)

            self.be.end(Block.minibatch, mb_idx)
            callbacks.on_minibatch_end(epoch, mb_idx)

        # now we divide total cost by the number of batches,
        # so it was never total cost, but sum of averages
        # across all the minibatches we trained on
        self.total_cost[:] = self.total_cost / dataset.nbatches

    def fprop(self, x, inference=False):
        """
        Forward propagates a minibatch x through the model.

        Arguments:
            x (Tensor): Input minibatch data.
            inference (bool): Flag for performing training or inference
                Only affects batch norm and dropout layers.

        Returns:
            Tensor: the output of the final layer in the model
        """
        return self.layers.fprop(x, inference)

    def bprop(self, delta):
        """
        Back propagates the error of a minibatch through the model.

        Arguments:
            delta (Tensor): Derivative of cost with respect to the last layer's output

        Returns:
            Tensor: Deltas to propagate to the next layer
        """
        return self.layers.bprop(delta)

    def eval(self, dataset, metric):
        """
        Evaluates a model on a dataset according to an input metric.

        Arguments:
            datasets (NervanaDataIterator): dataset to evaluate on.
            metric (Cost): what function to evaluate dataset on.

        Returns:
            Host numpy array: the error of the final layer for the evaluation dataset
        """
        self.initialize(dataset)
        running_error = np.zeros((len(metric.metric_names)), dtype=np.float32)
        nprocessed = 0
        dataset.reset()
        if hasattr(dataset, 'seq_length'):
            ndata = dataset.ndata*dataset.seq_length
        else:
            ndata = dataset.ndata
        for x, t in dataset:
            x = self.fprop(x, inference=True)

            # This logic is for handling partial batch sizes at the end of the dataset
            nsteps = x.shape[1] // self.be.bsz if not isinstance(x, list) else \
                x[0].shape[1] // self.be.bsz

            bsz = min(ndata - nprocessed, self.be.bsz)
            running_error += metric(x, t, calcrange=slice(0, nsteps * bsz)) * nsteps * bsz
            nprocessed += bsz * nsteps
        running_error /= nprocessed
        return running_error

    def get_outputs(self, dataset):
        """
        Get the activation outputs of the final model layer for the dataset

        Arguments:
            dataset (NervanaDataIterator): Dataset iterator to perform fit on

        Returns:
            Host numpy array: the output of the final layer for the entire Dataset
        """
        self.initialize(dataset)
        dataset.reset()  # Move "pointer" back to beginning of dataset
        n = dataset.nbatches
        x = self.layers.layers[-1].outputs
        assert not isinstance(x, list), "Can not get_outputs with Branch terminal"
        Ypred = None
        for idx, input_data in enumerate(dataset):
            x = self.fprop(input_data[0], inference=True)
            if Ypred is None:
                (dim0, dim1) = x.shape
                Ypred = np.empty((n * dim1, dim0), dtype=x.dtype)
                nsteps = dim1 // self.be.bsz
            cur_batch = slice(idx * dim1, (idx + 1) * dim1)
            Ypred[cur_batch] = x.get().T

        # Handle the recurrent case.
        if nsteps != 1:
            b, s = (self.be.bsz, nsteps)
            Ypred = Ypred.reshape((n, s, b, -1)).transpose(0, 2, 1, 3).copy().reshape(n * b, s, -1)

        return Ypred[:dataset.ndata]

    def get_outputs_beam(self, dataset, num_beams=0, steps=None):
        """
        Get the activation outputs of the final model layer for the dataset

        Arguments:
            dataset (NervanaDataIterator) Dataset iterator to perform fit on
            num_beams (int, optional) Nonzero to use beamsearch for sequence to sequence models
            steps (Int): Length of desired output in number of time steps

        Returns:
            Host numpy array: the output of the final layer for the entire Dataset
        """
        self.initialize(dataset)
        dataset.reset()  # Move "pointer" back to beginning of dataset
        n = dataset.nbatches
        x = self.layers.layers[-1].outputs
        assert not isinstance(x, list), "Can not get_outputs with Branch terminal"
        if num_beams > 0:
            beamsearch = BeamSearch(self.layers)
        Ypred = None
        logger.info('Performing beam search with ' + str(num_beams) + ' beams')
        for idx, (x, t) in enumerate(dataset):
            if num_beams > 0:
                x = beamsearch.beamsearch(x, num_beams, steps=steps)
            else:
                x = self.fprop(x, inference=True)
            if Ypred is None:
                (dim0, dim1) = x.shape
                Ypred = np.empty((n * dim1, dim0), dtype=x.dtype)
                nsteps = dim1 // self.be.bsz
            cur_batch = slice(idx * dim1, (idx + 1) * dim1)
            Ypred[cur_batch] = x.get().T

        # Handle the beam search case.
        if dim0 == getattr(dataset, 'seq_length', None):
            b = self.be.bsz
            s = dataset.seq_length
            Ypred = Ypred.reshape((n*b, s, -1))

        # Handle the recurrent case.
        elif nsteps != 1:
            b, s = (self.be.bsz, nsteps)
            Ypred = Ypred.reshape((n, s, b, -1)).transpose(0, 2, 1, 3).copy().reshape(n * b, s, -1)

        return Ypred[:dataset.ndata]

    def get_description(self, get_weights=False, keep_states=False):
        """
        Gets a description of the model required to reconstruct the model with
        no weights like from a yaml file.

        Arguments:
            get_weights:  (Default value = False)
            keep_states:  (Default value = False)

        Returns:
            dict: Description of each component of the model.
        """
        pdict = dict()
        pdict['neon_version'] = __neon_version__
        compat_mode = self.be.compat_mode if self.be.compat_mode is not None else 'neon'
        pdict['backend'] = {'type': self.be.__class__.__name__,
                            'compat_mode': compat_mode,
                            'rng_seed': self.be.rng_seed,
                            'rng_state': self.be.rng_get_state()}

        if self.cost:
            pdict['cost'] = self.cost.get_description()
        if self.optimizer:
            pdict['optimizer'] = self.optimizer.get_description()

        pdict['model'] = self.layers.get_description(get_weights=get_weights,
                                                     keep_states=keep_states)
        return pdict

    def save_params(self, param_path, keep_states=True):
        """
        Serializes and saves model parameters to the path specified.

        Arguments:
            param_path (str): File to write serialized parameter dict to.
            keep_states (bool): Whether to save optimizer states too.
                                Defaults to True.
        """
        self.serialize(keep_states=keep_states, fn=param_path)

    def load_params(self, param_path, load_states=True):
        """
        Loads the model parameters (per layer weights, epochs run, optimizer
        states) saved in param_path from serialize().

        Arguments:
            param_path (str): File containing serialized python dict with layer
                              weights and states.
            load_states (bool):  if False, then only the weights will be loaded
                                 into a model in which the layers have already been
                                 created, otherwise will (re)create the layers from
                                 the serialized parameters and set the learning
                                 states as well
        """
        self.deserialize(load_obj(param_path), load_states=load_states)
        logger.info('Model weights loaded from %s', param_path)

    def load_weights(self, weight_path):
        """
        .. deprecated:: 1.1.4
           Use :func:`load_params` instead

        Arguments:
            weight_path:
        """
        logger.warning('Calling deprecated load_weights function.  Use '
                       'load_params instead')
        self.load_params(weight_path)

    def deserialize(self, model_dict, data=None, load_states=True):
        """
        Loads per layer (weights, states) and other model parameters from the
        dictionary passed.

        Arguments:
            model_dict (dict): dictionary describing the model including layers,
                               cost, optimizers, backend settings, etc.
                               generated by the serialize function
            data (NervanaDataIterator):   Data set (ignored, will be removed)
            load_states (bool):  if False, then only the weights will be loaded
                                 into a model in which the layers have already been
                                 created, otherwise will (re)create the layers from
                                 the serialized parameters and set the learning
                                 states as well
        """

        if data is not None:
            logger.warning('data is a deprecated argument and will be ignored')

        if 'epoch_index' in model_dict:
            self.epoch_index = model_dict['epoch_index']
        if 'model' not in model_dict:
            logger.error('Using old model serialization format. '
                         'Serialized the model into new format')

            param_layers = [l for l in self.layers_to_optimize]
            param_dict_list = model_dict['layer_params_states']
            for l, ps in zip(param_layers, param_dict_list):
                l.set_params(ps)
                if 'states' in ps and load_states:
                    l.set_states(ps)
            return

        if 'backend' in model_dict:
            if 'compat_mode' in model_dict['backend']:
                self.be.compat_mode = model_dict['backend']['compat_mode']
        else:
            model_dict['backend'] = {}

        typ = model_dict['model']['type']
        main_container = load_class(typ)

        if not hasattr(self, 'layers'):
            self.layers = main_container.gen_class(model_dict['model']['config'])

        self.layers.load_weights(model_dict['model'], load_states)

        if load_states and 'rng_state' in model_dict['backend']:
            try:
                self.be.rng_set_state(model_dict['backend']['rng_state'])
            except ValueError as e:
                # could come about when switching backend types (ex GPU to CPU)
                logger.warning("Problems restoring existing RNG state: %s", str(e))

    # serialize tells how to write out the parameters we've learned so
    # far and associate them with layers. it can ignore layers with no
    # learned parameters. the model stores states to pass to the
    # optimizers.  if we're saving the model out for inference, we
    # don't need to remember states.
    def serialize(self, fn=None, keep_states=True):
        """
        Creates a dictionary storing the layer parameters and epochs complete.

        Arguments:
            fn (str): file to save pkl formatted model dictionary
            keep_states (bool): Whether to save optimizer states.

        Returns:
            dict: Model data including layer parameters and epochs complete.
        """

        # get the model dict with the weights
        pdict = self.get_description(get_weights=True, keep_states=keep_states)
        pdict['epoch_index'] = self.epoch_index + 1
        if self.initialized:
            if not hasattr(self.layers, 'decoder'):
                pdict['train_input_shape'] = self.layers.in_shape
            else:
                # serialize shapes both for encoder and decoder
                pdict['train_input_shape'] = (self.layers.encoder.in_shape +
                                              self.layers.decoder.in_shape)
        if fn is not None:
            save_obj(pdict, fn)
            return
        return pdict

    def set_batch_size(self, N):
        """
        Set the actual minibatch size, so even though the buffers are allocated considering
        excessive padding, the processing for some layers may be shortened.
        Currently most of the neon layers don't use that to control the processing. The
        interface is here only for when someone wants to set that information and experiment.

        Arguments:
            N:

        Returns:

        """
        return self.layers.set_batch_size(N)

    def set_seq_len(self, S):
        """
        Set the actual minibatch sequence length, so even though the buffers are allocated
        considering excessive padding, the processing for some layers may be shortened.
        Currently most of the neon layers don't use that to control the processing. The
        interface is here only for when someone wants to set that information and experiment.

        Arguments:
            S:

        Returns:

        """
        return self.layers.set_seq_len(S)

    def benchmark(self, dataset, inference=False, cost=None, optimizer=None,
                  niterations=20, nskip=2):
        """
        Measure runtime for computing fprop and bprop separately, as well as
        full minibatch run times. For inference case, only the fprop is measured.

        Arguments:
             dataset (NervanaDataIterator) Dataset iterator to perform fit on

             cost (Cost): Defines the function which the model is minimizing based
                          on the output of the last layer and the input labels

             niterations (optional, int): Number of minibatches to average over

             nskip (optional, int): Number of iterations at the beginning to skip
                                    when calculating the runtime statistics
             inference (bool, optional): Is inference use case
             optimizer (Optimizer): Defines the learning rule for updating the model parameters.
        Returns:
            dictionary with fprop, bprop run times
        """
        # initialize model
        if inference is False and (cost is None or optimizer is None):
            raise RuntimeError("Need cost and optimizer to benchmark bprop")

        self.cost = cost
        self.initialize(dataset, cost)
        self.optimizer = optimizer
        self.total_cost = np.empty((1, 1))
        self.total_cost[:] = 0

        # iterate through minibatches of the dataset
        times = OrderedDict()
        time_keys = ['fprop'] if inference else ['fprop', 'bprop', 'iteration']
        for ky in time_keys:
            times[ky] = np.full(niterations + nskip, -1.0)
        count = 0

        fprop_start = self.be.init_mark()
        fprop_end = self.be.init_mark()
        bprop_end = self.be.init_mark()

        while count < niterations + nskip:
            dataset.reset()
            for mb_idx, (x, t) in enumerate(dataset):

                self.be.record_mark(fprop_start)  # mark start of fprop

                x = self.fprop(x, inference)

                if inference is False:
                    self.total_cost[:] = self.total_cost + self.cost.get_cost(x, t)

                self.be.record_mark(fprop_end)  # mark end of fprop and start of bprop

                if inference is False:
                    delta = self.cost.get_errors(x, t)
                    self.bprop(delta)
                    self.optimizer.optimize(self.layers_to_optimize, epoch=0)

                    self.be.record_mark(bprop_end)  # mark end of bprop
                    self.be.synchronize_mark(bprop_end)
                else:
                    self.be.synchronize_mark(fprop_end)

                times['fprop'][count] = self.be.get_time(fprop_start, fprop_end)
                if inference is False:
                    times['bprop'][count] = self.be.get_time(fprop_end, bprop_end)
                    times['iteration'][count] = times['fprop'][count] + times['bprop'][count]

                count += 1
                if count >= niterations + nskip:
                    break

        # print results
        header = ('Func', 'Mean', 'Median', 'Min', 'Max', 'Units')
        stats = tuple(stat.lower() for stat in header[1:-1])

        fmt_titles = '| {:^11} ' * len(header) + '|'
        fmt_nums = '| {func:<11} ' + '|  {%s:<10.5g} ' * len(stats) % (stats) + '| {units:^11} |'

        head_str = fmt_titles.format(*header)
        sep = '-' * len(head_str)
        neon_logger.display(sep)
        neon_logger.display(head_str)
        neon_logger.display(sep)
        out_stats = {}
        for step in times:
            timesu = np.array(times[step][nskip:])  # in ms
            out_stats[step] = {}
            for stat in stats:
                out_stats[step][stat] = getattr(np, stat)(timesu)
            neon_logger.display(fmt_nums.format(units='msec', func=step, **out_stats[step]))
        neon_logger.display(sep)
        return out_stats
Пример #18
0
def general_gradient_comp(layer,
                          inp,
                          epsilon=1.0e-5,
                          loss_scale=None,
                          lshape=None,
                          pert_inds=None,
                          pooling=False):
    # given a layer, test the bprop
    # using finite differences

    # run neon fprop
    layer.reset()
    inpa = layer.be.array(inp.copy())
    in_shape = lshape if lshape is not None else inpa.shape[0]
    layer.configure(in_shape)
    if layer.owns_delta:
        layer.prev_layer = True
    layer.allocate()

    dtree = DeltasTree()
    layer.allocate_deltas(dtree)
    dtree.allocate_buffers()
    layer.set_deltas(dtree)

    out = layer.fprop(inpa).get()

    out_shape = out.shape

    # scale out by random matrix...
    if loss_scale is None:
        loss_scale = np.random.random(out_shape) * 2.0 - 1.0

    # the loss function is:
    # loss_bl = np.sum(loss_scale * out)

    # run bprop, input deltas is rand_scale
    bprop_deltas = layer.bprop(layer.be.array(loss_scale.copy())).get()

    max_abs_err = -1.0
    max_rel_err = -1.0

    inp_pert = inp.copy()
    if pert_inds is None:
        pert_inds = list(range(inp.size))
    for pert_ind in pert_inds:
        save_val = inp_pert.flat[pert_ind]
        # add/subtract perturbations to input
        inp_pert.flat[pert_ind] = save_val + epsilon
        # and run fprop on perturbed input
        layer.reset()
        layer.configure(in_shape)
        layer.allocate()
        inpa = layer.be.array(inp_pert.copy())
        out_pos = layer.fprop(inpa).get().copy()

        inp_pert.flat[pert_ind] = save_val - epsilon
        inpa = layer.be.array(inp_pert.copy())
        layer.reset()
        layer.configure(in_shape)
        layer.allocate()
        out_neg = layer.fprop(inpa).get().copy()

        # calculate the loss on outputs
        loss_pos = np.sum(loss_scale * out_pos)
        loss_neg = np.sum(loss_scale * out_neg)
        grad_est = 0.5 * (loss_pos - loss_neg) / epsilon

        # reset input
        inp_pert.flat[pert_ind] = save_val

        bprop_val = bprop_deltas.flat[pert_ind]

        abs_err = abs(grad_est - bprop_val)
        if abs_err > max_abs_err:
            max_abs_err = abs_err
            max_abs_vals = [grad_est, bprop_val]

        if (abs(grad_est) + abs(bprop_val)) == 0.0:
            rel_err = 0.0
        else:
            rel_err = float(abs_err) / (abs(grad_est) + abs(bprop_val))
        if rel_err > max_rel_err:
            max_rel_err = rel_err
            max_rel_vals = [grad_est, bprop_val]

    neon_logger.display('Worst case diff %e, vals grad: %e, bprop: %e' % (max_abs_err,
                                                                          max_abs_vals[0],
                                                                          max_abs_vals[1]))
    neon_logger.display('Worst case diff %e, vals grad: %e, bprop: %e' % (max_rel_err,
                                                                          max_rel_vals[0],
                                                                          max_rel_vals[1]))
    return (max_abs_err, max_rel_err)
Пример #19
0
def check_lstm(seq_len, input_size, hidden_size,
               batch_size, init_func, inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)
    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()

    dtree = DeltasTree()
    lstm.allocate_deltas(dtree)
    dtree.allocate_buffers()
    lstm.set_deltas(dtree)

    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make ref weights and biases with neon model
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref,
                                                             WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size, hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    neon_logger.display('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(),
                             IFOGf_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(),
                             Ct_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(lstm.outputs.get(),
                             Hout_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref, dh0_ref) = lstm_ref.backward(deltas_ref,
                                                               batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    neon_logger.display('Making sure neon LSTM match numpy LSTM in bprop')
    neon_logger.display('====Verifying update on W_recur====')

    assert allclose_with_out(dWrecur_neon,
                             dWrecur_ref.T,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon,
                             dWinput_ref.T,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(),
                             db_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(),
                             dX_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('bprop is verified')

    return