Example #1
def test_biLSTM_bprop(backend_default, fargs):

    # basic sanity check with 0 weights random inputs
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional rnn
    init_glorot = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(),
                    activation=Tanh(), init=init_glorot, reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()
    bilstm.set_deltas([bilstm.be.iobuf(bilstm.in_shape)])

    # use the same weights for the forward and backward directions
    nout = hidden_size
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # inputs and views
    lr = np.random.random((input_size, seq_len * batch_size))
    lr_rev = list(reversed(get_steps(lr.copy(), in_shape)))
    rl = con(lr_rev, axis=1)

    # allocate gpu buffers
    inp_lr = bilstm.be.array(lr)
    inp_rl = bilstm.be.array(rl)

    # outputs
    out_lr_g = bilstm.fprop(inp_lr)
    out_lr = out_lr_g.get().copy()
    del_lr = bilstm.bprop(out_lr_g).get().copy()
    bilstm.h_buffer[:] = 0
    out_rl_g = bilstm.fprop(inp_rl)
    out_rl = out_rl_g.get().copy()
    del_rl = bilstm.bprop(out_rl_g).get().copy()

    # views
    out_lr_f_s = get_steps(out_lr[:nout], out_shape)
    out_lr_b_s = get_steps(out_lr[nout:], out_shape)
    out_rl_f_s = get_steps(out_rl[:nout], out_shape)
    out_rl_b_s = get_steps(out_rl[nout:], out_shape)

    # asserts
    for x_f, x_b, y_f, y_b in zip(out_lr_f_s, out_lr_b_s,
                                  reversed(out_rl_f_s), reversed(out_rl_b_s)):
        assert np.allclose(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert np.allclose(x_b, y_f, rtol=0.0, atol=1.0e-5)

    del_lr_s = get_steps(del_lr, in_shape)
    del_rl_s = get_steps(del_rl, in_shape)

    for (x, y) in zip(del_lr_s, reversed(del_rl_s)):
        assert np.allclose(x, y, rtol=0.0, atol=1.0e-5)
Example #2
def test_tanh_derivative(backend):
    inputs = np.array([true_tanh(0), true_tanh(1),
                       true_tanh(-2)]).reshape((3, 1))
    # bprop is evaluated on the output: tanh'(x) = 1 - tanh(x)**2
    outputs = np.array(
        [1 - true_tanh(0)**2, 1 - true_tanh(1)**2,
         1 - true_tanh(-2)**2]).reshape((3, 1))
    compare_tensors(Tanh(), inputs, outputs, deriv=True, tol=1e-7)
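The identity this test exercises, d/dx tanh(x) = 1 - tanh(x)^2, can be sanity-checked with plain NumPy (a minimal sketch; true_tanh is assumed to behave like np.tanh):

import numpy as np

x = np.array([0.0, 1.0, -2.0])
analytic = 1 - np.tanh(x) ** 2                         # closed-form derivative
h = 1e-6
numeric = (np.tanh(x + h) - np.tanh(x - h)) / (2 * h)  # central difference
assert np.allclose(analytic, numeric, atol=1e-7)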
Example #3
def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon gru instance
    gru = GRU(hidden_size, Gaussian(),
              activation=Tanh(), gate_activation=Logistic())
    inpa = gru.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    gru.configure((input_size, seq_len))
    gru.allocate()
    out_bl = gru.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = gru.bprop(gru.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_gru(gru)
        gru.allocate()
        out_pos = gru.fprop(gru.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_gru(gru)
        gru.allocate()
        out_neg = gru.fprop(gru.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale*out_pos)
        loss_neg = np.sum(rand_scale*out_neg)
        # compute the gradient estimate
        grad = 0.5*(loss_pos-loss_neg)/epsilon

        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del gru
    return (grads_est, deltas_neon)
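A hypothetical driver for gradient_calc (sizes, epsilon and tolerance below are illustrative, and a backend is assumed to be set up already) would compare the finite-difference estimate against the analytic deltas:

grads_est, deltas_neon = gradient_calc(seq_len=5, input_size=3, hidden_size=4,
                                       batch_size=2, epsilon=1.0e-5)
assert np.allclose(grads_est, deltas_neon, rtol=0.0, atol=1.0e-3)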
Example #4
def create_model(vocab_size, rlayer_type):
    """
    Create LSTM/GRU model for bAbI dataset.

    Args:
        vocab_size (int) : Size of the vocabulary built from the bAbI data.
        rlayer_type (string) : Type of recurrent layer to use (gru or lstm).

    Returns:
        Model : Model of the created network
    """
    # recurrent layer parameters (default gru)
    rlayer_obj = GRU if rlayer_type == 'gru' else LSTM
    rlayer_params = dict(output_size=100,
                         reset_cells=True,
                         init=GlorotUniform(),
                         init_inner=Orthonormal(0.5),
                         activation=Tanh(),
                         gate_activation=Logistic())

    # if using lstm, swap the activation functions
    if rlayer_type == 'lstm':
        rlayer_params.update(
            dict(activation=Logistic(), gate_activation=Tanh()))

    # lookup layer parameters
    lookup_params = dict(vocab_size=vocab_size,
                         embedding_dim=50,
                         init=Uniform(-0.05, 0.05))

    # Model construction
    story_path = [LookupTable(**lookup_params), rlayer_obj(**rlayer_params)]
    query_path = [LookupTable(**lookup_params), rlayer_obj(**rlayer_params)]

    layers = [
        MergeMultistream(layers=[story_path, query_path], merge="stack"),
        Affine(vocab_size, init=GlorotUniform(), activation=Softmax())
    ]

    return Model(layers=layers)
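A usage sketch (hypothetical call site; babi is assumed to be a BABI data object as in Example #21):

model = create_model(vocab_size=babi.vocab_size, rlayer_type='gru')
cost = GeneralizedCost(costfunc=CrossEntropyMulti())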
Example #5
def test_biLSTM_fprop(backend_default, fargs):

    # basic sanity check with 0 weights random inputs
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional rnn
    init_glorot = GlorotUniform()
    bilstm = BiLSTM(hidden_size,
                    gate_activation=Logistic(),
                    init=init_glorot,
                    activation=Tanh(),
                    reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()

    # same weight
    nout = hidden_size
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0

    # inputs - random and flipped left-to-right inputs
    lr = np.random.random((input_size, seq_len * batch_size))
    lr_rev = list(reversed(get_steps(lr.copy(), in_shape)))

    rl = con(lr_rev, axis=1)
    inp_lr = bilstm.be.array(lr)
    inp_rl = bilstm.be.array(rl)

    # outputs
    out_lr = bilstm.fprop(inp_lr).get().copy()

    bilstm.h_buffer[:] = 0
    out_rl = bilstm.fprop(inp_rl).get().copy()

    # views
    out_lr_f_s = get_steps(out_lr[:nout], out_shape)
    out_lr_b_s = get_steps(out_lr[nout:], out_shape)
    out_rl_f_s = get_steps(out_rl[:nout], out_shape)
    out_rl_b_s = get_steps(out_rl[nout:], out_shape)

    # asserts
    for x_f, x_b, y_f, y_b in zip(out_lr_f_s, out_lr_b_s, reversed(out_rl_f_s),
                                  reversed(out_rl_b_s)):
        assert allclose_with_out(x_f, y_b, rtol=0.0, atol=1.0e-5)
        assert allclose_with_out(x_b, y_f, rtol=0.0, atol=1.0e-5)
Example #6
def test_multi_optimizer(backend_default_mkl):
    """
    A test for MultiOptimizer.
    """
    opt_gdm = GradientDescentMomentum(
        learning_rate=0.001, momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)
    init_one = Gaussian(scale=0.01)

    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one,
                bias=Constant(1), activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one, activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one, activation=Logistic(), gate_activation=Tanh())
    layers = [l1, l2, l3, l4]
    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)
    for l in layer_list:
        l.configure(in_obj=(16, 28, 28))
        l.allocate()
    # separate layer_list into two, the last two recurrent layers and the rest
    layer_list1, layer_list2 = layer_list[:-2], layer_list[-2:]
    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Convolution_bias': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})
    layers_to_optimize1 = [l for l in layer_list1 if isinstance(l, ParameterLayer)]
    layers_to_optimize2 = [l for l in layer_list2 if isinstance(l, ParameterLayer)]
    opt.optimize(layers_to_optimize1, 0)
    assert opt.map_list[opt_adam][0].__class__.__name__ == 'Convolution_bias'
    assert opt.map_list[opt_rms][0].__class__.__name__ == 'Linear'
    opt.optimize(layers_to_optimize2, 0)
    assert opt.map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert opt.map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
Example #7
def test_multi_optimizer(backend_default):
    opt_gdm = GradientDescentMomentum(
        learning_rate=0.001, momentum_coef=0.9, wdecay=0.005)
    opt_ada = Adadelta()
    opt_adam = Adam()
    opt_rms = RMSProp()
    opt_rms_1 = RMSProp(gradient_clip_value=5)
    init_one = Gaussian(scale=0.01)

    l1 = Conv((11, 11, 64), strides=4, padding=3,
              init=init_one, bias=Constant(0), activation=Rectlin())
    l2 = Affine(nout=4096, init=init_one,
                bias=Constant(1), activation=Rectlin())
    l3 = LSTM(output_size=1000, init=init_one, activation=Logistic(), gate_activation=Tanh())
    l4 = GRU(output_size=100, init=init_one, activation=Logistic(), gate_activation=Tanh())
    layers = [l1, l2, l3, l4]
    layer_list = []
    for layer in layers:
        if isinstance(layer, list):
            layer_list.extend(layer)
        else:
            layer_list.append(layer)

    opt = MultiOptimizer({'default': opt_gdm,
                          'Bias': opt_ada,
                          'Convolution': opt_adam,
                          'Linear': opt_rms,
                          'LSTM': opt_rms_1,
                          'GRU': opt_rms_1})

    map_list = opt.map_optimizers(layer_list)
    assert map_list[opt_adam][0].__class__.__name__ == 'Convolution'
    assert map_list[opt_ada][0].__class__.__name__ == 'Bias'
    assert map_list[opt_rms][0].__class__.__name__ == 'Linear'
    assert map_list[opt_gdm][0].__class__.__name__ == 'Activation'
    assert map_list[opt_rms_1][0].__class__.__name__ == 'LSTM'
    assert map_list[opt_rms_1][1].__class__.__name__ == 'GRU'
Example #8
File: word_lstm.py Project: ydm2011/neon
                 tokenizer=tokenizer,
                 onehot_input=False)
valid_set = Text(time_steps,
                 valid_path,
                 vocab=train_set.vocab,
                 tokenizer=tokenizer,
                 onehot_input=False)

# weight initialization
init = Uniform(low=-0.1, high=0.1)

# model initialization
rlayer_params = {
    "output_size": hidden_size,
    "init": init,
    "activation": Tanh(),
    "gate_activation": Logistic()
}
if args.rlayer_type == 'lstm':
    rlayer1, rlayer2 = LSTM(**rlayer_params), LSTM(**rlayer_params)
else:
    rlayer1, rlayer2 = GRU(**rlayer_params), GRU(**rlayer_params)

layers = [
    LookupTable(vocab_size=len(train_set.vocab),
                embedding_dim=hidden_size,
                init=init), rlayer1, rlayer2,
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax())
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))
Example #9
def test_reshape_layer_model(backend_default, fargs):
    """
    test cases:
    - conv before RNNs
    - conv after RNNs
    - conv after LUT
    """
    np.random.seed(seed=0)

    nin, nout, bsz = fargs
    be = backend_default
    be.bsz = bsz
    input_size = (nin, be.bsz)

    init = Uniform(-0.1, 0.1)
    g_uni = GlorotUniform()

    inp_np = np.random.rand(nin, be.bsz)
    delta_np = np.random.rand(nout, be.bsz)

    inp = be.array(inp_np)
    delta = be.array(delta_np)

    conv_lut_1 = [
        LookupTable(vocab_size=2000, embedding_dim=400, init=init),
        Reshape(reshape=(4, 100, -1)),
        Conv((3, 3, 16), init=init),
        LSTM(64,
             g_uni,
             activation=Tanh(),
             gate_activation=Logistic(),
             reset_cells=True),
        RecurrentSum(),
        Affine(nout, init, bias=init, activation=Softmax())
    ]

    conv_lut_2 = [
        LookupTable(vocab_size=1000, embedding_dim=400, init=init),
        Reshape(reshape=(4, 50, -1)),
        Conv((3, 3, 16), init=init),
        Pooling(2, strides=2),
        Affine(nout=nout, init=init, bias=init, activation=Softmax()),
    ]

    conv_rnn_1 = [
        LookupTable(vocab_size=2000, embedding_dim=400, init=init),
        LSTM(64,
             g_uni,
             activation=Tanh(),
             gate_activation=Logistic(),
             reset_cells=True),
        Reshape(reshape=(4, 32, -1)),
        Conv((3, 3, 16), init=init),
        Affine(nout, init, bias=init, activation=Softmax())
    ]

    conv_rnn_2 = [
        LookupTable(vocab_size=2000, embedding_dim=400, init=init),
        Recurrent(64, g_uni, activation=Tanh(), reset_cells=True),
        Reshape(reshape=(4, -1, 32)),
        Conv((3, 3, 16), init=init),
        Affine(nout, init, bias=init, activation=Softmax())
    ]

    lut_sum_1 = [
        LookupTable(vocab_size=1000, embedding_dim=128, init=init),
        RecurrentSum(),
        Affine(nout=nout, init=init, bias=init, activation=Softmax()),
    ]

    lut_birnn_1 = [
        LookupTable(vocab_size=1000, embedding_dim=200, init=init),
        DeepBiRNN(32,
                  init=GlorotUniform(),
                  batch_norm=True,
                  activation=Tanh(),
                  reset_cells=True,
                  depth=1),
        Reshape((4, 32, -1)),
        Conv((3, 3, 16), init=init),
        Affine(nout=nout, init=init, bias=init, activation=Softmax())
    ]

    layers_test = [
        conv_lut_1, conv_lut_2, conv_rnn_1, conv_rnn_2, lut_sum_1, lut_birnn_1
    ]

    for lg in layers_test:
        model = Model(layers=lg)
        cost = GeneralizedCost(costfunc=CrossEntropyBinary())
        model.initialize(input_size, cost)
        model.fprop(inp)
        model.bprop(delta)
Example #10
def check_lstm(seq_len,
               input_size,
               hidden_size,
               batch_size,
               init_func,
               inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)
    # run neon fprop
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make ref weights and biases with neon model
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size,
                                             hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    print('====Verifying IFOG====')
    allclose_with_out(lstm.ifog_buffer.get(), IFOGf_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying cell states====')
    allclose_with_out(lstm.c_act_buffer.get(), Ct_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying hidden states====')
    allclose_with_out(lstm.h_buffer.get(), Hout_ref, rtol=0.0, atol=1.0e-5)

    print('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref,
     dh0_ref) = lstm_ref.backward(deltas_ref, batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    print('Making sure neon LSTM matches numpy LSTM in bprop')
    print('====Verifying update on W_recur====')

    assert allclose_with_out(dWrecur_neon,
                             dWrecur_ref.T,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon,
                             dWinput_ref.T,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(),
                             dX_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('bprop is verified')

    return
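A hypothetical invocation of check_lstm (a backend must already be generated, since the helper reads NervanaObject.be; the sizes below are illustrative):

check_lstm(seq_len=5, input_size=3, hidden_size=4, batch_size=2,
           init_func=GlorotUniform())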
Example #11
# Define a custom tokenizer function which, for
# this data, splits into individual words.  This can be passed into the Text
# object during dataset creation as seen below.
def tokenizer(s):
    return s.replace('\n', '<eos>').split()
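# For example (illustrative input, assuming whitespace around the newlines as in
# the PTB text files):
#   tokenizer(" the cat sat \n on the mat \n")
#   -> ['the', 'cat', 'sat', '<eos>', 'on', 'the', 'mat', '<eos>']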

# load data and parse on word-level
train_set = Text(time_steps, train_path, tokenizer=tokenizer, onehot_input=False)
valid_set = Text(time_steps, valid_path, vocab=train_set.vocab, tokenizer=tokenizer,
                 onehot_input=False)

# weight initialization
init = Uniform(low=-0.1, high=0.1)

# model initialization
rlayer_params = {"output_size": hidden_size, "init": init,
                 "activation": Tanh(), "gate_activation": Logistic()}
if args.rlayer_type == 'lstm':
    rlayer1, rlayer2 = LSTM(**rlayer_params), LSTM(**rlayer_params)
else:
    rlayer1, rlayer2 = GRU(**rlayer_params), GRU(**rlayer_params)

layers = [
    LookupTable(vocab_size=len(train_set.vocab), embedding_dim=hidden_size, init=init),
    rlayer1,
    rlayer2,
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax())
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

model = Model(layers=layers)
Example #12
def test_beamsearch(backend_default):
    """
    Simulated beam search on a minibatch of 2, for 4 time steps. The
    LSTM states are real but the "softmax outputs" z are hardcoded and
    not taken from the network.
    The network outputs 6 tokens, with probabilities like exp(1), exp(5),
    exp(7).

    The test asserts that the score_lists assigned by _beamsearch_step(z_list)
    are equal to the probabilities computed manually by adding probabilities
    to z_list.
    """
    be = backend_default

    batch_size = 2
    be.bsz = batch_size
    time_steps = 4
    nout = 6
    num_beams = 3

    # create unused layers
    activation = Tanh()
    gate_activation = Logistic()
    init_ary = np.eye(nout)
    init = Array(init_ary)
    encoder = LSTM(nout,
                   init,
                   activation=activation,
                   gate_activation=gate_activation,
                   name="Enc")
    decoder = LSTM(nout,
                   init,
                   activation=activation,
                   gate_activation=gate_activation,
                   name="Dec")

    class DummyFProp():
        """
        Constructs an artificial beam search example with known correct outputs.
        This is called inside a nested loop over steps, num_life. In the first
        time step there is one life beam, after that, 3 life beams per step.
        There are 4 time steps total. Each beamsearch_step builds one list over
        num_life beams.

        At t=0, the winners for ex0 are 1, 4, 5 (indexed by their position) and
        winners for ex1 are 2,4,5. From there we continue the beam for ex0:
            12, 13, 14              6+2=8 6+3=9  6+2=8
            40, 43, 45  with scores 5+4=9 5+3=8  5+7=12 three new winners 45, 52, 55
            50, 52, 55              5+4=9 5+6=11 5+5=10

        for ex2
            1 4 5  with scores   5 4 7
        we get the three winners 1, 4, 5 and continue (just taking the
        3 in order, no sorting)
            10 12 13 14 (not unique!)  5+2=7  5+2=7  5+3=8
            41 42 43       with scores 4+6=10 4+5=9  4+7=11 winners  43 51 52
            51 52 53                   7+4=11 7+6=13 7+3=10 scores   11 11 13
        continue from the three winners 43 51 52
            431 433 434             11+10=21 11+3=14 11+9=20
            511 512 513 with scores 11+6=17  11+5=16 11+7=18  winners 431 434 520
            520 521 522             13+8=21  13+4=17 13+6=19  scores   21  20  21
        continue from three winners 431 511 513 (going along beams, the matches
        in a beam)
            4310 4312 4313 4314             21+2=23  21+2=23 21+3=24 21+10=31 (not unique!)
            4341 4342 4343      with scores 20+10=30 20+5=25 20+7=27        winners 4314 4341 5204
            5200 5202 5204                  21+8=29  21+6=27 21+10=31       scores    31   30   31
        overall winners are 4314 4341 5204

        """
        def __init__(self):
            self.i = -1
            # t=0
            #                                 X        x  x  <-- winners: 1, 4, 5  (for example 0)
            z = be.array(
                np.exp(np.array([[1, 6, 2, 1, 5, 5], [1, 5, 2, 2, 4, 7]]))).T

            # t=1
            #                                     x  x  x  <-- given we picked 4: new winners 2,3,4
            z1 = be.array(
                np.exp(np.array([[1, 1, 2, 3, 2, 1], [2, 1, 2, 3, 2, 1]]))).T
            #                               x        x     x  <-- given we picked 5:
            #                                                     new winners 0,3,[5]
            #                                                     score 12
            z2 = be.array(
                np.exp(np.array([[4, 1, 2, 3, 1, 7], [2, 6, 5, 7, 2, 4]]))).T
            #                               x     X        X  <-- given we picked 1:
            #                                                     new winners 0,[2],[5]
            #                                                     scores 12, 11
            z3 = be.array(
                np.exp(np.array([[4, 1, 6, 3, 1, 5], [1, 4, 6, 3, 2, 1]]))).T

            # t=2
            # example 0: given constructed (1, 5), score 11: 3, 4; scores 21, 20
            z4 = be.array(
                np.exp(np.array([[1, 1, 2, 10, 9, 1], [2, 10, 2, 3, 9, 1]]))).T
            # example 0: given constructed (5, 5), score 12: none selected from this beam
            z5 = be.array(
                np.exp(np.array([[4, 1, 2, 3, 1, 7], [2, 6, 5, 7, 2, 4]]))).T
            # example 0: given constructed (1, 2), score 12: 1; score 20
            z6 = be.array(
                np.exp(np.array([[4, 8, 6, 3, 1, 5], [8, 4, 6, 3, 1, 1]]))).T

            # t=3
            # example 0: given constructed (1, 5, 4), score 20: 1, score 30
            z7 = be.array(
                np.exp(np.array([[1, 10, 2, 1, 1, 1], [2, 1, 2, 3, 10, 1]]))).T
            # example 0: given constructed (1, 2, 1), score 20: 5, score 30
            z8 = be.array(
                np.exp(np.array([[4, 1, 2, 3, 1, 10], [2, 10, 5, 7, 2, 4]]))).T
            # example 0: given constructed (1, 5, 3), score 21: 4, score 31
            z9 = be.array(
                np.exp(np.array([[4, 8, 6, 3, 10, 5], [8, 4, 6, 3, 10, 1]]))).T

            self.z_list = [z, z1, z2, z3, z4, z5, z6, z7, z8, z9]

        def fprop(self, z, inference=True, init_state=None):
            self.i += 1
            return self.z_list[self.i]

    def final_state():
        return be.zeros_like(decoder.h[-1])

    class InObj(NervanaObject):
        def __init__(self):
            self.shape = (nout, time_steps)
            self.decoder_shape = (nout, time_steps)

    decoder.fprop = DummyFProp().fprop
    layers = Seq2Seq([encoder, decoder], decoder_connections=[0])
    layers.decoder._recurrent[0].final_state = final_state

    in_obj = InObj()
    layers.configure(in_obj)  # made zeros because zeros have shape
    layers.allocate()
    layers.allocate_deltas(None)
    beamsearch = BeamSearch(layers)
    inputs = be.iobuf(in_obj.shape)
    beamsearch.beamsearch(inputs, num_beams=num_beams)

    ex0 = np.array([[1, 5, 4, 1], [1, 2, 1, 5], [1, 5, 3, 4]])
    ex1 = np.array([[5, 1, 4, 4], [5, 1, 1, 1], [5, 2, 0, 4]])

    # extract all candidates
    examples = reformat_samples(beamsearch, num_beams, batch_size)
    assert allclose_with_out(examples[0], ex0)
    assert allclose_with_out(examples[1], ex1)
Example #13
def test_tanh(backend):
    inputs = np.array([0, 1, -2]).reshape((3, 1))
    outputs = np.array([true_tanh(0),
                        true_tanh(1),
                        true_tanh(-2)]).reshape((3, 1))
    compare_tensors(Tanh(), inputs, outputs, tol=1e-7)
Example #14
def create_model(dis_model='dc',
                 gen_model='dc',
                 cost_type='wasserstein',
                 noise_type='normal',
                 im_size=64,
                 n_chan=3,
                 n_noise=100,
                 n_gen_ftr=64,
                 n_dis_ftr=64,
                 depth=4,
                 n_extra_layers=0,
                 batch_norm=True,
                 gen_squash=None,
                 dis_squash=None,
                 dis_iters=5,
                 wgan_param_clamp=None,
                 wgan_train_sched=False):
    """
    Create a GAN model and associated GAN cost function for image generation

    Arguments:
        dis_model (str): Discriminator type, can be 'mlp' for a simple MLP or
                         'dc' for a DC-GAN style model. (defaults to 'dc')
        gen_model (str): Generator type, can be 'mlp' for a simple MLP or
                         'dc' for a DC-GAN style model. (defaults to 'dc')
        cost_type (str): Cost type, can be 'original', 'modified' following
                         Goodfellow2014 or 'wasserstein' following Arjovsky2017
                         (defaults to 'wasserstein')
        noise_type (str): Noise distribution, can be 'uniform' or 'normal'
                          (defaults to 'normal')
        im_size (int): Image size (defaults to 64)
        n_chan (int): Number of image channels (defaults to 3)
        n_noise (int): Number of noise dimensions (defaults to 100)
        n_gen_ftr (int): Number of generator feature maps (defaults to 64)
        n_dis_ftr (int): Number of discriminator feature maps (defaults to 64)
        depth (int): Depth of layers in case of MLP (defaults to 4)
        n_extra_layers (int): Number of extra conv layers in case of DC (defaults to 0)
        batch_norm (bool): Enable batch normalization (defaults to True)
        gen_squash (str or None): Squashing function at the end of generator (defaults to None)
        dis_squash (str or None): Squashing function at the end of discriminator (defaults to None)
        dis_iters (int): Number of discriminator (critic) iterations per generator iteration (defaults to 5)
        wgan_param_clamp (float or None): In case of WGAN weight clamp value, None for others
        wgan_train_sched (bool): Enable training schedule of number of critics (defaults to False)
    """
    assert dis_model in ['mlp', 'dc'], \
        "Unsupported model type for discriminator net, supported: 'mlp' and 'dc'"
    assert gen_model in ['mlp', 'dc'], \
        "Unsupported model type for generator net, supported: 'mlp' and 'dc'"
    assert cost_type in ['original', 'modified', 'wasserstein'], \
        "Unsupported GAN cost function type, supported: 'original', 'modified' and 'wasserstein'"

    # types of final squashing functions
    squash_func = dict(nosquash=Identity(), sym=Tanh(), asym=Logistic())
    if cost_type == 'wasserstein':
        if gen_model == 'mlp':
            gen_squash = gen_squash or 'nosquash'
        elif gen_model == 'dc':
            gen_squash = gen_squash or 'sym'
        dis_squash = dis_squash or 'nosquash'
    else:  # for all GAN costs other than Wasserstein
        gen_squash = gen_squash or 'sym'
        dis_squash = dis_squash or 'asym'

    assert gen_squash in ['nosquash', 'sym', 'asym'], \
        "Unsupported final squashing function for generator," \
        " supported: 'nosquash', 'sym' and 'asym'"
    assert dis_squash in ['nosquash', 'sym', 'asym'], \
        "Unsupported final squashing function for discriminator," \
        " supported: 'nosquash', 'sym' and 'asym'"

    gfa = squash_func[gen_squash]
    dfa = squash_func[dis_squash]

    # create model layers
    if gen_model == 'mlp':
        gen = create_mlp_generator(im_size,
                                   n_chan,
                                   n_gen_ftr,
                                   depth,
                                   batch_norm=False,
                                   finact=gfa)
        noise_dim = (n_noise, )
    elif gen_model == 'dc':
        gen = create_dc_generator(im_size,
                                  n_chan,
                                  n_noise,
                                  n_gen_ftr,
                                  n_extra_layers,
                                  batch_norm,
                                  finact=gfa)
        noise_dim = (n_noise, 1, 1)

    if dis_model == 'mlp':
        dis = create_mlp_discriminator(im_size,
                                       n_dis_ftr,
                                       depth,
                                       batch_norm=False,
                                       finact=dfa)
    elif dis_model == 'dc':
        dis = create_dc_discriminator(im_size,
                                      n_chan,
                                      n_dis_ftr,
                                      n_extra_layers,
                                      batch_norm,
                                      finact=dfa)
    layers = GenerativeAdversarial(generator=Sequential(gen, name="Generator"),
                                   discriminator=Sequential(
                                       dis, name="Discriminator"))

    return GAN(layers=layers, noise_dim=noise_dim, noise_type=noise_type, k=dis_iters,
               wgan_param_clamp=wgan_param_clamp, wgan_train_sched=wgan_train_sched), \
        GeneralizedGANCost(costfunc=GANCost(func=cost_type))
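A usage sketch (illustrative arguments only; the call returns the GAN model together with its cost):

gan_model, gan_cost = create_model(dis_model='dc', gen_model='dc',
                                   cost_type='wasserstein', im_size=64,
                                   n_chan=3, n_noise=100)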
Example #15
# load data
train_set = ImageCaption(path=data_path, max_images=-1)

# weight initialization
init = Uniform(low=-0.08, high=0.08)
init2 = Constant(val=train_set.be.array(train_set.bias_init))

# model initialization
image_path = Sequential([Affine(hidden_size, init, bias=Constant(val=0.0))])
sent_path = Sequential([Affine(hidden_size, init, linear_name='sent')])

layers = [
    MergeMultistream(layers=[image_path, sent_path], merge="recurrent"),
    Dropout(keep=0.5),
    LSTM(hidden_size, init, activation=Logistic(), gate_activation=Tanh(), reset_cells=True),
    Affine(train_set.vocab_size, init, bias=init2, activation=Softmax())
]

cost = GeneralizedCostMask(costfunc=CrossEntropyMulti(usebits=True))

# configure callbacks
checkpoint_model_path = "~/image_caption2.pickle"
if args.callback_args['save_path'] is None:
    args.callback_args['save_path'] = checkpoint_model_path

if args.callback_args['serialize'] is None:
    args.callback_args['serialize'] = 1

model = Model(layers=layers)
Example #16
File: test_gru.py Project: leo-lp/neon-1
def check_gru(seq_len,
              input_size,
              hidden_size,
              batch_size,
              init_func,
              inp_moms=[0.0, 1.0],
              add_init_state=False):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    slice_shape = (hidden_size, batch_size)

    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon GRU
    gru = GRU(hidden_size,
              init_func,
              activation=Tanh(),
              gate_activation=Logistic())

    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inp_dev = gru.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # run neon fprop
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()

    test_buffer = DeltasTree()
    gru.allocate_deltas(test_buffer)
    test_buffer.allocate_buffers()
    gru.set_deltas(test_buffer)

    if add_init_state:
        init_state = np.random.rand(*slice_shape) * inp_moms[1] + inp_moms[0]
        init_state_dev = gru.be.array(init_state)
        gru.fprop(inp_dev, init_state=init_state_dev)
    else:
        gru.fprop(inp_dev)

    # reference numpy GRU
    gru_ref = RefGRU(input_size, hidden_size)
    WGRU = gru_ref.weights

    # make ref weights and biases the same with neon model
    r_range = list(range(hidden_size))
    z_range = list(range(hidden_size, hidden_size * 2))
    c_range = list(range(hidden_size * 2, hidden_size * 3))

    WGRU[gru_ref.weights_ind_br][:] = gru.b.get()[r_range]
    WGRU[gru_ref.weights_ind_bz][:] = gru.b.get()[z_range]
    WGRU[gru_ref.weights_ind_bc][:] = gru.b.get()[c_range]

    WGRU[gru_ref.weights_ind_Wxr][:] = gru.W_input.get()[r_range]
    WGRU[gru_ref.weights_ind_Wxz][:] = gru.W_input.get()[z_range]
    WGRU[gru_ref.weights_ind_Wxc][:] = gru.W_input.get()[c_range]

    WGRU[gru_ref.weights_ind_Rhr][:] = gru.W_recur.get()[r_range]
    WGRU[gru_ref.weights_ind_Rhz][:] = gru.W_recur.get()[z_range]
    WGRU[gru_ref.weights_ind_Rhc][:] = gru.W_recur.get()[c_range]

    # transpose input X and do fprop
    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size,
                                   input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size,
                                         hidden_size).swapaxes(1, 2)

    if add_init_state:
        init_state_ref = init_state.copy()
        (dWGRU_ref, h_ref_list, dh_ref_list, dr_ref_list, dz_ref_list,
         dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref, init_state_ref)
    else:
        (dWGRU_ref, h_ref_list, dh_ref_list, dr_ref_list, dz_ref_list,
         dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(gru.outputs.get(),
                             h_ref_list,
                             rtol=0.0,
                             atol=1.0e-5)

    neon_logger.display('fprop is verified')

    # now test the bprop
    neon_logger.display('Making sure neon GRU matches numpy GRU in bprop')
    gru.bprop(gru.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = gru.dW_input.get()
    dWrecur_neon = gru.dW_recur.get()
    db_neon = gru.db.get()
    dWxr_neon = dWinput_neon[r_range]
    dWxz_neon = dWinput_neon[z_range]
    dWxc_neon = dWinput_neon[c_range]
    dWrr_neon = dWrecur_neon[r_range]
    dWrz_neon = dWrecur_neon[z_range]
    dWrc_neon = dWrecur_neon[c_range]
    dbr_neon = db_neon[r_range]
    dbz_neon = db_neon[z_range]
    dbc_neon = db_neon[c_range]

    drzc_neon = gru.rzhcan_delta_buffer.get()
    dr_neon = drzc_neon[r_range]
    dz_neon = drzc_neon[z_range]
    dc_neon = drzc_neon[c_range]

    dWxr_ref = dWGRU_ref[gru_ref.dW_ind_Wxr]
    dWxz_ref = dWGRU_ref[gru_ref.dW_ind_Wxz]
    dWxc_ref = dWGRU_ref[gru_ref.dW_ind_Wxc]
    dWrr_ref = dWGRU_ref[gru_ref.dW_ind_Rhr]
    dWrz_ref = dWGRU_ref[gru_ref.dW_ind_Rhz]
    dWrc_ref = dWGRU_ref[gru_ref.dW_ind_Rhc]
    dbr_ref = dWGRU_ref[gru_ref.dW_ind_br]
    dbz_ref = dWGRU_ref[gru_ref.dW_ind_bz]
    dbc_ref = dWGRU_ref[gru_ref.dW_ind_bc]

    # neon_logger.display '====Verifying hidden deltas ===='
    neon_logger.display('====Verifying r deltas ====')
    assert allclose_with_out(dr_neon, dr_ref_list, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying z deltas ====')
    assert allclose_with_out(dz_neon, dz_ref_list, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying hcan deltas ====')
    assert allclose_with_out(dc_neon, dc_ref_list, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on W_input====')
    neon_logger.display('dWxr')
    assert allclose_with_out(dWxr_neon, dWxr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWxz')
    assert allclose_with_out(dWxz_neon, dWxz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWxc')
    assert allclose_with_out(dWxc_neon, dWxc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on W_recur====')

    neon_logger.display('dWrr')
    assert allclose_with_out(dWrr_neon, dWrr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWrz')
    assert allclose_with_out(dWrz_neon, dWrz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWrc')
    assert allclose_with_out(dWrc_neon, dWrc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on bias====')
    neon_logger.display('dbr')
    assert allclose_with_out(dbr_neon, dbr_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dbz')
    assert allclose_with_out(dbz_neon, dbz_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dbc')
    assert allclose_with_out(dbc_neon, dbc_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('bprop is verified')

    return
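A hypothetical invocation of check_gru, exercising the optional initial-state path (a backend must already be generated; init_func can be any neon initializer, e.g. GlorotUniform):

check_gru(seq_len=5, input_size=3, hidden_size=4, batch_size=2,
          init_func=GlorotUniform(), add_init_state=True)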
Example #17
                 default_dtype=args.datatype)

# download penn treebank
train_path = load_text('ptb-train', path=args.data_dir)
valid_path = load_text('ptb-valid', path=args.data_dir)

# load data and parse on character-level
train_set = Text(time_steps, train_path)
valid_set = Text(time_steps, valid_path, vocab=train_set.vocab)

# weight initialization
init = Uniform(low=-0.08, high=0.08)

# model initialization
layers = [
    Recurrent(hidden_size, init, Tanh()),
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax())
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

model = Model(layers=layers)

optimizer = RMSProp(clip_gradients=clip_gradients, stochastic_round=args.rounding)

# configure callbacks
callbacks = Callbacks(model, train_set, output_file=args.output_file,
                      valid_set=valid_set, valid_freq=args.validation_freq,
                      progress_bar=args.progress_bar)

# train model
Example #18
train_set = TextNMT(time_steps, train_path, get_prev_target=True, onehot_input=False,
                    split='train', dataset=dataset, subset_pct=args.subset_pct)
valid_set = TextNMT(time_steps, train_path, get_prev_target=False, onehot_input=False,
                    split='valid', dataset=dataset)

# weight initialization
init = Uniform(low=-0.08, high=0.08)

# Standard or Conditional encoder / decoder:
encoder = [LookupTable(vocab_size=len(train_set.s_vocab), embedding_dim=embedding_dim,
                       init=init, name="LUT_en")]
decoder = [LookupTable(vocab_size=len(train_set.t_vocab), embedding_dim=embedding_dim,
                       init=init, name="LUT_de")]
decoder_connections = []  # link up recurrent layers
for ii in range(num_layers):
    encoder.append(GRU(hidden_size, init, activation=Tanh(), gate_activation=Logistic(),
                       reset_cells=True, name="GRU1Enc"))
    decoder.append(GRU(hidden_size, init, activation=Tanh(), gate_activation=Logistic(),
                       reset_cells=True, name="GRU1Dec"))
    decoder_connections.append(ii)
decoder.append(Affine(train_set.nout, init, bias=init, activation=Softmax(), name="Affout"))

layers = Seq2Seq([encoder, decoder],
                 decoder_connections=decoder_connections,
                 name="Seq2Seq")

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

model = Model(layers=layers)

optimizer = RMSProp(gradient_clip_value=gradient_clip_value, stochastic_round=args.rounding)
Example #19
def gradient_calc(seq_len,
                  input_size,
                  hidden_size,
                  batch_size,
                  epsilon=None,
                  rand_scale=None,
                  inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon lstm instance
    lstm = LSTM(hidden_size,
                Gaussian(),
                activation=Tanh(),
                gate_activation=Logistic())
    inpa = lstm.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()
    lstm.set_deltas([lstm.be.iobuf(lstm.in_shape)])
    out_bl = lstm.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = lstm.bprop(lstm.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_lstm(lstm)
        lstm.allocate()
        out_pos = lstm.fprop(lstm.be.array(inp_pert)).get()

        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_lstm(lstm)
        lstm.allocate()
        out_neg = lstm.fprop(lstm.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)
        # compute the gradient estimate
        grad = 0.5 / float(epsilon) * (loss_pos - loss_neg)

        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del lstm
    return (grads_est, deltas_neon)
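A hypothetical driver might compare the estimate against the analytic deltas for several perturbation sizes (values below are illustrative only):

for eps in (1.0e-4, 1.0e-5, 1.0e-6):
    grads_est, deltas_neon = gradient_calc(seq_len=5, input_size=3,
                                           hidden_size=4, batch_size=2,
                                           epsilon=eps)
    print(np.max(np.abs(grads_est - deltas_neon)))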
Example #20
           Affine(nout=1, init=init, bias=init, activation=lrelu)] #E primary
branch3 = [b1,Linear(1, init=Constant(val=1.0))] #SUM ECAL

D_layers = Tree([branch1, branch2, branch3], name="Discriminator") #keep weight between branches equal to 1. for now (alphas=(1.,1.,1.) as by default )
# generator using convolution layers
init_gen = Gaussian(scale=0.001)
relu = Rectlin(slope=0)  # relu for generator
pad1 = dict(pad_h=2, pad_w=2, pad_d=2)
str1 = dict(str_h=2, str_w=2, str_d=2)
conv1 = dict(init=init_gen, batch_norm=False, activation=lrelu, padding=pad1, strides=str1, bias=init_gen)
pad2 = dict(pad_h=2, pad_w=2, pad_d=2)
str2 = dict(str_h=2, str_w=2, str_d=2)
conv2 = dict(init=init_gen, batch_norm=False, activation=lrelu, padding=pad2, strides=str2, bias=init_gen)
pad3 = dict(pad_h=0, pad_w=0, pad_d=0)
str3 = dict(str_h=1, str_w=1, str_d=1)
conv3 = dict(init=init_gen, batch_norm=False, activation=Tanh(), padding=pad3, strides=str3, bias=init_gen)
bg = BranchNode("bg")
branchg  = [bg,
            Affine(1024, init=init_gen, bias=init_gen, activation=relu),
            BatchNorm(),
            Affine(8 * 7 * 7 * 7, init=init_gen, bias=init_gen),
            Reshape((8, 7, 7, 7)),
            Deconv((6, 6, 6, 6), **conv1), #14x14x14
            BatchNorm(),
            # Linear(5 * 14 * 14 * 14, init=init),
            # Reshape((5, 14, 14, 14)),
            Deconv((5, 5, 5, 64), **conv2), #27x27x27
            BatchNorm(),
            Conv((3, 3, 3, 1), **conv3)
           ]
Example #21
# setup backend
be = gen_backend(**extract_valid_args(args, gen_backend))

# load the bAbI dataset
babi = BABI(path=args.data_dir, task=task, subset=subset)
train_set = QA(*babi.train)
valid_set = QA(*babi.test)

# recurrent layer parameters (default gru)
rlayer_obj = GRU if args.rlayer_type == 'gru' else LSTM
rlayer_params = dict(output_size=100,
                     reset_cells=True,
                     init=GlorotUniform(),
                     init_inner=Orthonormal(0.5),
                     activation=Tanh(),
                     gate_activation=Logistic())

# if using lstm, swap the activation functions
if args.rlayer_type == 'lstm':
    rlayer_params.update(dict(activation=Logistic(), gate_activation=Tanh()))

# lookup layer parameters
lookup_params = dict(vocab_size=babi.vocab_size,
                     embedding_dim=50,
                     init=Uniform(-0.05, 0.05))

# Model construction
story_path = [LookupTable(**lookup_params), rlayer_obj(**rlayer_params)]
query_path = [LookupTable(**lookup_params), rlayer_obj(**rlayer_params)]
Example #22
def check_rnn(seq_len,
              input_size,
              hidden_size,
              batch_size,
              init_func,
              inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [ mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size,
                                   input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size,
                                         hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list, dh_ref_list,
     d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    print('====Verifying hidden states====')
    print(allclose_with_out(rnn.h_buffer.get(),
                            h_ref_list,
                            rtol=0.0,
                            atol=1.0e-5))
    print('fprop is verified')

    print('====Verifying update on W and b ====')
    print('dWxh')
    assert allclose_with_out(dWxh_neon, dWxh_ref, rtol=0.0, atol=1.0e-5)
    print('dWhh')
    assert allclose_with_out(dWhh_neon, dWhh_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on bias====')
    print('db')
    assert allclose_with_out(db_neon, db_ref, rtol=0.0, atol=1.0e-5)

    print('bprop is verified')

    return
Example #23
be = gen_backend(**extract_valid_args(args, gen_backend))
be.bsz = 1

# define same model as in train
init_glorot = GlorotUniform()
init_emb = Uniform(low=-0.1 / embedding_dim, high=0.1 / embedding_dim)
nclass = 2
layers = [
    LookupTable(vocab_size=vocab_size,
                embedding_dim=embedding_dim,
                init=init_emb,
                pad_idx=0,
                update=True),
    LSTM(hidden_size,
         init_glorot,
         activation=Tanh(),
         gate_activation=Logistic(),
         reset_cells=True),
    RecurrentSum(),
    Dropout(keep=0.5),
    Affine(nclass, init_glorot, bias=init_glorot, activation=Softmax())
]

# load the weights
print("Initialized the models - ")
model_new = Model(layers=layers)
print("Loading the weights from {0}".format(args.model_weights))

model_new.load_params(args.model_weights)
model_new.initialize(dataset=(sentence_length, batch_size))
Example #24
gradient_clip_value = 5

# download shakespeare text
data_path = load_shakespeare(path=args.data_dir)
train_path, valid_path = Text.create_valid_file(data_path)

# load data and parse on character-level
train_set = Text(time_steps, train_path)
valid_set = Text(time_steps, valid_path, vocab=train_set.vocab)

# weight initialization
init = Uniform(low=-0.08, high=0.08)

# model initialization
layers = [
    LSTM(hidden_size, init, activation=Logistic(), gate_activation=Tanh()),
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax())
]
model = Model(layers=layers)

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

optimizer = RMSProp(gradient_clip_value=gradient_clip_value,
                    stochastic_round=args.rounding)

# configure callbacks
callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args)

# fit and validate
model.fit(train_set,
          optimizer=optimizer,
Example #25
print "Vocab size - ", vocab_size
print "Sentence Length - ", sentence_length
print "# of train sentences", X_train.shape[0]
print "# of test sentence", X_test.shape[0]

train_set = ArrayIterator(X_train, y_train, nclass=2)
valid_set = ArrayIterator(X_test, y_test, nclass=2)

# weight initialization
uni = Uniform(low=-0.1 / embedding_dim, high=0.1 / embedding_dim)
g_uni = GlorotUniform()

if args.rlayer_type == 'lstm':
    rlayer = LSTM(hidden_size,
                  g_uni,
                  activation=Tanh(),
                  gate_activation=Logistic(),
                  reset_cells=True)
elif args.rlayer_type == 'bilstm':
    rlayer = DeepBiLSTM(hidden_size,
                        g_uni,
                        activation=Tanh(),
                        depth=1,
                        gate_activation=Logistic(),
                        reset_cells=True)
elif args.rlayer_type == 'rnn':
    rlayer = Recurrent(hidden_size, g_uni, activation=Tanh(), reset_cells=True)
elif args.rlayer_type == 'birnn':
    rlayer = DeepBiRNN(hidden_size,
                       g_uni,
                       activation=Tanh(),
Example #26
File: char_rnn.py Project: zmoon111/neon
gradient_clip_value = None

# setup backend
be = gen_backend(**extract_valid_args(args, gen_backend))

# download penn treebank
dataset = PTB(time_steps, path=args.data_dir)
train_set = dataset.train_iter
valid_set = dataset.valid_iter

# weight initialization
init = Uniform(low=-0.08, high=0.08)

# model initialization
layers = [
    Recurrent(hidden_size, init, activation=Tanh()),
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax())
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

model = Model(layers=layers)

optimizer = RMSProp(gradient_clip_value=gradient_clip_value,
                    stochastic_round=args.rounding)

# configure callbacks
callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args)

# train model
model.fit(train_set,
Example #27
                                     seq_len,
                                     return_sequences=return_sequences)
    valid_set = DataIteratorSequence(time_series.test,
                                     seq_len,
                                     return_sequences=return_sequences)

    # define weights initialization
    init = GlorotUniform()  # Uniform(low=-0.08, high=0.08)

    # define model: model is different for the 2 strategies (sequence target or not)
    if return_sequences is True:
        layers = [
            LSTM(hidden,
                 init,
                 activation=Logistic(),
                 gate_activation=Tanh(),
                 reset_cells=False),
            Affine(train_set.nfeatures, init, bias=init, activation=Identity())
        ]
    else:
        layers = [
            LSTM(hidden,
                 init,
                 activation=Logistic(),
                 gate_activation=Tanh(),
                 reset_cells=True),
            RecurrentLast(),
            Affine(train_set.nfeatures, init, bias=init, activation=Identity())
        ]

    model = Model(layers=layers)
Example #28
print "Vocab size - ", vocab_size
print "Sentence Length - ", sentence_length
print "# of train sentences", X_train.shape[0]
print "# of test sentence", X_test.shape[0]

train_set = DataIterator(X_train, y_train, nclass=2)
valid_set = DataIterator(X_test, y_test, nclass=2)

# weight initialization
init_emb = Uniform(low=-0.1/embedding_dim, high=0.1/embedding_dim)
init_glorot = GlorotUniform()

layers = [
    LookupTable(vocab_size=vocab_size, embedding_dim=embedding_dim, init=init_emb),
    LSTM(hidden_size, init_glorot, activation=Tanh(),
         gate_activation=Logistic(), reset_cells=True),
    RecurrentSum(),
    Dropout(keep=0.5),
    Affine(2, init_glorot, bias=init_glorot, activation=Softmax())
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))
metric = Accuracy()

model = Model(layers=layers)

optimizer = Adagrad(learning_rate=0.01, clip_gradients=clip_gradients)


# configure callbacks
Example #29
                 default_dtype=args.datatype)

# download penn treebank
train_path = load_text('ptb-train', path=args.data_dir)
valid_path = load_text('ptb-valid', path=args.data_dir)

# load data and parse on character-level
train_set = Text(time_steps, train_path)
valid_set = Text(time_steps, valid_path, vocab=train_set.vocab)

# weight initialization
init = Uniform(low=-0.08, high=0.08)

# model initialization
if rlayer_type == 'lstm':
    rlayer = LSTM(hidden_size, init, activation=Logistic(), gate_activation=Tanh())
elif rlayer_type == 'gru':
    rlayer = GRU(hidden_size, init, activation=Tanh(), gate_activation=Logistic())
else:
    raise NotImplementedError('%s layer not implemented' % rlayer_type)

layers = [
    rlayer,
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax())
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

model = Model(layers=layers)

optimizer = RMSProp(clip_gradients=clip_gradients, stochastic_round=args.rounding)
Example #30
# sent2vec network
nhidden = 2400
gradient_clip_norm = 5.0

train_set = SentenceHomogenous(data_file=data_file, sent_name='train', text_name='report_train',
                               nwords=vocab_size_layer, max_len=args.max_len_w,
                               index_from=index_from)


if valid_split and valid_split > 0.0:
    valid_set = SentenceHomogenous(data_file=data_file, sent_name='valid',
                                   text_name='report_valid', nwords=vocab_size_layer,
                                   max_len=args.max_len_w, index_from=index_from)

skip = SkipThought(vocab_size_layer, embed_dim, init_embed_dev, nhidden, rec_layer=GRU,
                   init_rec=Orthonormal(), activ_rec=Tanh(), activ_rec_gate=Logistic(),
                   init_ff=Uniform(low=-0.1, high=0.1), init_const=Constant(0.0))
model = Model(skip)

if args.model_file and os.path.isfile(args.model_file):
    neon_logger.display("Loading saved weights from: {}".format(args.model_file))
    model_dict = load_obj(args.model_file)
    model.deserialize(model_dict, load_states=True)
elif args.model_file:
    neon_logger.display("Unable to find model file {}, restarting training.".
                        format(args.model_file))

cost = Multicost(costs=[GeneralizedCostMask(costfunc=CrossEntropyMulti(usebits=True)),
                        GeneralizedCostMask(costfunc=CrossEntropyMulti(usebits=True))],
                 weights=[1, 1])