Example #1
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    W_word_embedding = snli.weight / \
        (numpy.linalg.norm(snli.weight, axis=1).reshape(
            snli.weight.shape[0], 1) + 0.00001)
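All five snippets assume roughly the same module preamble. The sketch below is an assumption reconstructed from the names they use: the hyperparameter values are illustrative placeholders, and SNLI, DenseLayer3DInput and the other custom layers come from the surrounding project, not from Lasagne itself.

# Assumed preamble (sketch only; values are placeholders).
import os
import time
import pdb
import cPickle  # Python 2; use pickle on Python 3

import numpy
import theano
import theano.tensor as T
import lasagne
from lasagne import init, nonlinearities
from lasagne.layers import Gate

# Project-specific helpers assumed to exist alongside these snippets:
# from snli_data import SNLI
# from custom_layers import (DenseLayer3DInput, Softmax3D, ApplyAttention,
#                            ComputeEmbeddingPool, AttendOnEmbedding,
#                            MeanOverDim, GatedEncoder3D)

NEPOCH = 12         # number of training epochs (placeholder)
BSIZE = 50          # minibatch size
LR = 0.05           # adagrad learning rate
DPOUT = 0.2         # dropout probability
UPDATEWE = True     # whether word embeddings receive gradient updates
filename = 'model'  # stem of the parameter pickle under params/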
Example #2
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")
    ########### sentence embedding encoder ###########
    """
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack((numpy.random.randint(1, 10000, (BSIZE, 20), 'int32'),
                                             numpy.zeros((BSIZE, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var)
    
    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack((numpy.ones((BSIZE, 20), dtype='int32'),
                                             numpy.zeros((BSIZE, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_mask)

    # output shape (BSIZE, None, WEDIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)
    """

    ########### input layers ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack((numpy.random.randint(1, 10000, (BSIZE, 18), 'int32'),
                                               numpy.zeros((BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var_h)
    
    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones((BSIZE, 18), dtype='int32'),
                                                numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_mask_h)
    
    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack((numpy.random.randint(1, 10000, (BSIZE, 16), 'int32'),
                                               numpy.zeros((BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var_p)
    
    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones((BSIZE, 16), dtype='int32'),
                                                numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_mask_p)
    ###################################

    # output shape (BSIZE, None, WEDIM)
    l_hypo_embed = lasagne.layers.EmbeddingLayer(
        l_in_h,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)
    
    l_prem_embed = lasagne.layers.EmbeddingLayer(
        l_in_p,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=l_hypo_embed.W)

    # ATTEND
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(l_hypo_embed, p=DPOUT, rescale=True)
    l_hypo_embed_hid1 = DenseLayer3DInput(
        l_hypo_embed_dpout, num_units=EMBDHIDA, nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_hypo_embed_hid1, p=DPOUT, rescale=True)
    l_hypo_embed_hid2 = DenseLayer3DInput(
        l_hypo_embed_hid1_dpout, num_units=EMBDHIDB, nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_embed_dpout = lasagne.layers.DropoutLayer(l_prem_embed, p=DPOUT, rescale=True)
    l_prem_embed_hid1 = DenseLayer3DInput(
        l_prem_embed_dpout, num_units=EMBDHIDA, nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_prem_embed_hid1, p=DPOUT, rescale=True)
    l_prem_embed_hid2 = DenseLayer3DInput(
        l_prem_embed_hid1_dpout, num_units=EMBDHIDB, nonlinearity=lasagne.nonlinearities.rectify)
    
    # output dim: (BSIZE, NROWx, NROWy)
    l_e = ComputeEmbeddingPool([l_hypo_embed_hid2, l_prem_embed_hid2])
    # output dim: (BSIZE, NROWy, DIM)
    l_hypo_weighted = AttendOnEmbedding([l_hypo_embed, l_e], masks=[l_mask_h, l_mask_p], direction='col')
    # output dim: (BSIZE, NROWx, DIM)
    l_prem_weighted = AttendOnEmbedding([l_prem_embed, l_e], masks=[l_mask_h, l_mask_p], direction='row')

    # COMPARE
    # output dim: (BSIZE, NROW, 2*WEDIM)
    l_hypo_premwtd = lasagne.layers.ConcatLayer([l_hypo_embed, l_prem_weighted], axis=2)
    l_prem_hypowtd = lasagne.layers.ConcatLayer([l_prem_embed, l_hypo_weighted], axis=2)

    l_hypo_premwtd_dpout = lasagne.layers.DropoutLayer(l_hypo_premwtd, p=DPOUT, rescale=True)
    l_hypo_comphid1 = DenseLayer3DInput(
        l_hypo_premwtd_dpout, num_units=COMPHIDA, nonlinearity=lasagne.nonlinearities.rectify)
    
    l_hypo_comphid1_dpout = lasagne.layers.DropoutLayer(l_hypo_comphid1, p=DPOUT, rescale=True)
    l_hypo_comphid2 = DenseLayer3DInput(
        l_hypo_comphid1_dpout, num_units=COMPHIDB, nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_hypowtd_dpout = lasagne.layers.DropoutLayer(l_prem_hypowtd, p=DPOUT, rescale=True)
    l_prem_comphid1 = DenseLayer3DInput(
        l_prem_hypowtd_dpout, num_units=COMPHIDA,
        W=l_hypo_comphid1.W, b=l_hypo_comphid1.b, nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_comphid1_dpout = lasagne.layers.DropoutLayer(l_prem_comphid1, p=DPOUT, rescale=True)
    l_prem_comphid2 = DenseLayer3DInput(
        l_prem_comphid1_dpout, num_units=COMPHIDB,
        W=l_hypo_comphid2.W, b=l_hypo_comphid2.b, nonlinearity=lasagne.nonlinearities.rectify)

    # AGGREGATE
    # output dim: (BSIZE, COMPHIDB)
    l_hypo_mean = MeanOverDim(l_hypo_comphid2, mask=l_mask_h, dim=1)
    l_prem_mean = MeanOverDim(l_prem_comphid2, mask=l_mask_p, dim=1)

    l_v1v2 = lasagne.layers.ConcatLayer([l_hypo_mean, l_prem_mean], axis=1)

    l_v1v2_dpout = lasagne.layers.DropoutLayer(l_v1v2, p=DPOUT, rescale=True)
    l_outhid = lasagne.layers.DenseLayer(
        l_v1v2_dpout, num_units=OUTHID, nonlinearity=lasagne.nonlinearities.rectify)

    l_outhid_dpout = lasagne.layers.DropoutLayer(l_outhid, p=DPOUT, rescale=True)
    l_output = lasagne.layers.DenseLayer(
        l_outhid_dpout, num_units=3, nonlinearity=lasagne.nonlinearities.softmax)


    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1,] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))
    
    network_output_clean = lasagne.layers.get_output(l_output, deterministic=True) 
    network_prediction_clean = T.argmax(network_output_clean, axis=1) 
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values)) 

    cost = T.mean(T.nnet.categorical_crossentropy(network_output, target_values))
    cost_clean = T.mean(T.nnet.categorical_crossentropy(network_output_clean, target_values))

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output)
    if not UPDATEWE:
        all_params.remove(l_hypo_embed.W)

    numparams = sum([numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(), numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)
   
    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost, error_rate], updates=updates)
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    compute_cost = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost_clean, error_rate_clean])
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches
        
        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error
        
        return set_cost, set_error_rate
    
    print("Done. Evaluating scratch model ...")
    dev_set_cost,  dev_set_error  = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" % (dev_set_cost,  dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()
            
            for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                  1.0 / batches_seen * _error
                if batches_seen % 100 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f"  % (
                        batches_seen * BSIZE,
                        end - start,
                        LR,
                        train_set_cost,
                        train_set_error))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost,  dev_set_error  = evaluate('dev')
                    print("***dev cost %f, error %f" % (dev_set_cost,  dev_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(all_param_values,
                         open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost,  dev_set_error  = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" % (
                epoch,
                train_set_cost,     dev_set_cost,   test_set_cost,
                train_set_error,    dev_set_error,  test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
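Both the training loop and evaluate() above keep running averages with an incremental-mean update rather than summing and dividing at the end. A standalone numpy check (toy data only, no other assumptions) that the update reproduces the plain mean:

# After n batches, running = (1 - 1/n) * running + (1/n) * batch_cost
# equals the arithmetic mean of the first n batch costs.
import numpy

batch_costs = numpy.random.rand(37)
running = 0.0
for n, c in enumerate(batch_costs, 1):
    running = (1.0 - 1.0 / n) * running + (1.0 / n) * c
assert numpy.allclose(running, batch_costs.mean())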
Example #3
def main(num_epochs=NUM_EPOCHS):
    print("Loading data ...")
    snli = SNLI(batch_size=BATCH_SIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")
    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 20), 'int32'), numpy.zeros(
            (50, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                     input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack((numpy.ones(
        (50, 20), dtype='int32'), numpy.zeros((50, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_mask)

    # output shape (BATCH_SIZE, None, WE_DIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)  # how to set it to be non-trainable?
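    # One way to freeze the embeddings (an assumption, not part of the
    # original snippet): drop the 'trainable' tag from the parameter, e.g.
    #     l_word_embed.params[l_word_embed.W].discard('trainable')
    # so that lasagne.layers.get_all_params(..., trainable=True) skips it.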

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTM_HIDDEN,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GRAD_CLIP)

    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTM_HIDDEN,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GRAD_CLIP,
        backwards=True)

    # output dim: (BATCH_SIZE, None, 2*LSTM_HIDDEN)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)

    # Attention mechanism to get sentence embedding
    # output dim: (BATCH_SIZE, None, ATTENTION_HIDDEN)
    l_ws1 = DenseLayer3DInput(l_concat, num_units=ATTENTION_HIDDEN)
    # output dim: (BATCH_SIZE, None, N_ROWS)
    l_ws2 = DenseLayer3DInput(l_ws1, num_units=N_ROWS, nonlinearity=None)
    l_annotations = Softmax3D(l_ws2, mask=l_mask)
    # output dim: (BATCH_SIZE, 2*LSTM_HIDDEN, N_ROWS)
    l_sentence_embedding = ApplyAttention([l_annotations, l_concat])

    # beam search? Bi lstm in the sentence embedding layer? etc.

    ########### get embeddings for hypothesis and premise ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 18), 'int32'), numpy.zeros(
            (50, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones(
        (50, 18), dtype='int32'), numpy.zeros((50, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 16), 'int32'), numpy.zeros(
            (50, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones(
        (50, 16), dtype='int32'), numpy.zeros((50, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                         input_var=input_mask_p)

    hypothesis_embedding, hypothesis_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_h.input_var,
            l_mask: l_mask_h.input_var
        })
    premise_embedding, premise_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_p.input_var,
            l_mask: l_mask_p.input_var
        })

    ########### gated encoder and output MLP ##########
    l_hypo_embed = lasagne.layers.InputLayer(shape=(BATCH_SIZE, N_ROWS,
                                                    2 * LSTM_HIDDEN),
                                             input_var=hypothesis_embedding)
    l_pre_embed = lasagne.layers.InputLayer(shape=(BATCH_SIZE, N_ROWS,
                                                   2 * LSTM_HIDDEN),
                                            input_var=premise_embedding)

    # output dim: (BATCH_SIZE, 2*LSTM_HIDDEN, N_ROWS)
    l_factors = GatedEncoder3D([l_hypo_embed, l_pre_embed],
                               num_hfactors=2 * LSTM_HIDDEN)

    # Dropout:
    l_factors_noise = lasagne.layers.DropoutLayer(l_factors,
                                                  p=GAEREG,
                                                  rescale=True)

    # l_hids = DenseLayer3DWeight()

    l_outhid = lasagne.layers.DenseLayer(
        l_factors_noise,
        num_units=OUT_HIDDEN,
        nonlinearity=lasagne.nonlinearities.rectify)

    # Dropout:
    l_outhid_noise = lasagne.layers.DropoutLayer(l_outhid,
                                                 p=GAEREG,
                                                 rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_noise,
        num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1] * 50, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_output_clean = lasagne.layers.get_output(l_output,
                                                     deterministic=True)

    # penalty term and cost
    attention_penalty = T.mean(
        (
            T.batched_dot(
                hypothesis_annotation,
                # pay attention to this line:
                # T.extra_ops.cpu_contiguous(hypothesis_annotation.dimshuffle(0, 2, 1))
                hypothesis_annotation.dimshuffle(0, 2, 1)) -
            T.eye(hypothesis_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2)
    ) + T.mean(
        (
            T.batched_dot(
                premise_annotation,
                # T.extra_ops.cpu_contiguous(premise_annotation.dimshuffle(0, 2, 1))  # ditto.
                premise_annotation.dimshuffle(0, 2, 1)  # ditto.
            ) - T.eye(premise_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2))

    cost = T.mean(T.nnet.categorical_crossentropy(network_output, target_values) + \
                  ATTENTION_PENALTY * attention_penalty)
    cost_clean = T.mean(T.nnet.categorical_crossentropy(network_output_clean, target_values) + \
                        ATTENTION_PENALTY * attention_penalty)

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output) + \
                 lasagne.layers.get_all_params(l_sentence_embedding)
    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}".format(numparams))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # withoutwe_params = all_params + [l_word_embed.W]

    # Compute updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values))

    train = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost, error_rate],
                            updates=updates)
    compute_cost = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost_clean, error_rate_clean])

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error

        return set_cost, set_error_rate

    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" %
          (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm,
                               truth) in enumerate(train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                  1.0 / batches_seen * _error
                if batches_seen % 100 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" %
                          (batches_seen * BATCH_SIZE, LEARNING_RATE,
                           end - start, train_set_cost, train_set_error))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    test_set_cost, test_set_error = evaluate('test')
                    print("***dev  cost %f, error %f" %
                          (dev_set_cost, dev_set_error))
                    print("***test cost %f, error %f" %
                          (test_set_cost, test_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            # load params
            # all_param_values = cPickle.load(open('params' + os.sep + 'params_' + filename, 'rb'))
            # for p, v in zip(all_params, all_param_values):
            #     p.set_value(v)

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
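The attention_penalty in Example #3 is the redundancy term ||A·A^T − I||^2 of the self-attentive sentence-embedding model, averaged over the batch. A numpy-only sketch of the same quantity on a toy annotation tensor (shapes chosen arbitrarily):

# A holds one softmax-normalized attention distribution per row; the
# penalty measures how far A @ A.T is from the identity, i.e. how much
# the attention rows overlap.
import numpy

batch, n_rows, length = 4, 5, 20
A = numpy.random.rand(batch, n_rows, length)
A /= A.sum(axis=2, keepdims=True)          # each row sums to 1

AAt = numpy.einsum('brl,bsl->brs', A, A)   # batched A @ A.T
penalty = ((AAt - numpy.eye(n_rows)[None, :, :]) ** 2).mean()
print(penalty)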
Example #4
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    W_word_embedding = snli.weight / \
                       (numpy.linalg.norm(snli.weight, axis=1).reshape(snli.weight.shape[0], 1) + \
                        0.00001)
    del snli

    print("Building network ...")
    ########### input layers ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18),
                              'int32'), numpy.zeros(
                                  (BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 18), dtype='int32'), numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16),
                              'int32'), numpy.zeros(
                                  (BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 16), dtype='int32'), numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_p)
    ###################################

    # output shape (BSIZE, None, WEDIM)
    l_hypo_embed = lasagne.layers.EmbeddingLayer(
        l_in_h,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    l_prem_embed = lasagne.layers.EmbeddingLayer(
        l_in_p,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=l_hypo_embed.W)

    # EMBEDDING MAPPING: output shape (BSIZE, None, WEMAP)
    l_hypo_reduced_embed = DenseLayer3DInput(l_hypo_embed,
                                             num_units=WEMAP,
                                             W=init.Normal(),
                                             b=init.Constant(0.),
                                             nonlinearity=None)
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(l_hypo_reduced_embed,
                                                     p=DPOUT,
                                                     rescale=True)
    l_prem_reduced_embed = DenseLayer3DInput(l_prem_embed,
                                             num_units=WEMAP,
                                             W=init.Normal(),
                                             b=init.Constant(0.),
                                             nonlinearity=None)
    l_prem_embed_dpout = lasagne.layers.DropoutLayer(l_prem_reduced_embed,
                                                     p=DPOUT,
                                                     rescale=True)

    # ATTEND
    l_hypo_embed_hid1 = DenseLayer3DInput(
        l_hypo_embed_dpout,
        num_units=EMBDHIDA,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_hypo_embed_hid1,
                                                          p=DPOUT,
                                                          rescale=True)
    l_hypo_embed_hid2 = DenseLayer3DInput(
        l_hypo_embed_hid1_dpout,
        num_units=EMBDHIDB,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_embed_hid1 = DenseLayer3DInput(
        l_prem_embed_dpout,
        num_units=EMBDHIDA,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_prem_embed_hid1,
                                                          p=DPOUT,
                                                          rescale=True)
    l_prem_embed_hid2 = DenseLayer3DInput(
        l_prem_embed_hid1_dpout,
        num_units=EMBDHIDB,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    # output dim: (BSIZE, NROWx, NROWy)
    l_e = ComputeEmbeddingPool([l_hypo_embed_hid2, l_prem_embed_hid2])
    # output dim: (BSIZE, NROWy, DIM)
    l_hypo_weighted = AttendOnEmbedding([l_hypo_reduced_embed, l_e],
                                        masks=[l_mask_h, l_mask_p],
                                        direction='col')
    # output dim: (BSIZE, NROWx, DIM)
    l_prem_weighted = AttendOnEmbedding([l_prem_reduced_embed, l_e],
                                        masks=[l_mask_h, l_mask_p],
                                        direction='row')

    # COMPARE
    # output dim: (BSIZE, NROW, 2*WEMAP)
    l_hypo_premwtd = lasagne.layers.ConcatLayer(
        [l_hypo_reduced_embed, l_prem_weighted], axis=2)
    l_prem_hypowtd = lasagne.layers.ConcatLayer(
        [l_prem_reduced_embed, l_hypo_weighted], axis=2)

    l_hypo_premwtd_dpout = lasagne.layers.DropoutLayer(l_hypo_premwtd,
                                                       p=DPOUT,
                                                       rescale=True)
    l_hypo_comphid1 = DenseLayer3DInput(
        l_hypo_premwtd_dpout,
        num_units=COMPHIDA,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    l_hypo_comphid1_dpout = lasagne.layers.DropoutLayer(l_hypo_comphid1,
                                                        p=DPOUT,
                                                        rescale=True)
    l_hypo_comphid2 = DenseLayer3DInput(
        l_hypo_comphid1_dpout,
        num_units=COMPHIDB,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_hypowtd_dpout = lasagne.layers.DropoutLayer(l_prem_hypowtd,
                                                       p=DPOUT,
                                                       rescale=True)
    l_prem_comphid1 = DenseLayer3DInput(
        l_prem_hypowtd_dpout,
        num_units=COMPHIDA,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_comphid1_dpout = lasagne.layers.DropoutLayer(l_prem_comphid1,
                                                        p=DPOUT,
                                                        rescale=True)
    l_prem_comphid2 = DenseLayer3DInput(
        l_prem_comphid1_dpout,
        num_units=COMPHIDB,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    # AGGREGATE
    # output dim: (BSIZE, COMPHIDB)
    l_hypo_mean = MeanOverDim(l_hypo_comphid2, mask=l_mask_h, dim=1)
    l_prem_mean = MeanOverDim(l_prem_comphid2, mask=l_mask_p, dim=1)

    l_v1v2 = lasagne.layers.ConcatLayer([l_hypo_mean, l_prem_mean], axis=1)
    l_v1v2_dpout = lasagne.layers.DropoutLayer(l_v1v2, p=DPOUT, rescale=True)

    l_outhid1 = lasagne.layers.DenseLayer(
        l_v1v2_dpout,
        num_units=OUTHID,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid1_dpout = lasagne.layers.DropoutLayer(l_outhid1,
                                                  p=DPOUT,
                                                  rescale=True)

    l_outhid2 = lasagne.layers.DenseLayer(
        l_outhid1_dpout,
        num_units=OUTHID,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    # l_outhid2_dpout = lasagne.layers.DropoutLayer(l_outhid2, p=DPOUT, rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid2,
        num_units=3,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(l_output,
                                                     deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values))

    cost = T.mean(
        T.nnet.categorical_crossentropy(network_output, target_values))
    cost_clean = T.mean(
        T.nnet.categorical_crossentropy(network_output_clean, target_values))

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output)
    if not UPDATEWE:
        all_params.remove(l_hypo_embed.W)

    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(),
                                       numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost, error_rate],
                            updates=updates)
    # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    compute_cost = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost_clean, error_rate_clean])

    # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error

        return set_cost, set_error_rate

    print("Done. Evaluating scratch model ...")
    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" %
          (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm,
                               truth) in enumerate(train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                  1.0 / batches_seen * _error
                if (batches_seen * BSIZE) % 5000 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" %
                          (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_error))
                    start = end

                if (batches_seen * BSIZE) % 100000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    print("***dev cost %f, error %f" %
                          (dev_set_cost, dev_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
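Example #4 is the only snippet that rescales the pretrained word vectors before building the network. The same row normalization in isolation (toy matrix, numpy only):

# Divide each row of the embedding matrix by its L2 norm; the small
# epsilon keeps all-zero rows (e.g. the padding index) from dividing
# by zero.
import numpy

W = numpy.random.randn(10000, 300).astype('float32')  # toy (vocab, dim)
norms = numpy.linalg.norm(W, axis=1).reshape(W.shape[0], 1)
W_unit = W / (norms + 0.00001)
print(numpy.linalg.norm(W_unit, axis=1)[:5])           # ~1.0 each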
Example #5
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")
    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 20),
                              'int32'), numpy.zeros(
                                  (BSIZE, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 20), dtype='int32'), numpy.zeros((BSIZE, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_mask)

    # output shape (BSIZE, None, WEDIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GCLIP)

    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GCLIP,
        backwards=True)

    # output dim: (BSIZE, None, 2*LSTMHID)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)
    l_concat_dpout = lasagne.layers.DropoutLayer(
        l_concat, p=DPOUT, rescale=True)  # might not need this line

    # Attention mechanism to get sentence embedding
    # output dim: (BSIZE, None, ATTHID)
    l_ws1 = DenseLayer3DInput(l_concat_dpout, num_units=ATTHID)
    l_ws1_dpout = lasagne.layers.DropoutLayer(l_ws1, p=DPOUT, rescale=True)

    # output dim: (BSIZE, None, NROW)
    l_ws2 = DenseLayer3DInput(l_ws1_dpout, num_units=NROW, nonlinearity=None)
    l_annotations = Softmax3D(l_ws2, mask=l_mask)
    # output dim: (BSIZE, 2*LSTMHID, NROW)
    l_sentence_embedding = ApplyAttention([l_annotations, l_concat])

    # beam search? Bi lstm in the sentence embedding layer? etc.

    ########### get embeddings for hypothesis and premise ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18),
                              'int32'), numpy.zeros(
                                  (BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 18), dtype='int32'), numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16),
                              'int32'), numpy.zeros(
                                  (BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 16), dtype='int32'), numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_p)

    hypothesis_embedding, hypothesis_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_h.input_var,
            l_mask: l_mask_h.input_var
        })
    premise_embedding, premise_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_p.input_var,
            l_mask: l_mask_p.input_var
        })

    hypothesis_embedding_clean, hypothesis_annotation_clean = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_h.input_var,
            l_mask: l_mask_h.input_var
        },
        deterministic=True)
    premise_embedding_clean, premise_annotation_clean = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_p.input_var,
            l_mask: l_mask_p.input_var
        },
        deterministic=True)

    ########### gated encoder and output MLP ##########
    l_hypo_embed = lasagne.layers.InputLayer(shape=(BSIZE, NROW, 2 * LSTMHID),
                                             input_var=hypothesis_embedding)
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(l_hypo_embed,
                                                     p=DPOUT,
                                                     rescale=True)
    l_pre_embed = lasagne.layers.InputLayer(shape=(BSIZE, NROW, 2 * LSTMHID),
                                            input_var=premise_embedding)
    l_pre_embed_dpout = lasagne.layers.DropoutLayer(l_pre_embed,
                                                    p=DPOUT,
                                                    rescale=True)

    # output dim: (BSIZE, NROW, 2*LSTMHID)
    l_factors = GatedEncoder3D([l_hypo_embed_dpout, l_pre_embed_dpout],
                               num_hfactors=2 * LSTMHID)
    l_factors_dpout = lasagne.layers.DropoutLayer(l_factors,
                                                  p=DPOUT,
                                                  rescale=True)

    # l_hids = DenseLayer3DWeight()

    l_outhid = lasagne.layers.DenseLayer(
        l_factors_dpout,
        num_units=OUTHID,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid_dpout = lasagne.layers.DropoutLayer(l_outhid,
                                                 p=DPOUT,
                                                 rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_dpout,
        num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    accuracy = T.mean(T.eq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(
        l_output, {
            l_hypo_embed: hypothesis_embedding_clean,
            l_pre_embed: premise_embedding_clean
        },
        deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    accuracy_clean = T.mean(T.eq(network_prediction_clean, target_values))

    # penalty term and cost
    attention_penalty = T.mean(
        (T.batched_dot(hypothesis_annotation,
                       hypothesis_annotation.dimshuffle(0, 2, 1)) -
         T.eye(hypothesis_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2)) + T.mean(
            (T.batched_dot(premise_annotation,
                           premise_annotation.dimshuffle(0, 2, 1)) -
             T.eye(premise_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
            axis=(0, 1, 2))

    L2_lstm = ((l_forward.W_in_to_ingate ** 2).sum() + \
               (l_forward.W_hid_to_ingate ** 2).sum() + \
               (l_forward.W_in_to_forgetgate ** 2).sum() + \
               (l_forward.W_hid_to_forgetgate ** 2).sum() + \
               (l_forward.W_in_to_cell ** 2).sum() + \
               (l_forward.W_hid_to_cell ** 2).sum() + \
               (l_forward.W_in_to_outgate ** 2).sum() + \
               (l_forward.W_hid_to_outgate ** 2).sum() + \
               (l_backward.W_in_to_ingate ** 2).sum() + \
               (l_backward.W_hid_to_ingate ** 2).sum() + \
               (l_backward.W_in_to_forgetgate ** 2).sum() + \
               (l_backward.W_hid_to_forgetgate ** 2).sum() + \
               (l_backward.W_in_to_cell ** 2).sum() + \
               (l_backward.W_hid_to_cell ** 2).sum() + \
               (l_backward.W_in_to_outgate ** 2).sum() + \
               (l_backward.W_hid_to_outgate ** 2).sum())
    L2_attention = (l_ws1.W**2).sum() + (l_ws2.W**2).sum()
    L2_gae = (l_factors.Wxf**2).sum() + (l_factors.Wyf**2).sum()
    L2_outputhid = (l_outhid.W**2).sum()
    L2_softmax = (l_output.W**2).sum()
    L2 = L2_lstm + L2_attention + L2_gae + L2_outputhid + L2_softmax

    cost = T.mean(T.nnet.categorical_crossentropy(network_output, target_values)) + \
           L2REG * L2
    cost_clean = T.mean(T.nnet.categorical_crossentropy(network_output_clean, target_values)) + \
                 L2REG * L2
    if ATTPENALTY != 0.:
        cost = cost + ATTPENALTY * attention_penalty
        cost_clean = cost_clean + ATTPENALTY * attention_penalty

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output) + \
                 lasagne.layers.get_all_params(l_sentence_embedding)
    if not UPDATEWE:
        all_params.remove(l_word_embed.W)

    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(),
                                       numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost, accuracy],
                            updates=updates)
    compute_cost = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost_clean, accuracy_clean])
    predict = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var
    ], network_prediction_clean)

    def evaluate(mode, verbose=False):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_accuracy = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _accuracy = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_accuracy = (1.0 - 1.0 / batches_seen) * set_accuracy + \
                             1.0 / batches_seen * _accuracy

        if verbose:
            predicted = []
            truth = []
            for batches_seen, (hypo, hm, premise, pm,
                               th) in enumerate(data, 1):
                predicted.append(predict(hypo, hm, premise, pm))
                truth.append(th)
            truth = numpy.concatenate(truth)
            predicted = numpy.concatenate(predicted)
            cm = confusion_matrix(truth, predicted)
            pr_a = cm.trace() * 1.0 / truth.size
            pr_e = ((cm.sum(axis=0)*1.0/truth.size) * \
                    (cm.sum(axis=1)*1.0/truth.size)).sum()
            k = (pr_a - pr_e) / (1 - pr_e)
            print(mode + " set statistics:")
            print("kappa index of agreement: %f" % k)
            print("confusion matrix:")
            print(cm)

        return set_cost, set_accuracy

    print("Done. Evaluating scratch model ...")
    test_set_cost, test_set_accuracy = evaluate('test', verbose=True)
    print("BEFORE TRAINING: test cost %f, accuracy %f" %
          (test_set_cost, test_set_accuracy))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_accuracy = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm,
                               truth) in enumerate(train_batches, 1):
                _cost, _accuracy = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_accuracy = (1.0 - 1.0 / batches_seen) * train_set_accuracy + \
                                     1.0 / batches_seen * _accuracy
                if batches_seen % 100 == 0:
                    end = time.time()
                    print(
                        "Sample %d %.2fs, lr %.4f, train cost %f, accuracy %f"
                        % (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_accuracy))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_accuracy = evaluate('dev')
                    print("***dev cost %f, accuracy %f" %
                          (dev_set_cost, dev_set_accuracy))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost, dev_set_accuracy = evaluate('dev')
            test_set_cost, test_set_accuracy = evaluate('test', verbose=True)

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         accu: train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_accuracy, dev_set_accuracy, test_set_accuracy))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
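The verbose branch of evaluate() in the last example derives Cohen's kappa by hand from the confusion matrix. A toy check that the manual formula agrees with sklearn's cohen_kappa_score (sklearn is assumed available, since the snippet already relies on its confusion_matrix):

import numpy
from sklearn.metrics import confusion_matrix, cohen_kappa_score

truth = numpy.random.randint(0, 3, size=1000)
predicted = numpy.random.randint(0, 3, size=1000)

cm = confusion_matrix(truth, predicted)
pr_a = cm.trace() * 1.0 / truth.size                   # observed agreement
pr_e = ((cm.sum(axis=0) * 1.0 / truth.size) *
        (cm.sum(axis=1) * 1.0 / truth.size)).sum()     # chance agreement
kappa = (pr_a - pr_e) / (1 - pr_e)
assert numpy.isclose(kappa, cohen_kappa_score(truth, predicted))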