Example #1
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    W_word_embedding = snli.weight / \
        (numpy.linalg.norm(snli.weight, axis=1).reshape(
            snli.weight.shape[0], 1) + 0.00001)
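The division above L2-normalizes each row of the embedding matrix (the small 1e-5 term guards against all-zero rows such as padding). A standalone check of the same operation, on a made-up 5x3 matrix:

import numpy

W = numpy.random.rand(5, 3)                # made-up embedding matrix
W_norm = W / (numpy.linalg.norm(W, axis=1).reshape(W.shape[0], 1) + 0.00001)
print(numpy.linalg.norm(W_norm, axis=1))   # every row norm is now ~1.0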
Example #2
def test():
    from args import conf
    data = SNLI(conf)
    setattr(conf, 'char_vocab_size', len(data.char_vocab))
    setattr(conf, 'word_vocab_size', len(data.TEXT.vocab))
    setattr(conf, 'class_size', len(data.LABEL.vocab))
    setattr(conf, 'max_word_len', data.max_word_len)

    model = BIMPM(conf, data)
    model.load_state_dict(torch.load('results/baseline.pt'))
    model = model.to(conf.device)

    _, acc = evaluate(model, conf, data)
    print(f'test acc: {acc:.3f}')
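evaluate(model, conf, data) is not defined in this snippet. Below is a minimal sketch of a compatible helper, assuming a torchtext-style data.dev_iter whose batches expose .premise, .hypothesis and .label; it is an illustration, not the repository's implementation.

import torch
import torch.nn.functional as F


def evaluate(model, conf, data):
    # Returns (average dev loss, dev accuracy) for a BIMPM-style model.
    model.eval()
    total_loss, n_correct, n_total = 0.0, 0, 0
    with torch.no_grad():
        for batch in data.dev_iter:
            logits = model(batch.premise, batch.hypothesis)
            total_loss += F.cross_entropy(logits, batch.label,
                                          reduction='sum').item()
            n_correct += (logits.argmax(dim=1) == batch.label).sum().item()
            n_total += batch.label.size(0)
    return total_loss / n_total, n_correct / n_total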
Example #3
def main():
    from args import conf
    print('loading SNLI data...')
    data = SNLI(conf)
    setattr(conf, 'char_vocab_size', len(data.char_vocab))
    setattr(conf, 'word_vocab_size', len(data.TEXT.vocab))
    setattr(conf, 'class_size', len(data.LABEL.vocab))
    setattr(conf, 'max_word_len', data.max_word_len)
    setattr(conf, 'model_time', strftime('%H:%M:%S', gmtime()))

    print('training start!')
    best_model = train(conf, data)

    if not os.path.exists('results'):
        os.makedirs('results')
    torch.save(best_model.state_dict(), 'results/baseline.pt')
    print('training finished!')
Example #4
def test():
    # conf and rawr_conf are assumed to come from args, as in the other
    # examples in this file.
    from args import conf, rawr_conf
    data = SNLI(conf)
    conf.char_vocab_size = len(data.char_vocab)
    conf.word_vocab_size = len(data.TEXT.vocab)
    conf.class_size = len(data.LABEL.vocab)
    conf.max_word_len = data.max_word_len

    model = BIMPM(conf, data)
    model.load_state_dict(torch.load('results/baseline.pt'))
    model.word_emb.weight.requires_grad = True
    model = model.to(conf.device).eval()

    batch = next(iter(data.dev_iter))

    output = F.softmax(model(batch.premise, batch.hypothesis), 1)
    original_scores, original_predictions = torch.max(output, 1)
    original_scores = original_scores.detach().cpu().numpy()
    original_predictions = original_predictions.detach().cpu().numpy()

    reduced, removed_indices = get_rawr(
        model,
        batch,
        max_beam_size=rawr_conf.max_beam_size,
        conf_threshold=rawr_conf.conf_threshold,
        p_not_h=False,
    )

    reduced_hypothesis = padding_tensor(
        [torch.LongTensor(r[0]) for r in reduced])
    reduced_hypothesis = reduced_hypothesis.to(conf.device)
    output = F.softmax(model(batch.premise, reduced_hypothesis), 1)  # score the reduced hypotheses
    reduced_scores, reduced_predictions = torch.max(output, 1)
    reduced_scores = reduced_scores.detach().cpu().numpy()
    reduced_predictions = reduced_predictions.detach().cpu().numpy()

    print(all(reduced_predictions == original_predictions))
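padding_tensor() is assumed above; a minimal sketch of what it is taken to do here: right-pad a list of 1-D LongTensors with zeros and stack them into one (batch, max_len) tensor.

import torch


def padding_tensor(sequences, pad_value=0):
    max_len = max(seq.size(0) for seq in sequences)
    out = sequences[0].new_full((len(sequences), max_len), pad_value)
    for i, seq in enumerate(sequences):
        out[i, :seq.size(0)] = seq
    return out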
Example #5
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")
    ########### sentence embedding encoder ###########
    """
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack((numpy.random.randint(1, 10000, (BSIZE, 20), 'int32'),
                                             numpy.zeros((BSIZE, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var)
    
    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack((numpy.ones((BSIZE, 20), dtype='int32'),
                                             numpy.zeros((BSIZE, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_mask)

    # output shape (BSIZE, None, WEDIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)
    """

    ########### input layers ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack((numpy.random.randint(1, 10000, (BSIZE, 18), 'int32'),
                                               numpy.zeros((BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var_h)
    
    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones((BSIZE, 18), dtype='int32'),
                                                numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_mask_h)
    
    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack((numpy.random.randint(1, 10000, (BSIZE, 16), 'int32'),
                                               numpy.zeros((BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var_p)
    
    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones((BSIZE, 16), dtype='int32'),
                                                numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_mask_p)
    ###################################

    # output shape (BSIZE, None, WEDIM)
    l_hypo_embed = lasagne.layers.EmbeddingLayer(
        l_in_h,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)
    
    l_prem_embed = lasagne.layers.EmbeddingLayer(
        l_in_p,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=l_hypo_embed.W)

    # ATTEND
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(l_hypo_embed, p=DPOUT, rescale=True)
    l_hypo_embed_hid1 = DenseLayer3DInput(
        l_hypo_embed_dpout, num_units=EMBDHIDA, nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_hypo_embed_hid1, p=DPOUT, rescale=True)
    l_hypo_embed_hid2 = DenseLayer3DInput(
        l_hypo_embed_hid1_dpout, num_units=EMBDHIDB, nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_embed_dpout = lasagne.layers.DropoutLayer(l_prem_embed, p=DPOUT, rescale=True)
    l_prem_embed_hid1 = DenseLayer3DInput(
        l_prem_embed_dpout, num_units=EMBDHIDA, nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_prem_embed_hid1, p=DPOUT, rescale=True)
    l_prem_embed_hid2 = DenseLayer3DInput(
        l_prem_embed_hid1_dpout, num_units=EMBDHIDB, nonlinearity=lasagne.nonlinearities.rectify)
    
    # output dim: (BSIZE, NROWx, NROWy)
    l_e = ComputeEmbeddingPool([l_hypo_embed_hid2, l_prem_embed_hid2])
    # output dim: (BSIZE, NROWy, DIM)
    l_hypo_weighted = AttendOnEmbedding([l_hypo_embed, l_e], masks=[l_mask_h, l_mask_p], direction='col')
    # output dim: (BSIZE, NROWx, DIM)
    l_prem_weighted = AttendOnEmbedding([l_prem_embed, l_e], masks=[l_mask_h, l_mask_p], direction='row')

    # COMPARE
    # output dim: (BSIZE, NROW, 4*LSTMHID)
    l_hypo_premwtd = lasagne.layers.ConcatLayer([l_hypo_embed, l_prem_weighted], axis=2)
    l_prem_hypowtd = lasagne.layers.ConcatLayer([l_prem_embed, l_hypo_weighted], axis=2)

    l_hypo_premwtd_dpout = lasagne.layers.DropoutLayer(l_hypo_premwtd, p=DPOUT, rescale=True)
    l_hypo_comphid1 = DenseLayer3DInput(
        l_hypo_premwtd_dpout, num_units=COMPHIDA, nonlinearity=lasagne.nonlinearities.rectify)
    
    l_hypo_comphid1_dpout = lasagne.layers.DropoutLayer(l_hypo_comphid1, p=DPOUT, rescale=True)
    l_hypo_comphid2 = DenseLayer3DInput(
        l_hypo_comphid1_dpout, num_units=COMPHIDB, nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_hypowtd_dpout = lasagne.layers.DropoutLayer(l_prem_hypowtd, p=DPOUT, rescale=True)
    l_prem_comphid1 = DenseLayer3DInput(
        l_prem_hypowtd_dpout, num_units=COMPHIDA,
        W=l_hypo_comphid1.W, b=l_hypo_comphid1.b, nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_comphid1_dpout = lasagne.layers.DropoutLayer(l_prem_comphid1, p=DPOUT, rescale=True)
    l_prem_comphid2 = DenseLayer3DInput(
        l_prem_comphid1_dpout, num_units=COMPHIDB,
        W=l_hypo_comphid2.W, b=l_hypo_comphid2.b, nonlinearity=lasagne.nonlinearities.rectify)

    # AGGREGATE
    # output dim: (BSIZE, 4*LSTMHID)
    l_hypo_mean = MeanOverDim(l_hypo_comphid2, mask=l_mask_h, dim=1)
    l_prem_mean = MeanOverDim(l_prem_comphid2, mask=l_mask_p, dim=1)

    l_v1v2 = lasagne.layers.ConcatLayer([l_hypo_mean, l_prem_mean], axis=1)

    l_v1v2_dpout = lasagne.layers.DropoutLayer(l_v1v2, p=DPOUT, rescale=True)
    l_outhid = lasagne.layers.DenseLayer(
        l_v1v2_dpout, num_units=OUTHID, nonlinearity=lasagne.nonlinearities.rectify)

    l_outhid_dpout = lasagne.layers.DropoutLayer(l_outhid, p=DPOUT, rescale=True)
    l_output = lasagne.layers.DenseLayer(
        l_outhid_dpout, num_units=3, nonlinearity=lasagne.nonlinearities.softmax)


    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1,] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))
    
    network_output_clean = lasagne.layers.get_output(l_output, deterministic=True) 
    network_prediction_clean = T.argmax(network_output_clean, axis=1) 
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values)) 

    cost = T.mean(T.nnet.categorical_crossentropy(network_output, target_values))
    cost_clean = T.mean(T.nnet.categorical_crossentropy(network_output_clean, target_values))

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output)
    if not UPDATEWE:
        all_params.remove(l_hypo_embed.W)

    numparams = sum([numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(), numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)
   
    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost, error_rate], updates=updates)
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    compute_cost = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost_clean, error_rate_clean])
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches
        
        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error
        
        return set_cost, set_error_rate
    
    print("Done. Evaluating scratch model ...")
    dev_set_cost,  dev_set_error  = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" % (dev_set_cost,  dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()
            
            for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                  1.0 / batches_seen * _error
                if batches_seen % 100 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f"  % (
                        batches_seen * BSIZE,
                        end - start,
                        LR,
                        train_set_cost,
                        train_set_error))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost,  dev_set_error  = evaluate('dev')
                    print("***dev cost %f, error %f" % (dev_set_cost,  dev_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(all_param_values,
                         open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost,  dev_set_error  = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" % (
                epoch,
                train_set_cost,     dev_set_cost,   test_set_cost,
                train_set_error,    dev_set_error,  test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
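The (1 - 1/k) * avg + (1/k) * x updates inside evaluate() and the training loop above are just a streaming mean over batches. A tiny standalone check of that identity, with made-up per-batch costs:

costs = [0.9, 0.7, 0.8, 0.6]   # made-up numbers
avg = 0.0
for k, c in enumerate(costs, 1):
    avg = (1.0 - 1.0 / k) * avg + 1.0 / k * c
assert abs(avg - sum(costs) / len(costs)) < 1e-12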
Example #6
def main():
    from args import conf, tune_conf
    parser = argparse.ArgumentParser()
    parser.add_argument('--baseline', default='results/baseline.pt')
    parser.add_argument(
        '--ent-train',
        default='/scratch0/shifeng/rawr/new_snli/rawr.train.pkl')
    parser.add_argument('--ent-dev',
                        default='/scratch0/shifeng/rawr/new_snli/rawr.dev.pkl')
    # args.root_dir is used below but was not defined in this snippet;
    # the '.' default here is an assumption.
    parser.add_argument('--root-dir', default='.')
    args = parser.parse_args()

    out_dir = prepare_output_dir(args, args.root_dir)
    log = logging.getLogger(__name__)
    log.setLevel(logging.DEBUG)
    fh = logging.FileHandler(os.path.join(out_dir, 'output.log'))
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='%(asctime)s %(message)s',
                                  datefmt='%m/%d/%Y %I:%M:%S')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    log.addHandler(fh)
    log.addHandler(ch)
    log.info('===== {} ====='.format(out_dir))
    ''' load regular data '''
    log.info('loading regular training data')
    data = SNLI(conf)
    conf.char_vocab_size = len(data.char_vocab)
    conf.word_vocab_size = len(data.TEXT.vocab)
    conf.class_size = len(data.LABEL.vocab)
    conf.max_word_len = data.max_word_len

    log.info('loading entropy dev data {}'.format(tune_conf.ent_dev))
    with open(tune_conf.ent_dev, 'rb') as f:
        ent_dev = pickle.load(f)
    if isinstance(ent_dev[0], list):
        ent_dev = list(itertools.chain(*ent_dev))
    log.info('{} entropy dev examples'.format(len(ent_dev)))
    ent_dev = [[
        x['data']['premise'], x['data']['hypothesis'], x['data']['label']
    ] for x in ent_dev]

    log.info('loading entropy training data {}'.format(tune_conf.ent_train))
    with open(tune_conf.ent_train, 'rb') as f:
        ent_train = pickle.load(f)
    if isinstance(ent_train[0], list):
        ent_train = list(itertools.chain(*ent_train))
    log.info('{} entropy training examples'.format(len(ent_train)))
    ent_train = [[
        x['data']['premise'], x['data']['hypothesis'], x['data']['label']
    ] for x in ent_train]

    train_ent_batches = batchify(ent_train, tune_conf.batch_size)
    log.info('{} entropy training batches'.format(len(train_ent_batches)))

    log.info('loading model from {}'.format(args.baseline))
    model = BIMPM(conf, data)
    model.load_state_dict(torch.load(args.baseline))
    # model.word_emb.weight.requires_grad = True
    model.cuda(conf.gpu)

    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = optim.Adam(parameters, lr=tune_conf.lr)
    ent_optimizer = optim.Adam(parameters, lr=tune_conf.ent_lr)
    criterion = nn.CrossEntropyLoss()

    init_loss, init_acc = evaluate(model, data.dev_iter)
    log.info("initial loss {:.4f} accuracy {:.4f}".format(init_loss, init_acc))
    best_acc = init_acc

    dev_ent_batches = batchify(ent_dev, tune_conf.batch_size)
    init_ent, init_ent_acc = evaluate_ent(model, dev_ent_batches)
    log.info("initial entropy {:.4f} ent_acc {:.4f}".format(
        init_ent, init_ent_acc))

    epoch = 0
    i_ent, i_mle = 0, 0  # number of examples
    train_loss, train_ent = 0, 0
    train_mle_iter = iter(data.train_iter)
    train_ent_iter = iter(train_ent_batches)
    while True:
        model.train()
        for i in range(tune_conf.n_ent):
            try:
                prem, hypo, label = next(train_ent_iter)
            except StopIteration:
                random.shuffle(train_ent_batches)
                train_ent_iter = iter(train_ent_batches)
                i_ent = 0
                train_ent = 0
                break
            output = forward(model, prem, hypo, conf.max_sent_len)
            output = F.softmax(output, 1)
            ent = entropy(output).sum()
            train_ent += ent.data.cpu().numpy()[0]
            loss = -tune_conf.gamma * ent
            ent_optimizer.zero_grad()
            loss.backward()
            ent_optimizer.step()
            i_ent += prem.shape[0]

        end_of_epoch = False
        for i in range(tune_conf.n_mle):
            if i_mle >= len(data.train_iter):
                epoch += 1
                end_of_epoch = True
                data.train_iter.init_epoch()
                train_mle_iter = iter(data.train_iter)
                i_mle = 0
                train_loss = 0
                break
            batch = next(train_mle_iter)
            output = forward(model, batch.premise, batch.hypothesis,
                             conf.max_sent_len)
            loss = criterion(output, batch.label)
            train_loss += loss.data.cpu().numpy()[0]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            i_mle += batch.premise.shape[0]

        if i_mle % 1000 == 0:
            _loss = train_loss / i_mle if i_mle != 0 else 0
            _ent = train_ent / i_ent if i_ent != 0 else 0
            log.info(
                'epoch [{:2}] [{} / {}] loss[{:.5f}] entropy[{:.5f}]'.format(
                    epoch, i_mle, len(data.train_iter), _loss, _ent))

        if end_of_epoch or i_mle % 1e5 == 0:
            dev_loss, dev_acc = evaluate(model, data.dev_iter)
            dev_ent, dev_ent_acc = evaluate_ent(model, dev_ent_batches)
            log.info("dev acc: {:.4f} ent: {:.4f} ent_acc: {:.4f}".format(
                dev_acc, dev_ent, dev_ent_acc))
            model_path = os.path.join(out_dir,
                                      'checkpoint_epoch_{}.pt'.format(epoch))
            torch.save(model.state_dict(), model_path)
            if dev_acc > best_acc:
                best_acc = dev_acc
                model_path = os.path.join(out_dir, 'best_model.pt')
                torch.save(model.state_dict(), model_path)
                log.info("best model saved {}".format(dev_acc))

        if epoch > 40:
            break
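entropy() is another helper assumed by this snippet. A minimal sketch of the quantity being maximized, the per-example Shannon entropy of the softmax output (an illustration, not the repository's exact code):

import torch


def entropy(probs, eps=1e-12):
    # probs: (batch, n_classes), rows already sum to 1
    return -(probs * (probs + eps).log()).sum(dim=1)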
Example #7
def main(num_epochs=NUM_EPOCHS):
    print("Loading data ...")
    snli = SNLI(batch_size=BATCH_SIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")
    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 20), 'int32'), numpy.zeros(
            (50, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                     input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack((numpy.ones(
        (50, 20), dtype='int32'), numpy.zeros((50, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_mask)

    # output shape (BATCH_SIZE, None, WE_DIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)  # how to set it to be non-trainable?

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTM_HIDDEN,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GRAD_CLIP)

    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTM_HIDDEN,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GRAD_CLIP,
        backwards=True)

    # output dim: (BATCH_SIZE, None, 2*LSTM_HIDDEN)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)

    # Attention mechanism to get sentence embedding
    # output dim: (BATCH_SIZE, None, ATTENTION_HIDDEN)
    l_ws1 = DenseLayer3DInput(l_concat, num_units=ATTENTION_HIDDEN)
    # output dim: (BATCH_SIZE, None, N_ROWS)
    l_ws2 = DenseLayer3DInput(l_ws1, num_units=N_ROWS, nonlinearity=None)
    l_annotations = Softmax3D(l_ws2, mask=l_mask)
    # output dim: (BATCH_SIZE, 2*LSTM_HIDDEN, N_ROWS)
    l_sentence_embedding = ApplyAttention([l_annotations, l_concat])

    # beam search? Bi lstm in the sentence embedding layer? etc.

    ########### get embeddings for hypothesis and premise ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 18), 'int32'), numpy.zeros(
            (50, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones(
        (50, 18), dtype='int32'), numpy.zeros((50, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 16), 'int32'), numpy.zeros(
            (50, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones(
        (50, 16), dtype='int32'), numpy.zeros((50, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                         input_var=input_mask_p)

    hypothesis_embedding, hypothesis_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_h.input_var,
            l_mask: l_mask_h.input_var
        })
    premise_embedding, premise_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_p.input_var,
            l_mask: l_mask_p.input_var
        })

    ########### gated encoder and output MLP ##########
    l_hypo_embed = lasagne.layers.InputLayer(shape=(BATCH_SIZE, N_ROWS,
                                                    2 * LSTM_HIDDEN),
                                             input_var=hypothesis_embedding)
    l_pre_embed = lasagne.layers.InputLayer(shape=(BATCH_SIZE, N_ROWS,
                                                   2 * LSTM_HIDDEN),
                                            input_var=premise_embedding)

    # output dim: (BATCH_SIZE, 2*LSTM_HIDDEN, N_ROWS)
    l_factors = GatedEncoder3D([l_hypo_embed, l_pre_embed],
                               num_hfactors=2 * LSTM_HIDDEN)

    # Dropout:
    l_factors_noise = lasagne.layers.DropoutLayer(l_factors,
                                                  p=GAEREG,
                                                  rescale=True)

    # l_hids = DenseLayer3DWeight()

    l_outhid = lasagne.layers.DenseLayer(
        l_factors_noise,
        num_units=OUT_HIDDEN,
        nonlinearity=lasagne.nonlinearities.rectify)

    # Dropout:
    l_outhid_noise = lasagne.layers.DropoutLayer(l_outhid,
                                                 p=GAEREG,
                                                 rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_noise,
        num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([
        1,
    ] * 50, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_output_clean = lasagne.layers.get_output(l_output,
                                                     deterministic=True)

    # penalty term and cost
    attention_penalty = T.mean(
        (
            T.batched_dot(
                hypothesis_annotation,
                # pay attention to this line:
                # T.extra_ops.cpu_contiguous(hypothesis_annotation.dimshuffle(0, 2, 1))
                hypothesis_annotation.dimshuffle(0, 2, 1)) -
            T.eye(hypothesis_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2)
    ) + T.mean(
        (
            T.batched_dot(
                premise_annotation,
                # T.extra_ops.cpu_contiguous(premise_annotation.dimshuffle(0, 2, 1))  # ditto.
                premise_annotation.dimshuffle(0, 2, 1)  # ditto.
            ) - T.eye(premise_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2))

    cost = T.mean(T.nnet.categorical_crossentropy(network_output, target_values) + \
                  ATTENTION_PENALTY * attention_penalty)
    cost_clean = T.mean(T.nnet.categorical_crossentropy(network_output_clean, target_values) + \
                        ATTENTION_PENALTY * attention_penalty)

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output) + \
                 lasagne.layers.get_all_params(l_sentence_embedding)
    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}".format(numparams))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # withoutwe_params = all_params + [l_word_embed.W]

    # Compute updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values))

    train = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost, error_rate],
                            updates=updates)
    compute_cost = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost_clean, error_rate_clean])

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error

        return set_cost, set_error_rate

    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" %
          (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm,
                               truth) in enumerate(train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                       1.0 / batches_seen * _error
                if batches_seen % 100 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" %
                          (batches_seen * BATCH_SIZE, LEARNING_RATE,
                           end - start, train_set_cost, train_set_error))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    test_set_cost, test_set_error = evaluate('test')
                    print("***dev  cost %f, error %f" %
                          (dev_set_cost, dev_set_error))
                    print("***test cost %f, error %f" %
                          (test_set_cost, test_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            # load params
            # all_param_values = cPickle.load(open('params' + os.sep + 'params_' + filename, 'rb'))
            # for p, v in zip(all_params, all_param_values):
            #     p.set_value(v)

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
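attention_penalty above measures how far A·Aᵀ is from the identity for each annotation matrix A (hypothesis and premise), i.e. a mean-squared variant of the ||A·Aᵀ − I|| regularizer commonly used with self-attentive sentence embeddings. A standalone NumPy check of the same quantity, with made-up shapes:

import numpy

rng = numpy.random.RandomState(0)
A = rng.rand(4, 10)                        # (N_ROWS, seq_len), made-up sizes
A = A / A.sum(axis=1, keepdims=True)       # rows behave like softmax weights
penalty = numpy.mean((A.dot(A.T) - numpy.eye(A.shape[0])) ** 2)
print(penalty)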
Example #8
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    W_word_embedding = snli.weight / \
                       (numpy.linalg.norm(snli.weight, axis=1).reshape(snli.weight.shape[0], 1) + \
                        0.00001)
    del snli

    print("Building network ...")
    ########### input layers ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18),
                              'int32'), numpy.zeros(
                                  (BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 18), dtype='int32'), numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16),
                              'int32'), numpy.zeros(
                                  (BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 16), dtype='int32'), numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_p)
    ###################################

    # output shape (BSIZE, None, WEDIM)
    l_hypo_embed = lasagne.layers.EmbeddingLayer(
        l_in_h,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    l_prem_embed = lasagne.layers.EmbeddingLayer(
        l_in_p,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=l_hypo_embed.W)

    # EMBEDDING MAPPING: output shape (BSIZE, None, WEMAP)
    l_hypo_reduced_embed = DenseLayer3DInput(l_hypo_embed,
                                             num_units=WEMAP,
                                             W=init.Normal(),
                                             b=init.Constant(0.),
                                             nonlinearity=None)
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(l_hypo_reduced_embed,
                                                     p=DPOUT,
                                                     rescale=True)
    l_prem_reduced_embed = DenseLayer3DInput(l_prem_embed,
                                             num_units=WEMAP,
                                             W=init.Normal(),
                                             b=init.Constant(0.),
                                             nonlinearity=None)
    l_prem_embed_dpout = lasagne.layers.DropoutLayer(l_prem_reduced_embed,
                                                     p=DPOUT,
                                                     rescale=True)

    # ATTEND
    l_hypo_embed_hid1 = DenseLayer3DInput(
        l_hypo_embed_dpout,
        num_units=EMBDHIDA,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_hypo_embed_hid1,
                                                          p=DPOUT,
                                                          rescale=True)
    l_hypo_embed_hid2 = DenseLayer3DInput(
        l_hypo_embed_hid1_dpout,
        num_units=EMBDHIDB,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_embed_hid1 = DenseLayer3DInput(
        l_prem_embed_dpout,
        num_units=EMBDHIDA,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_embed_hid1_dpout = lasagne.layers.DropoutLayer(l_prem_embed_hid1,
                                                          p=DPOUT,
                                                          rescale=True)
    l_prem_embed_hid2 = DenseLayer3DInput(
        l_prem_embed_hid1_dpout,
        num_units=EMBDHIDB,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    # output dim: (BSIZE, NROWx, NROWy)
    l_e = ComputeEmbeddingPool([l_hypo_embed_hid2, l_prem_embed_hid2])
    # output dim: (BSIZE, NROWy, DIM)
    l_hypo_weighted = AttendOnEmbedding([l_hypo_reduced_embed, l_e],
                                        masks=[l_mask_h, l_mask_p],
                                        direction='col')
    # output dim: (BSIZE, NROWx, DIM)
    l_prem_weighted = AttendOnEmbedding([l_prem_reduced_embed, l_e],
                                        masks=[l_mask_h, l_mask_p],
                                        direction='row')

    # COMPARE
    # output dim: (BSIZE, NROW, 4*LSTMHID)
    l_hypo_premwtd = lasagne.layers.ConcatLayer(
        [l_hypo_reduced_embed, l_prem_weighted], axis=2)
    l_prem_hypowtd = lasagne.layers.ConcatLayer(
        [l_prem_reduced_embed, l_hypo_weighted], axis=2)

    l_hypo_premwtd_dpout = lasagne.layers.DropoutLayer(l_hypo_premwtd,
                                                       p=DPOUT,
                                                       rescale=True)
    l_hypo_comphid1 = DenseLayer3DInput(
        l_hypo_premwtd_dpout,
        num_units=COMPHIDA,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    l_hypo_comphid1_dpout = lasagne.layers.DropoutLayer(l_hypo_comphid1,
                                                        p=DPOUT,
                                                        rescale=True)
    l_hypo_comphid2 = DenseLayer3DInput(
        l_hypo_comphid1_dpout,
        num_units=COMPHIDB,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_hypowtd_dpout = lasagne.layers.DropoutLayer(l_prem_hypowtd,
                                                       p=DPOUT,
                                                       rescale=True)
    l_prem_comphid1 = DenseLayer3DInput(
        l_prem_hypowtd_dpout,
        num_units=COMPHIDA,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_comphid1_dpout = lasagne.layers.DropoutLayer(l_prem_comphid1,
                                                        p=DPOUT,
                                                        rescale=True)
    l_prem_comphid2 = DenseLayer3DInput(
        l_prem_comphid1_dpout,
        num_units=COMPHIDB,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    # AGGREGATE
    # output dim: (BSIZE, 4*LSTMHID)
    l_hypo_mean = MeanOverDim(l_hypo_comphid2, mask=l_mask_h, dim=1)
    l_prem_mean = MeanOverDim(l_prem_comphid2, mask=l_mask_p, dim=1)

    l_v1v2 = lasagne.layers.ConcatLayer([l_hypo_mean, l_prem_mean], axis=1)
    l_v1v2_dpout = lasagne.layers.DropoutLayer(l_v1v2, p=DPOUT, rescale=True)

    l_outhid1 = lasagne.layers.DenseLayer(
        l_v1v2_dpout,
        num_units=OUTHID,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid1_dpout = lasagne.layers.DropoutLayer(l_outhid1,
                                                  p=DPOUT,
                                                  rescale=True)

    l_outhid2 = lasagne.layers.DenseLayer(
        l_outhid1_dpout,
        num_units=OUTHID,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    # l_outhid2_dpout = lasagne.layers.DropoutLayer(l_outhid2, p=DPOUT, rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid2,
        num_units=3,
        W=init.Normal(),
        b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([
        1,
    ] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(l_output,
                                                     deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values))

    cost = T.mean(
        T.nnet.categorical_crossentropy(network_output, target_values))
    cost_clean = T.mean(
        T.nnet.categorical_crossentropy(network_output_clean, target_values))

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output)
    if not UPDATEWE:
        all_params.remove(l_hypo_embed.W)

    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(),
                                       numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost, error_rate],
                            updates=updates)
    # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    compute_cost = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost_clean, error_rate_clean])

    # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error

        return set_cost, set_error_rate

    print("Done. Evaluating scratch model ...")
    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" %
          (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm,
                               truth) in enumerate(train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                  1.0 / batches_seen * _error
                if (batches_seen * BSIZE) % 5000 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" %
                          (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_error))
                    start = end

                if (batches_seen * BSIZE) % 100000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    print("***dev cost %f, error %f" %
                          (dev_set_cost, dev_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
Example #9
def main():
    # conf and rawr_conf are assumed to come from args, as in the other
    # examples in this file.
    from args import conf, rawr_conf
    parser = argparse.ArgumentParser()
    parser.add_argument('--fold', required=True)
    parser.add_argument('--baseline', default='results/baseline.pt')
    parser.add_argument('--pnoth',
                        default=False,
                        action='store_true',
                        help='reduce premise instead of hypothesis')
    parser.add_argument('--truth',
                        default=False,
                        action='store_true',
                        help='use label instead of prediction as target')
    args = parser.parse_args()

    data = SNLI(conf)
    conf.char_vocab_size = len(data.char_vocab)
    conf.word_vocab_size = len(data.TEXT.vocab)
    conf.class_size = len(data.LABEL.vocab)
    conf.max_word_len = data.max_word_len
    q_vocab = data.TEXT.vocab.itos
    a_vocab = data.LABEL.vocab.itos

    out_dir = prepare_output_dir(conf, 'results', 'rawr')
    print('Generating [{}] rawr data from [{}].'.format(
        args.fold, args.baseline))
    print(out_dir)

    model = BIMPM(conf, data)
    model.load_state_dict(torch.load(args.baseline))
    model.word_emb.weight.requires_grad = True
    model.to(conf.device)

    datasets = {'train': data.train_iter, 'dev': data.dev_iter}

    if args.pnoth:
        fname = 'rawr.{}.premise.pkl'.format(args.fold)
    else:
        fname = 'rawr.{}.hypothesis.pkl'.format(args.fold)

    checkpoint = []
    for batch_i, batch in enumerate(tqdm(datasets[args.fold])):
        if batch_i > len(datasets[args.fold]):
            # otherwise train iter will loop forever!
            break
        batch_size = batch.hypothesis.shape[0]
        model.eval()
        output = F.softmax(model(batch.premise, batch.hypothesis), 1)
        original_scores, original_predictions = torch.max(output, 1)
        original_scores = original_scores.detach().cpu().numpy()
        original_predictions = original_predictions.detach().cpu().numpy()
        batch_cpu = Batch(batch.premise.data.cpu(),
                          batch.hypothesis.data.cpu(), batch.label.data.cpu())

        reduced, removed_indices = get_rawr(
            model,
            batch,
            max_beam_size=rawr_conf.max_beam_size,
            conf_threshold=rawr_conf.conf_threshold,
            p_not_h=args.pnoth)
        for i in range(batch_size):
            og = {
                'premise': batch_cpu.premise[i],
                'hypothesis': batch_cpu.hypothesis[i],
                'premise_readable': to_text(batch_cpu.premise[i], q_vocab),
                'hypothesis_readable': to_text(batch_cpu.hypothesis[i],
                                               q_vocab),
                'prediction': original_predictions[i],
                'prediction_readable': a_vocab[original_predictions[i]],
                'score': original_scores[i],
                'label': batch_cpu.label[i],
                'label_readable': a_vocab[batch_cpu.label[i]]
            }
            checkpoint.append({'original': og, 'reduced': []})
            s1 = batch.hypothesis[i] if args.pnoth else batch.premise[i]
            s1 = s1.to(conf.device)
            for j, s2 in enumerate(reduced[i]):
                s2 = torch.LongTensor(s2).to(conf.device)
                model.eval()
                if args.pnoth:
                    output = model(s2.unsqueeze(0), s1.unsqueeze(0))
                else:
                    output = model(s1.unsqueeze(0), s2.unsqueeze(0))
                output = F.softmax(output, 1)
                pred_scores, pred = torch.max(output, 1)
                pred = pred.detach().cpu().numpy()[0]
                pred_scores = pred_scores.detach().cpu().numpy()[0]
                if args.pnoth:
                    hypo, prem = s1.cpu(), s2.cpu()
                else:
                    prem, hypo = s1.cpu(), s2.cpu()
                checkpoint[-1]['reduced'].append({
                    'premise': prem,
                    'hypothesis': hypo,
                    'premise_readable': to_text(prem, q_vocab),
                    'hypothesis_readable': to_text(hypo, q_vocab),
                    'prediction': pred,
                    'prediction_readable': a_vocab[pred],
                    'score': pred_scores,
                    'label': batch_cpu.label[i],
                    'label_readable': a_vocab[batch_cpu.label[i]],
                    'removed_indices': removed_indices[i][j],
                    'which_reduced': 'premise' if args.pnoth else 'hypothesis',
                })
        if batch_i % 1000 == 0 and batch_i > 0:
            out_path = os.path.join(out_dir, '{}.{}'.format(fname, batch_i))
            with open(out_path, 'wb') as f:
                pickle.dump(checkpoint, f)
            checkpoint = []

    if len(checkpoint) > 0:
        out_path = os.path.join(out_dir, '{}.{}'.format(fname, batch_i))
        with open(out_path, 'wb') as f:
            pickle.dump(checkpoint, f)
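to_text() is assumed in this snippet; a minimal sketch of the readable-text conversion, assuming an itos-style vocab list and torchtext's default <pad> index of 1 (both assumptions):

def to_text(ids, itos, pad_idx=1):
    return ' '.join(itos[int(i)] for i in ids if int(i) != pad_idx)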
Example #10
def main():
    from args import conf

    parser = argparse.ArgumentParser()
    parser.add_argument('--train', default='results/rawr.train.hypothesis.pkl')
    parser.add_argument('--dev', default='results/rawr.dev.hypothesis.pkl')
    parser.add_argument('--truth',
                        default=False,
                        action='store_true',
                        help='use label instead of prediction as target')
    parser.add_argument('--ogdev',
                        default=False,
                        action='store_true',
                        help='use original dev set instead of reduced')
    parser.add_argument('--full',
                        default=0,
                        type=float,
                        help='amount of full examples to include')
    args = parser.parse_args()

    conf.train_data = args.train
    conf.dev_data = args.dev

    print('loading regular data...')
    regular_data = SNLI(conf)
    conf.char_vocab_size = len(regular_data.char_vocab)
    conf.word_vocab_size = len(regular_data.TEXT.vocab)
    conf.class_size = len(regular_data.LABEL.vocab)
    conf.max_word_len = regular_data.max_word_len
    conf.out_dir = prepare_output_dir(conf, 'results', 'reduced')

    print('loading reduced data from [{}]'.format(conf.train_data))
    with open(conf.train_data, 'rb') as f:
        train = pickle.load(f)
    print('loading reduced data from [{}]'.format(conf.dev_data))
    with open(conf.dev_data, 'rb') as f:
        dev = pickle.load(f)

    train_label = 'label' if args.truth else 'prediction'
    train = [(x['premise'], x['hypothesis'], ex['original'][train_label])
             for ex in train for x in ex['reduced']]
    # dev = [(x['premise'], x['hypothesis'], x['label'])
    #        for ex in dev for x in ex['reduced']]
    dev = [(x['premise'], x['hypothesis'], x['label']) for ex in dev
           for x in ex['reduced'][:1]]

    train_batches = batchify(train, conf.batch_size)

    if args.full > 0:
        n_examples = int(len(regular_data.train_iter) * args.full)
        print('use {} ({}) full training data'.format(
            n_examples * conf.batch_size, args.full))
        full_batches = []
        for j, x in enumerate(regular_data.train_iter):
            if j > n_examples:
                break
            full_batches.append((x.premise, x.hypothesis, x.label))
        # train_batches += full_batches
        train_batches = full_batches

    print(len(train_batches))

    if args.ogdev:
        dev_batches = list(regular_data.dev_iter)
        dev_batches = [(x.premise, x.hypothesis, x.label) for x in dev_batches]
    else:
        dev_batches = batchify(dev, conf.batch_size)  # reduced dev examples loaded above

    model = BIMPM(conf, regular_data)
    if conf.gpu > -1:
        model.cuda(conf.gpu)

    print('begin training')
    best_model = train_reduced(model, train_batches, dev_batches, conf)

    torch.save(best_model.state_dict(), os.path.join(conf.out_dir, 'best.pt'))
    print('training finished!')
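batchify() is assumed by this and earlier snippets; a minimal sketch that chunks (premise, hypothesis, label) triples into padded LongTensor batches. The pad index of 1 (torchtext's default <pad>) is an assumption.

import torch


def batchify(examples, batch_size, pad_idx=1):
    def pad_stack(seqs):
        # Right-pad variable-length id sequences and stack into (batch, max_len).
        max_len = max(len(s) for s in seqs)
        out = torch.full((len(seqs), max_len), pad_idx, dtype=torch.long)
        for i, s in enumerate(seqs):
            out[i, :len(s)] = torch.as_tensor(s, dtype=torch.long)
        return out

    batches = []
    for start in range(0, len(examples), batch_size):
        chunk = examples[start:start + batch_size]
        prem = pad_stack([x[0] for x in chunk])
        hypo = pad_stack([x[1] for x in chunk])
        label = torch.as_tensor([int(x[2]) for x in chunk], dtype=torch.long)
        batches.append((prem, hypo, label))
    return batches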
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")
    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
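    # .tag.test_value is only used when theano.config.compute_test_value is
    # enabled; it lets Theano shape-check the graph eagerly while it is built.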
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 20),
                              'int32'), numpy.zeros(
                                  (BSIZE, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 20), dtype='int32'), numpy.zeros((BSIZE, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_mask)

    # output shape (BSIZE, None, WEDIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    # bidirectional LSTM
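    # The forward and backward LSTMs read the same masked embedding sequence
    # in opposite directions; concatenating their hidden states below gives
    # each token a 2*LSTMHID-dimensional, context-aware representation.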
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GCLIP)

    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GCLIP,
        backwards=True)

    # output dim: (BSIZE, None, 2*LSTMHID)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)
    l_concat_dpout = lasagne.layers.DropoutLayer(
        l_concat, p=DPOUT, rescale=True)  # might not need this line

    # Attention mechanism to get sentence embedding
    # output dim: (BSIZE, None, ATTHID)
    l_ws1 = DenseLayer3DInput(l_concat_dpout, num_units=ATTHID)
    l_ws1_dpout = lasagne.layers.DropoutLayer(l_ws1, p=DPOUT, rescale=True)

    # output dim: (BSIZE, None, NROW)
    l_ws2 = DenseLayer3DInput(l_ws1_dpout, num_units=NROW, nonlinearity=None)
    l_annotations = Softmax3D(l_ws2, mask=l_mask)
    # output dim: (BSIZE, NROW, 2*LSTMHID)
    l_sentence_embedding = ApplyAttention([l_annotations, l_concat])
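    # Each of the NROW attention rows is a masked, softmax-normalised
    # distribution over time steps; applying them to the BiLSTM states yields
    # NROW weighted sums, so every sentence is encoded as an
    # NROW x (2*LSTMHID) matrix rather than a single vector.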

    # beam search? Bi lstm in the sentence embedding layer? etc.

    ########### get embeddings for hypothesis and premise ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18),
                              'int32'), numpy.zeros(
                                  (BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 18), dtype='int32'), numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16),
                              'int32'), numpy.zeros(
                                  (BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 16), dtype='int32'), numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_p)

    hypothesis_embedding, hypothesis_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_h.input_var,
            l_mask: l_mask_h.input_var
        })
    premise_embedding, premise_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_p.input_var,
            l_mask: l_mask_p.input_var
        })

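    # The *_clean variants rebuild the same outputs with deterministic=True,
    # which disables dropout; they are used for validation and testing below.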
    hypothesis_embedding_clean, hypothesis_annotation_clean = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_h.input_var,
            l_mask: l_mask_h.input_var
        },
        deterministic=True)
    premise_embedding_clean, premise_annotation_clean = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_p.input_var,
            l_mask: l_mask_p.input_var
        },
        deterministic=True)

    ########### gated encoder and output MLP ##########
    l_hypo_embed = lasagne.layers.InputLayer(shape=(BSIZE, NROW, 2 * LSTMHID),
                                             input_var=hypothesis_embedding)
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(l_hypo_embed,
                                                     p=DPOUT,
                                                     rescale=True)
    l_pre_embed = lasagne.layers.InputLayer(shape=(BSIZE, NROW, 2 * LSTMHID),
                                            input_var=premise_embedding)
    l_pre_embed_dpout = lasagne.layers.DropoutLayer(l_pre_embed,
                                                    p=DPOUT,
                                                    rescale=True)

    # output dim: (BSIZE, NROW, 2*LSTMHID)
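    # GatedEncoder3D combines the two sentence matrices multiplicatively:
    # each side is projected by its own factor weights (Wxf / Wyf, regularised
    # in the L2 term below) and the projections interact elementwise.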
    l_factors = GatedEncoder3D([l_hypo_embed_dpout, l_pre_embed_dpout],
                               num_hfactors=2 * LSTMHID)
    l_factors_dpout = lasagne.layers.DropoutLayer(l_factors,
                                                  p=DPOUT,
                                                  rescale=True)

    # l_hids = DenseLayer3DWeight()

    l_outhid = lasagne.layers.DenseLayer(
        l_factors_dpout,
        num_units=OUTHID,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid_dpout = lasagne.layers.DropoutLayer(l_outhid,
                                                 p=DPOUT,
                                                 rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_dpout,
        num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    accuracy = T.mean(T.eq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(
        l_output, {
            l_hypo_embed: hypothesis_embedding_clean,
            l_pre_embed: premise_embedding_clean
        },
        deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    accuracy_clean = T.mean(T.eq(network_prediction_clean, target_values))

    # penalty term and cost
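    # Penalise A*A^T deviating from the identity for both annotation matrices
    # A, which pushes the NROW attention hops to focus on different parts of
    # the sentence (the Frobenius-style penalty used with structured
    # self-attention).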
    attention_penalty = T.mean(
        (T.batched_dot(hypothesis_annotation,
                       hypothesis_annotation.dimshuffle(0, 2, 1)) -
         T.eye(hypothesis_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2)) + T.mean(
            (T.batched_dot(premise_annotation,
                           premise_annotation.dimshuffle(0, 2, 1)) -
             T.eye(premise_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
            axis=(0, 1, 2))

    L2_lstm = ((l_forward.W_in_to_ingate ** 2).sum() + \
               (l_forward.W_hid_to_ingate ** 2).sum() + \
               (l_forward.W_in_to_forgetgate ** 2).sum() + \
               (l_forward.W_hid_to_forgetgate ** 2).sum() + \
               (l_forward.W_in_to_cell ** 2).sum() + \
               (l_forward.W_hid_to_cell ** 2).sum() + \
               (l_forward.W_in_to_outgate ** 2).sum() + \
               (l_forward.W_hid_to_outgate ** 2).sum() + \
               (l_backward.W_in_to_ingate ** 2).sum() + \
               (l_backward.W_hid_to_ingate ** 2).sum() + \
               (l_backward.W_in_to_forgetgate ** 2).sum() + \
               (l_backward.W_hid_to_forgetgate ** 2).sum() + \
               (l_backward.W_in_to_cell ** 2).sum() + \
               (l_backward.W_hid_to_cell ** 2).sum() + \
               (l_backward.W_in_to_outgate ** 2).sum() + \
               (l_backward.W_hid_to_outgate ** 2).sum())
    L2_attention = (l_ws1.W**2).sum() + (l_ws2.W**2).sum()
    L2_gae = (l_factors.Wxf**2).sum() + (l_factors.Wyf**2).sum()
    L2_outputhid = (l_outhid.W**2).sum()
    L2_softmax = (l_output.W**2).sum()
    L2 = L2_lstm + L2_attention + L2_gae + L2_outputhid + L2_softmax

    cost = T.mean(T.nnet.categorical_crossentropy(network_output, target_values)) + \
           L2REG * L2
    cost_clean = T.mean(T.nnet.categorical_crossentropy(network_output_clean, target_values)) + \
                 L2REG * L2
    if ATTPENALTY != 0.:
        cost = cost + ATTPENALTY * attention_penalty
        cost_clean = cost_clean + ATTPENALTY * attention_penalty

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output) + \
                 lasagne.layers.get_all_params(l_sentence_embedding)
    if not UPDATEWE:
        all_params.remove(l_word_embed.W)

    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(),
                                       numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost, accuracy],
                            updates=updates)
    compute_cost = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost_clean, accuracy_clean])
    predict = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var
    ], network_prediction_clean)

    def evaluate(mode, verbose=False):
        if mode == 'dev':
            data = dev_batches
        elif mode == 'test':
            data = test_batches
        else:
            raise ValueError('unknown evaluation mode: ' + mode)

        set_cost = 0.
        set_accuracy = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _accuracy = compute_cost(hypo, hm, premise, pm, truth)
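            # incremental mean: after n batches, set_cost / set_accuracy hold
            # the average cost / accuracy over the first n batches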
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_accuracy = (1.0 - 1.0 / batches_seen) * set_accuracy + \
                             1.0 / batches_seen * _accuracy

        if verbose:
            predicted = []
            truth = []
            for batches_seen, (hypo, hm, premise, pm,
                               th) in enumerate(data, 1):
                predicted.append(predict(hypo, hm, premise, pm))
                truth.append(th)
            truth = numpy.concatenate(truth)
            predicted = numpy.concatenate(predicted)
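            # Cohen's kappa: pr_a is the observed agreement (accuracy), pr_e
            # the agreement expected by chance from the confusion-matrix
            # marginals, and kappa = (pr_a - pr_e) / (1 - pr_e).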
            cm = confusion_matrix(truth, predicted)
            pr_a = cm.trace() * 1.0 / truth.size
            pr_e = ((cm.sum(axis=0)*1.0/truth.size) * \
                    (cm.sum(axis=1)*1.0/truth.size)).sum()
            k = (pr_a - pr_e) / (1 - pr_e)
            print(mode + " set statistics:")
            print("kappa index of agreement: %f" % k)
            print("confusion matrix:")
            print(cm)

        return set_cost, set_accuracy

    print("Done. Evaluating scratch model ...")
    test_set_cost, test_set_accuracy = evaluate('test', verbose=True)
    print("BEFORE TRAINING: dev cost %f, accuracy %f" %
          (test_set_cost, test_set_accuracy))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_accuracy = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm,
                               truth) in enumerate(train_batches, 1):
                _cost, _accuracy = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_accuracy = (1.0 - 1.0 / batches_seen) * train_set_accuracy + \
                                  1.0 / batches_seen * _accuracy
                if batches_seen % 100 == 0:
                    end = time.time()
                    print(
                        "Sample %d %.2fs, lr %.4f, train cost %f, accuracy %f"
                        % (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_accuracy))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_accuracy = evaluate('dev')
                    print("***dev cost %f, accuracy %f" %
                          (dev_set_cost, dev_set_accuracy))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost, dev_set_accuracy = evaluate('dev')
            test_set_cost, test_set_accuracy = evaluate('test', verbose=True)

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         accu: train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_accuracy, dev_set_accuracy, test_set_accuracy))
    except KeyboardInterrupt:
        # drop into the debugger on Ctrl-C so the training state can be inspected
        pdb.set_trace()