Exemplo n.º 1
0
        def process_one_instance(instance,
                                 update=True,
                                 x_y_vectors=None,
                                 features=None,
                                 mode='train'):
            """Assemble the input vector for a single instance.

            Which parts are included is controlled by the flags in
            ``self.opt``: the count-weighted average path embedding
            (``use_path``), the x/y lemma embeddings (``use_xy_embeddings``),
            extra feature embeddings (``use_features``) and a height
            embedding (``use_height_ebd``). ``self``, ``feat_dims``, ``j`` and
            ``tree`` are taken from the enclosing scope.

            :param instance: mapping from path to count; when it is empty a
                single ``EMPTY_PATH`` entry is added to it in place
            :param update: forwarded to ``get_path_embedding_from_cache``
            :param x_y_vectors: pair of indices into the lemma lookup table,
                read only when ``use_xy_embeddings`` is set
            :param features: mapping from feature name to lookup index, read
                only when ``use_features`` is set
            :param mode: forwarded to ``get_path_embedding_from_cache``
            :return: the concatenated dynet expression
            """
            lemma_lookup = self.model_parameters['lemma_lookup']
            if self.opt['use_path']:
                pos_lookup = self.model_parameters['pos_lookup']
                dep_lookup = self.model_parameters['dep_lookup']
                dir_lookup = self.model_parameters['dir_lookup']
                # Add the empty path so the average below is never taken over
                # zero paths.
                paths = instance
                if not paths:
                    paths[EMPTY_PATH] = 1

                # Compute the averaged path. ``values()``/``items()`` and
                # ``sum`` work on both Python 2 and Python 3, unlike
                # ``itervalues()``/``iteritems()`` and the ``reduce`` builtin.
                num_paths = sum(paths.values())
                path_embeddings = [
                    self.get_path_embedding_from_cache(
                        lemma_lookup, pos_lookup, dep_lookup, dir_lookup, path,
                        update, mode) * count
                    for path, count in paths.items()
                ]
                input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)

            # Concatenate x and y embeddings
            if self.opt['use_xy_embeddings']:
                x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
                y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
                if self.opt['use_path']:
                    input_vec = dy.concatenate([x_vector, input_vec, y_vector])
                else:
                    input_vec = dy.concatenate([x_vector, y_vector])
            # NOTE(review): the branches below read ``input_vec``, so at least
            # one of use_path / use_xy_embeddings must be enabled - confirm
            # that the configuration guarantees this.
            if self.opt['use_features']:
                for k in feat_dims:
                    # Features whose name contains 'diff' are skipped unless
                    # use_freq_features is enabled.
                    if 'diff' in k and not self.opt['use_freq_features']:
                        continue
                    feat = dy.lookup(self.model_parameters[k], features[k])
                    input_vec = dy.concatenate([input_vec, feat])

            if self.opt['use_height_ebd']:
                # Terms missing from tree.term_height fall back to index 0.
                if j in tree.term_height:
                    h = tree.get_height(j) - 1
                else:
                    h = 0
                height_vector = dy.lookup(
                    self.model_parameters['height_lookup'], h)
                input_vec = dy.concatenate([input_vec, height_vector])
            return input_vec
Exemplo n.º 2
0
def word_dropout(lookup_table, word, rate, update=True):
    """
    Look up a word, replacing it by index 0 with probability ``rate``
    :param lookup_table: the lookup table passed to ``dy.lookup``
    :param word: the word index to look up
    :param rate: dropout rate (probability of using index 0 instead)
    :param update: forwarded to ``dy.lookup``
    :return: the result of ``dy.lookup`` for the sampled index
    """
    candidates = [word, 0]
    probabilities = [1 - rate, rate]
    sampled = np.random.choice(candidates, size=1, p=probabilities)
    return dy.lookup(lookup_table, sampled[0], update)
Exemplo n.º 3
0
    def set_initial_states(self, x):
        """Embed the source tokens and cache the encoder-side expressions.

        Always stores the ``self.F`` lookups of ``x`` in ``self.xt_embs``.
        For the 'bow' encoder it additionally stores ``self.W_enc``; for the
        'attention' encoder it stores ``self.xb`` and ``self.xt``. Any other
        encoder type leaves only ``self.xt_embs`` set.
        """
        self.xt_embs = [dy.lookup(self.F, token) for token in x]
        kind = self.encoder_type

        if kind == 'bow':
            self.W_enc = self.W * dy.average(self.xt_embs)
            return

        if kind != 'attention':
            return

        n_tokens = len(x)
        smoothed = []
        for pos in range(n_tokens):
            # Window of up to q tokens on each side, clipped to the sentence.
            lo = max(pos - self.q, 0)
            hi = min(n_tokens, pos + self.q + 1)
            # The window sum is scaled by q, not by the window length.
            smoothed.append(dy.esum(self.xt_embs[lo:hi]) / self.q)
        self.xb = dy.concatenate(smoothed, d=1)
        self.xt = dy.transpose(dy.concatenate(self.xt_embs, d=1))
Exemplo n.º 4
0
    def __call__(self, x=None, t=None, test=False):
        """Score the next word for one context (test) or every window (train).

        With ``test=True`` only ``t`` is used: it is treated as one context,
        the encoder-side expressions stored on ``self`` (``W_enc`` or
        ``xt``/``xb``) are reused, and a single softmax expression is
        returned. Otherwise ``x`` is embedded here and a list of un-normalised
        score expressions is returned, one per window of ``self.c``
        consecutive tokens of ``t``.
        """
        if test:
            context_embs = [dy.lookup(self.E, tok) for tok in t]
            kind = self.encoder_type

            if kind == 'bow':
                # Neural language model followed by softmax
                hidden = dy.tanh(self.U * dy.concatenate(context_embs))
                y_t = dy.softmax(self.V * hidden + self.W_enc)

            elif kind == 'attention':
                attn_embs = [dy.lookup(self.G, tok) for tok in t]

                # Neural language model
                hidden = dy.tanh(self.U * dy.concatenate(context_embs))

                # Attention weights over the source, then the context vector
                weights = dy.softmax(
                    self.xt * self.P * dy.concatenate(attn_embs))
                context = self.xb * weights

                # Output with softmax
                y_t = dy.softmax(self.V * hidden + self.W * context)

            return y_t

        source_embs = [dy.lookup(self.F, tok) for tok in x]
        target_embs = [dy.lookup(self.E, tok) for tok in t]
        kind = self.encoder_type

        outputs = []
        if kind == 'bow':
            # Bag-of-words encoding, shared by every window
            encoded = self.W * dy.average(source_embs)
            for start in range(len(t) - self.c + 1):
                window = dy.concatenate(target_embs[start:start + self.c])
                hidden = dy.tanh(self.U * window)
                # Output without softmax
                outputs.append(self.V * hidden + encoded)

        elif kind == 'attention':
            n_source = len(x)
            smoothed = []
            for pos in range(n_source):
                lo = max(pos - self.q, 0)
                hi = min(n_source, pos + self.q + 1)
                smoothed.append(dy.esum(source_embs[lo:hi]) / self.q)
            xb = dy.concatenate(smoothed, d=1)
            xt = dy.transpose(dy.concatenate(source_embs, d=1))
            attn_embs = [dy.lookup(self.G, tok) for tok in t]

            for start in range(len(t) - self.c + 1):
                stop = start + self.c

                # Neural language model
                hidden = dy.tanh(
                    self.U * dy.concatenate(target_embs[start:stop]))

                # Attention over the source for this window
                attn_window = dy.concatenate(attn_embs[start:stop])
                weights = dy.softmax(xt * self.P * attn_window)
                context = xb * weights

                # Output without softmax
                outputs.append(self.V * hidden + self.W * context)

        return outputs
Exemplo n.º 5
0
def process_one_instance(builder,
                         model,
                         model_parameters,
                         instance,
                         path_cache,
                         update=True,
                         dropout=0.0,
                         x_y_vectors=None,
                         num_hidden_layers=0):
    """
    Return the classifier output for a single term-pair: the count-weighted
    average path embedding (optionally with the x and y embeddings around it)
    is fed through one or two affine layers followed by a softmax
    :param builder: the LSTM builder
    :param model: the LSTM model (not used in this function)
    :param model_parameters: the model parameters
    :param instance: a Counter object with paths; when it is empty a single
        EMPTY_PATH entry is added to it in place
    :param path_cache: the cache for path embeddings
    :param update: whether to update the lemma embeddings
    :param dropout: word dropout rate
    :param x_y_vectors: the current word vectors for x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair
        classification network; W2/b2 are used only when this is 1
    :return: the softmax output expression for the term-pair
    """
    W1 = dy.parameter(model_parameters['W1'])
    b1 = dy.parameter(model_parameters['b1'])
    W2 = None
    b2 = None

    if num_hidden_layers == 1:
        W2 = dy.parameter(model_parameters['W2'])
        b2 = dy.parameter(model_parameters['b2'])

    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']

    # Add the empty path so the average below is never taken over zero paths
    paths = instance
    if not paths:
        paths[EMPTY_PATH] = 1

    # Compute the averaged path. values()/items() and sum() work on both
    # Python 2 and Python 3, unlike itervalues()/iteritems() and the reduce
    # builtin.
    num_paths = sum(paths.values())
    path_embeddings = [
        get_path_embedding_from_cache(path_cache, builder, lemma_lookup,
                                      pos_lookup, dep_lookup, dir_lookup, path,
                                      update, dropout) * count
        for path, count in paths.items()
    ]
    input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)

    # Concatenate x and y embeddings
    if x_y_vectors is not None:
        x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
        y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
        input_vec = dy.concatenate([x_vector, input_vec, y_vector])

    # Feed the vector to the MLP
    h = W1 * input_vec + b1

    if num_hidden_layers == 1:
        h = W2 * dy.tanh(h) + b2

    return dy.softmax(h)
Exemplo n.º 6
0
def main():
    """Train the selective-encoding summarization model.

    Parses the command-line options, configures DyNet, builds the dataset
    and the encoder/decoder, then for every epoch runs one training pass and
    one validation pass, prints the mean losses and elapsed time, and writes
    the model (``./model_e<epoch>``) and the vocabulary dumps.
    """
    parser = argparse.ArgumentParser(description='Selective Encoding for Abstractive Sentence Summarization in DyNet')

    parser.add_argument('--gpu', type=str, default='0', help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3, help='Number of epochs [default: 3]')
    parser.add_argument('--n_train', type=int, default=3803957, help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651, help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32, help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=124404, help='Vocabulary size [default: 124404]')
    parser.add_argument('--emb_dim', type=int, default=256, help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256, help='Hidden state size [default: 256]')
    parser.add_argument('--maxout_dim', type=int, default=2, help='Maxout size [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=10000, help='Amount of memory to allocate [mb] [default: 10000]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS   = args.n_epochs
    N_TRAIN    = args.n_train
    N_VALID    = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM    = args.emb_dim
    HID_DIM    = args.hid_dim
    MAXOUT_DIM = args.maxout_dim
    ALLOC_MEM  = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset
    dataset = Dataset(
        TRAIN_X_FILE,
        TRAIN_Y_FILE,
        VALID_X_FILE,
        VALID_Y_FILE,
        vocab_size=VOCAB_SIZE,
        batch_size=BATCH_SIZE,
        n_train=N_TRAIN,
        n_valid=N_VALID
    )
    # The vocabulary actually built replaces the requested size.
    VOCAB_SIZE = len(dataset.w2i)
    print('VOCAB_SIZE', VOCAB_SIZE)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    encoder = SelectiveBiGRU(model, EMB_DIM, HID_DIM)
    decoder = AttentionalGRU(model, EMB_DIM, HID_DIM, MAXOUT_DIM, VOCAB_SIZE)

    def minibatch_loss(x_mb, y_mb):
        """Build a fresh computation graph and return the mean loss of one mini batch."""
        dy.renew_cg()
        associate_parameters([encoder, decoder])
        losses = []
        for x, t in zip(x_mb, y_mb):
            # Decoder input is the target without its last token; the
            # expected output is the target without its first token.
            t_in, t_out = t[:-1], t[1:]

            # Encoder
            x_embs = [dy.lookup(V, x_t) for x_t in x]
            hp, hb_1 = encoder(x_embs)

            # Decoder
            decoder.set_initial_states(hp, hb_1)
            t_embs = [dy.lookup(V, t_t) for t_t in t_in]
            y = decoder(t_embs)

            # Loss: summed negative log-likelihood over the sequence
            losses.append(dy.esum(
                [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
            ))
        return dy.average(losses)

    # Train model
    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        dataset.reset_train_iter()
        for train_x_mb, train_y_mb in tqdm(dataset.train_iter):
            mb_loss = minibatch_loss(train_x_mb, train_y_mb)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid: forward pass only, no parameter update
        loss_all_valid = []
        dataset.reset_valid_iter()
        for valid_x_mb, valid_y_mb in dataset.valid_iter:
            mb_loss = minibatch_loss(valid_x_mb, valid_y_mb)
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f, Time: %.3f[s]' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid),
            time.time()-start_time
        ))

        # Save model
        dy.save('./model_e'+str(epoch+1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(dataset.w2i, f_w2i)
            pickle.dump(dataset.i2w, f_i2w)
Exemplo n.º 7
0
def main():
    """Decode a test file with beam search using a saved model.

    Parses the command-line options, configures DyNet, loads the vocabulary
    dumps and the saved ``[V, encoder, decoder]`` triple, runs beam search of
    width ``--beam_size`` for at most ``--max_len`` steps on every input
    sentence, and writes one decoded line per sentence to the output file.
    """
    parser = argparse.ArgumentParser(
        description=
        'Deep Recurrent Generative Decoder for Abstractive Text Summarization in DyNet'
    )

    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_test',
                        type=int,
                        default=189651,
                        help='Number of test examples [default: 189651]')
    parser.add_argument('--beam_size',
                        type=int,
                        default=5,
                        help='Beam size [default: 5]')
    parser.add_argument('--max_len',
                        type=int,
                        default=100,
                        help='Maximum length of decoding [default: 100]')
    parser.add_argument('--model_file',
                        type=str,
                        default='./model_e1',
                        help='Trained model file path [default: ./model_e1]')
    parser.add_argument(
        '--input_file',
        type=str,
        default='./data/valid.article.filter.txt',
        help='Test file path [default: ./data/valid.article.filter.txt]')
    parser.add_argument('--output_file',
                        type=str,
                        default='./pred_y.txt',
                        help='Output file path [default: ./pred_y.txt]')
    parser.add_argument('--w2i_file',
                        type=str,
                        default='./w2i.dump',
                        help='Word2Index file path [default: ./w2i.dump]')
    parser.add_argument('--i2w_file',
                        type=str,
                        default='./i2w.dump',
                        help='Index2Word file path [default: ./i2w.dump]')
    parser.add_argument(
        '--alloc_mem',
        type=int,
        default=1024,
        help='Amount of memory to allocate [mb] [default: 1024]')
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_TEST = args.n_test
    K = args.beam_size
    MAX_LEN = args.max_len
    ALLOC_MEM = args.alloc_mem

    # File paths
    MODEL_FILE = args.model_file
    INPUT_FILE = args.input_file
    OUTPUT_FILE = args.output_file
    W2I_FILE = args.w2i_file
    I2W_FILE = args.i2w_file

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load trained model
    # NOTE: pickle.load executes arbitrary code from the file; only point
    # --w2i_file / --i2w_file at dumps you produced yourself.
    with open(W2I_FILE, 'rb') as f_w2i, open(I2W_FILE, 'rb') as f_i2w:
        w2i = pickle.load(f_w2i)
        i2w = pickle.load(f_i2w)

    test_X, _, _ = build_dataset(INPUT_FILE,
                                 w2i=w2i,
                                 n_data=N_TEST,
                                 target=False)

    model = dy.Model()
    V, encoder, decoder = dy.load(MODEL_FILE, model)

    # Sentence boundary ids, looked up once instead of per candidate
    bos_id = w2i['<s>']
    eos_id = w2i['</s>']

    # Decode
    pred_y = []
    for x in tqdm(test_X):
        dy.renew_cg()
        associate_parameters([encoder, decoder])

        # Initial states
        x_embs = [dy.lookup(V, x_t) for x_t in x]
        hp, hb_1 = encoder(x_embs)
        decoder.set_initial_states(hp, hb_1)
        s_0, c_0 = decoder.s_0, decoder.c_0

        # Each candidate is [score, last word id, s, c, word ids so far]
        candidates = [[0, bos_id, s_0, c_0, []]]

        t = 0
        while t < MAX_LEN:
            t += 1
            tmp_candidates = []
            end_flag = True
            for score_tm1, y_tm1, s_tm1, c_tm1, y_02tm1 in candidates:
                if y_tm1 == eos_id:
                    # Finished hypotheses are carried over unchanged
                    tmp_candidates.append(
                        [score_tm1, y_tm1, s_tm1, c_tm1, y_02tm1])
                else:
                    end_flag = False
                    y_tm1_emb = dy.lookup(V, y_tm1)
                    s_t, c_t, _q_t = decoder(y_tm1_emb,
                                             tm1s=[s_tm1, c_tm1],
                                             test=True)
                    log_probs = np.log(_q_t.npvalue())  # Calculate log probs
                    # Ids of the K highest log probs, best first; one argsort
                    # gives both the ids and (by indexing) their values.
                    y_t = np.argsort(log_probs)[::-1][:K]
                    q_t = log_probs[y_t]
                    score_t = score_tm1 + q_t  # Accumulate log probs
                    tmp_candidates.extend(
                        [[score_tk, y_tk, s_t, c_t, y_02tm1 + [y_tk]]
                         for score_tk, y_tk in zip(score_t, y_t)])
            if end_flag:
                # Every hypothesis in the beam has ended
                break
            # Sort by length-normalized log prob and keep the K best
            candidates = sorted(
                tmp_candidates, key=lambda cand: -cand[0] / len(cand[-1])
            )[:K]

        # Pick the candidate with the highest score
        pred = candidates[0][-1]
        if eos_id in pred:
            pred.remove(eos_id)
        pred_y.append(pred)

    # One decoded sentence per line, each line newline-terminated
    pred_y_txt = ''.join(
        ' '.join([i2w[com] for com in pred]) + '\n' for pred in pred_y)

    with open(OUTPUT_FILE, 'w') as f:
        f.write(pred_y_txt)
Exemplo n.º 8
0
 def word_dropout(self, lookup_table, word, update=True, mode='train'):
     """Look up ``word``, randomly replacing it by index 0 during training.

     The replacement is only considered when ``mode`` is 'train' and ``word``
     is not already 0; it then happens with probability
     ``self.opt['word_dropout_rate']``.
     """
     drop = (mode == 'train' and word != 0
             and np.random.random() < self.opt['word_dropout_rate'])
     index = 0 if drop else word
     return dy.lookup(lookup_table, index, update)
Exemplo n.º 9
0
def main():
    """Predict a label for every sentence of an input file with a saved CNN.

    Parses the command-line options, configures DyNet, loads the saved model
    (one or two embedding tables followed by the layers) and the vocabulary
    dumps, pads each sentence with zeros on both sides, runs the layers in
    non-training mode and writes one prediction per line to the output file.
    """
    arg_parser = argparse.ArgumentParser(description='Convolutional Neural Networks for Sentence Classification in DyNet')

    arg_parser.add_argument('--gpu', type=int, default=-1, help='GPU ID to use. For cpu, set -1 [default: -1]')
    arg_parser.add_argument('--model_file', type=str, default='./model', help='Model to use for prediction [default: ./model]')
    arg_parser.add_argument('--input_file', type=str, default='./data/valid_x.txt', help='Input file path [default: ./data/valid_x.txt]')
    arg_parser.add_argument('--output_file', type=str, default='./pred_y.txt', help='Output file path [default: ./pred_y.txt]')
    arg_parser.add_argument('--w2i_file', type=str, default='./w2i.dump', help='Word2Index file path [default: ./w2i.dump]')
    arg_parser.add_argument('--i2w_file', type=str, default='./i2w.dump', help='Index2Word file path [default: ./i2w.dump]')
    arg_parser.add_argument('--alloc_mem', type=int, default=1024, help='Amount of memory to allocate [mb] [default: 1024]')
    opts = arg_parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = str(opts.gpu)

    # DyNet setting
    dynet_config = dy.DynetParams()
    dynet_config.set_mem(opts.alloc_mem)
    dynet_config.init()

    # Load model: a saved list of three items has a single embedding table,
    # anything else is treated as two tables followed by the layers.
    model = dy.Model()
    loaded = dy.load(opts.model_file, model)
    multichannel = len(loaded) != 3
    if multichannel:
        V1, V2, layers = loaded[0], loaded[1], loaded[2:]
    else:
        V1, layers = loaded[0], loaded[1:]

    emb_dim = V1.shape()[0]
    win_sizes = layers[0].win_sizes

    # Load test data
    with open(opts.w2i_file, 'rb') as w2i_fh, open(opts.i2w_file, 'rb') as i2w_fh:
        w2i = pickle.load(w2i_fh)
        i2w = pickle.load(i2w_fh)  # loaded but not used below

    # Pad both ends of every sentence with as many zeros as the widest window
    padding = [0] * max(win_sizes)
    sentences, _, _ = build_dataset(opts.input_file, w2i=w2i, unksym='unk')
    sentences = [padding + sentence + padding for sentence in sentences]

    # Pred
    predictions = []
    for sentence in tqdm(sentences):
        # Create a new computation graph
        dy.renew_cg()
        associate_parameters(layers)

        n_tokens = len(sentence)

        if multichannel:
            channel1 = dy.concatenate([dy.lookup(V1, tok, update=False) for tok in sentence], d=1)
            channel2 = dy.concatenate([dy.lookup(V2, tok, update=False) for tok in sentence], d=1)
            # Stack the two transposed embedding matrices along a third axis
            x_embs = dy.concatenate([dy.transpose(channel1), dy.transpose(channel2)], d=2)
        else:
            single = dy.concatenate([dy.lookup(V1, tok, update=False) for tok in sentence], d=1)
            x_embs = dy.reshape(dy.transpose(single), (n_tokens, emb_dim, 1))

        score = f_props(layers, x_embs, train=False)
        predictions.append(str(int(binary_pred(score.value()))))

    with open(opts.output_file, 'w') as out_fh:
        out_fh.write('\n'.join(predictions))
Exemplo n.º 10
0
def main():
    """Train the recurrent generative decoder summarization model.

    Parses the command-line options, configures DyNet, builds the vocabulary
    and the train/validation sets, then for every epoch shuffles the training
    data, runs one training pass and one validation pass, prints the mean
    losses, and writes the model (``./model_e<epoch>``) and the vocabulary
    dumps.
    """
    parser = argparse.ArgumentParser(
        description=
        'Deep Recurrent Generative Decoder for Abstractive Text Summarization in DyNet'
    )

    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs',
                        type=int,
                        default=3,
                        help='Number of epochs [default: 3]')
    parser.add_argument(
        '--n_train',
        type=int,
        default=3803957,
        help=
        'Number of training examples (up to 3803957 in gigaword) [default: 3803957]'
    )
    parser.add_argument(
        '--n_valid',
        type=int,
        default=189651,
        help=
        'Number of validation examples (up to 189651 in gigaword) [default: 189651]'
    )
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Mini batch size [default: 32]')
    parser.add_argument('--emb_dim',
                        type=int,
                        default=256,
                        help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim',
                        type=int,
                        default=256,
                        help='Hidden state size [default: 256]')
    parser.add_argument('--lat_dim',
                        type=int,
                        default=256,
                        help='Latent size [default: 256]')
    parser.add_argument(
        '--alloc_mem',
        type=int,
        default=8192,
        help='Amount of memory to allocate [mb] [default: 8192]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = 60000
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    LAT_DIM = args.lat_dim
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset: word counts are accumulated over both training files
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE,
                                      w2c=w2c,
                                      padid=False,
                                      eos=True,
                                      unksym='<unk>',
                                      target=False,
                                      n_data=N_TRAIN,
                                      vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE,
                                  w2i=w2i,
                                  target=True,
                                  n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE,
                                  w2i=w2i,
                                  target=False,
                                  n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE,
                                  w2i=w2i,
                                  target=True,
                                  n_data=N_VALID)

    # The vocabulary actually built replaces the requested size.
    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print(VOCAB_SIZE)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))

    encoder = BiGRU(model, EMB_DIM, 2 * HID_DIM)
    decoder = RecurrentGenerativeDecoder(model, EMB_DIM, 2 * HID_DIM, LAT_DIM,
                                         OUT_DIM)

    def minibatch_loss(X_mb, y_mb):
        """Build a fresh computation graph and return the mean loss of one mini batch."""
        dy.renew_cg()
        encoder.associate_parameters()
        decoder.associate_parameters()

        losses = []
        for x, t in zip(X_mb, y_mb):
            # Decoder input is the target without its last token; the
            # expected output is the target without its first token.
            t_in, t_out = t[:-1], t[1:]

            # Encoder
            x_embs = [dy.lookup(V, x_t) for x_t in x]
            he = encoder(x_embs)

            # Decoder
            t_embs = [dy.lookup(V, t_t) for t_t in t_in]
            decoder.set_initial_states(he)
            y, KL = decoder(t_embs)

            # Per-step negative log-likelihood plus the decoder's KL term
            losses.append(dy.esum([
                dy.pickneglogsoftmax(y_t, t_t) + KL_t
                for y_t, t_t, KL_t in zip(y, t_out, KL)
            ]))
        return dy.average(losses)

    # Train model
    # Integer ceiling division: always an int, so it is valid for range()
    # regardless of how "/" divides integers.
    n_batches_train = (len(train_X) + BATCH_SIZE - 1) // BATCH_SIZE
    n_batches_valid = (len(valid_X) + BATCH_SIZE - 1) // BATCH_SIZE

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            mb_loss = minibatch_loss(train_X[start:end], train_y[start:end])

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid: forward pass only, no parameter update
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            mb_loss = minibatch_loss(valid_X[start:end], valid_y[start:end])
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' %
              (epoch + 1, np.mean(loss_all_train), np.mean(loss_all_valid)))

        # Save model
        dy.save('./model_e' + str(epoch + 1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump',
                                                     'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)