Example #1
from utils import embedding
import os

data_path = 'data/'  # path of data folder
embeddingHandler = embedding.Embedding()
# convert the train/test corpora to id files using the vocabularies below

en_train_path = os.path.join(data_path, "en_train.txt")
en_test_path = os.path.join(data_path, "en_test.txt")
en_vocab_path = os.path.join(data_path, "en_vocab.txt")

cn_train_path = os.path.join(data_path, "cn_train.txt")
cn_test_path = os.path.join(data_path, "cn_test.txt")
cn_vocab_path = os.path.join(data_path, "cn_vocab.txt")

en_train_ids_path = os.path.join(data_path, "en_train_ids.txt")
en_test_ids_path = os.path.join(data_path, "en_test_ids.txt")
cn_train_ids_path = os.path.join(data_path, "cn_train_ids.txt")
cn_test_ids_path = os.path.join(data_path, "cn_test_ids.txt")

embeddingHandler.create_ids(en_train_path, en_vocab_path, en_train_ids_path)
embeddingHandler.create_ids(en_test_path, en_vocab_path, en_test_ids_path)
embeddingHandler.create_ids(cn_train_path, cn_vocab_path, cn_train_ids_path)
embeddingHandler.create_ids(cn_test_path, cn_vocab_path, cn_test_ids_path)
# one-time vocabulary creation, kept here for reference:
'''
en_path = "data/en.txt"
en_vocab_path = "data/en_vocab.txt"
cn_path = "data/cn.txt"
cn_vocab_path = "data/cn_vocab.txt"
embeddingHandler.create_vocab(en_path, en_vocab_path)
print(en_vocab_path)
'''
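
For context, create_ids maps each corpus line to vocabulary ids. Below is a minimal sketch of such a conversion, assuming whitespace tokenization, one vocabulary word per line, and an '<unk>' entry; the real utils.embedding API may differ:

# Hypothetical sketch (not the project's implementation): convert each
# corpus line to space-separated vocabulary ids, mapping unknown tokens
# to the id of an '<unk>' entry.
def create_ids_sketch(corpus_path, vocab_path, ids_path, unk_token='<unk>'):
    with open(vocab_path, encoding='utf-8') as f:
        word_to_id = {w.strip(): i for i, w in enumerate(f)}
    unk_id = word_to_id.get(unk_token, 0)
    with open(corpus_path, encoding='utf-8') as src, \
         open(ids_path, 'w', encoding='utf-8') as dst:
        for line in src:
            ids = [str(word_to_id.get(tok, unk_id)) for tok in line.split()]
            dst.write(' '.join(ids) + '\n')
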
Example #2
import os
import time

import numpy as np
import tensorflow as tf

from utils import embedding

# Note: project-level helpers referenced below (create_dataset, get_word_ids,
# raw_rnn_for_beam_search, extract_from_tree, bleu, infer_attention_model_v2,
# sos_vocab_id, eos_vocab_id) are defined elsewhere in the repository and are
# assumed to be in scope.


def test_model(model_path, src_file_name, tgt_file_name, beam_width=1):
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        data_path = 'data/'  # path of data folder
        embeddingHandler = embedding.Embedding()

        ############### load embedding for source language ###############
        src_input_path = data_path + src_file_name  # path to the source input file fed to the encoder
        src_embedding_output_path = data_path + 'embedding.vi'  # path to the word-embedding file
        src_vocab_path = data_path + 'vocab.vi'  # path to the vocabulary file

        vocab_src, dic_src = embeddingHandler.load_vocab(src_vocab_path)
        sentences_src = embeddingHandler.load_sentences(src_input_path)
        if not os.path.exists(src_embedding_output_path):
            word2vec_src = embeddingHandler.create_embedding(
                sentences_src, vocab_src, src_embedding_output_path)
        else:
            word2vec_src = embeddingHandler.load_embedding(
                src_embedding_output_path)
        embedding_src = embeddingHandler.parse_embedding_to_list_from_vocab(
            word2vec_src, vocab_src)
        embedding_src = tf.constant(embedding_src)

        ################ load embedding for target language ####################
        tgt_input_path = data_path + tgt_file_name
        tgt_embedding_output_path = data_path + 'embedding.en'
        tgt_vocab_path = data_path + 'vocab.en'

        vocab_tgt, dic_tgt = embeddingHandler.load_vocab(tgt_vocab_path)
        sentences_tgt = embeddingHandler.load_sentences(tgt_input_path)
        if not os.path.exists(tgt_embedding_output_path):
            word2vec_tgt = embeddingHandler.create_embedding(
                sentences_tgt, vocab_tgt, tgt_embedding_output_path)
        else:
            word2vec_tgt = embeddingHandler.load_embedding(
                tgt_embedding_output_path)
        embedding_tgt = embeddingHandler.parse_embedding_to_list_from_vocab(
            word2vec_tgt, vocab_tgt)
        embedding_tgt = tf.constant(embedding_tgt)

        if word2vec_src.vector_size != word2vec_tgt.vector_size:
            print('Word2Vec dimensions do not match')
            exit(1)
        if len(sentences_src) != len(sentences_tgt):
            print('Source and target data do not have the same number of lines')
            exit(1)
        word2vec_dim = word2vec_src.vector_size  # dimensionality of each word vector

        ################## create dataset ######################
        batch_size = 64

        # create test set for encoder (source)
        sentences_src_as_ids = embeddingHandler.convert_sentences_to_ids(
            dic_src, sentences_src)
        for sentence in sentences_src_as_ids:  # add <eos>
            sentence.append(eos_vocab_id)
        test_set_src = create_dataset(sentences_src_as_ids)
        test_set_src_len = create_dataset([[len(s)]
                                           for s in sentences_src_as_ids])

        # create test set for decoder (target)
        sentences_tgt_as_ids = embeddingHandler.convert_sentences_to_ids(
            dic_tgt, sentences_tgt)
        # for sentence_as_ids in sentences_tgt_as_ids:  # add </s> id to the end of each sentence of target language
        #     sentence_as_ids.append(eos_vocab_id)
        test_set_tgt = create_dataset(sentences_tgt_as_ids)
        test_set_tgt_len = create_dataset([[len(sentence) + 1]
                                           for sentence in sentences_tgt_as_ids
                                           ])
        # Note: [len(sentence)+1] for later <sos>/<eos>
        test_set_tgt_padding = create_dataset([
            np.ones(len(sentence) + 1, np.float32)
            for sentence in sentences_tgt_as_ids
        ])

        # create a dataset that zips the previous test sets
        train_dataset = tf.data.Dataset.zip(
            (test_set_src, test_set_tgt, test_set_src_len, test_set_tgt_len,
             test_set_tgt_padding))
        train_dataset = train_dataset.apply(
            tf.contrib.data.padded_batch_and_drop_remainder(
                batch_size, ([None], [None], [1], [1], [None])))
        train_iter = train_dataset.make_initializable_iterator()
        x_batch, y_batch, len_xs, len_ys, padding_mask = train_iter.get_next()
        # Note: len_xs and len_ys have shape [batch_size, 1]

        #################### build graph ##########################
        hidden_size = word2vec_dim  # number of hidden units
        encode_seq_lens = tf.reshape(len_xs, shape=[batch_size])
        # ---------encoder first layer
        enc_1st_outputs, enc_1st_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=tf.nn.rnn_cell.BasicLSTMCell(hidden_size),
            cell_bw=tf.nn.rnn_cell.BasicLSTMCell(hidden_size),
            inputs=tf.nn.embedding_lookup(embedding_src, x_batch),
            sequence_length=encode_seq_lens,
            swap_memory=True,
            time_major=False,
            dtype=tf.float32)  # [batch, time, hid]
        fw_enc_1st_hid_states, bw_enc_1st_hid_states = enc_1st_outputs
        # fw_enc_1st_last_hid, bw_enc_1st_last_hid = enc_1st_states

        # ----------encoder second layer
        num_layers = 2
        # Note: build a distinct cell per layer; passing the same cell object
        # twice would share weights or raise an error in TF >= 1.2
        stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([
            tf.nn.rnn_cell.BasicLSTMCell(hidden_size * 2)
            for _ in range(num_layers)
        ])
        enc_2nd_outputs, enc_2nd_states = tf.nn.dynamic_rnn(
            cell=stacked_lstm,
            inputs=tf.concat([fw_enc_1st_hid_states, bw_enc_1st_hid_states],
                             axis=-1),
            sequence_length=encode_seq_lens,
            dtype=tf.float32,
            swap_memory=True,
            time_major=False)

        # ----------decoder
        encode_output_size = hidden_size * 2
        # decode_seq_lens = tf.reshape(len_ys, shape=[batch_size])
        decode_seq_lens = encode_seq_lens * 2  # maximum decode iterations: twice the source length
        attention_output_size = 256
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            num_units=encode_output_size,
            memory=enc_2nd_outputs,  # require [batch, time, ...]
            memory_sequence_length=encode_seq_lens,
            dtype=tf.float32)
        attention_cell = tf.nn.rnn_cell.BasicLSTMCell(
            num_units=encode_output_size)
        attention_cell = tf.contrib.seq2seq.AttentionWrapper(
            attention_cell,
            attention_mechanism,
            attention_layer_size=attention_output_size)
        state_to_clone = attention_cell.zero_state(dtype=tf.float32,
                                                   batch_size=batch_size)
        decoder_initial_state = tf.contrib.seq2seq.AttentionWrapperState(
            cell_state=tf.nn.rnn_cell.LSTMStateTuple(c=tf.zeros_like(
                enc_2nd_states[-1].c, dtype=tf.float32),
                                                     h=enc_2nd_states[-1].h),
            attention=state_to_clone.attention,
            time=state_to_clone.time,
            alignments=state_to_clone.alignments,
            alignment_history=state_to_clone.alignment_history,
            attention_state=state_to_clone.attention_state)
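        # Note: the decoder thus starts from the top encoder layer's final
        # state with h carried over and c zero-initialized, while attention,
        # time, and alignment fields come from the wrapper's zero state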

        # projection
        tgt_vocab_size = len(vocab_tgt)
        weight_score = tf.Variable(
            tf.random_uniform(shape=[attention_output_size, tgt_vocab_size],
                              minval=-0.1,
                              maxval=0.1))
        bias_score = tf.Variable(tf.zeros([batch_size, tgt_vocab_size]))

        # beam search
        def loop_fn(time, cell_output, cell_state, log_probs, beam_finished):
            elements_finished = time >= decode_seq_lens  # finish by sentence length
            if cell_output is None:  # initialize step
                next_cell_state = tuple(decoder_initial_state
                                        for _ in range(beam_width))
                next_input = tuple(
                    tf.nn.embedding_lookup(embedding_tgt, [sos_vocab_id] *
                                           batch_size)
                    for _ in range(beam_width))
                predicted_ids = tf.convert_to_tensor(
                    [0] * beam_width
                )  # https://github.com/hanxiao/hanxiao.github.io/issues/8
                new_log_probs = tf.zeros([batch_size, beam_width])
                new_beam_finished = tf.fill([batch_size, beam_width],
                                            value=False)
                parent_indexs = None
            else:

                def not_time_0():
                    next_cell_state = cell_state
                    # find predicted_ids
                    values_list = []
                    indices_list = []
                    for i in range(beam_width):
                        score = tf.add(tf.matmul(cell_output[i], weight_score),
                                       bias_score)
                        softmax = tf.nn.softmax(score)
                        log_prob = tf.log(softmax)
                        values, indices = tf.nn.top_k(
                            log_prob, beam_width,
                            sorted=True)  # [batch, beam], [batch, beam]
                        # Note: indices are also the word ids
                        values = tf.add(values,
                                        tf.expand_dims(
                                            log_probs[:, i],
                                            -1))  # sum with previous log_prob
                        values_list.append(values)
                        indices_list.append(indices)
                    concat_vlist = tf.concat(
                        tf.unstack(values_list, axis=0),
                        axis=-1)  # [batch_size, beam_width*beam_width]
                    concat_ilist = tf.concat(tf.unstack(indices_list, axis=0),
                                             axis=-1)
                    top_values, index_in_vlist = tf.nn.top_k(
                        concat_vlist, beam_width,
                        sorted=True)  # [batch_size, beam_width]
                    # Note: in tf.nn.top_k, if sorted=False its values come back sorted ascending

                    predicted_ids = get_word_ids(index_in_vlist, concat_ilist,
                                                 batch_size)
                    predicted_ids = tf.stack(
                        predicted_ids)  # [batch_size, beam_width]

                    # new_beam_finished = tf.logical_or(tf.equal(predicted_ids, eos_vocab_id), beam_finished)

                    # find parent_ids that match word_ids_to_add
                    parent_indexs = index_in_vlist // beam_width
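                    # Note: each parent beam contributed beam_width consecutive
                    # candidates to concat_vlist, so integer division by
                    # beam_width recovers the parent beam index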
                    # find new_log_probs
                    new_log_probs = top_values

                    # shift top-k according to beam_finished
                    # which means we will shift predicted_ids, new_log_probs, parent_indexs
                    def shift(tensor_1D, num_shift, vacancy_value):
                        """
                        shift elements right by num_shift, filling vacated left slots with vacancy_value
                        """
                        shift_value = tensor_1D[:beam_width - num_shift]
                        fill_vacancy = tf.fill([num_shift], vacancy_value)
                        return tf.concat([fill_vacancy, shift_value], axis=0)

                    ids_arr = []
                    probs_arr = []
                    parents_arr = []
                    num_shifts = tf.map_fn(
                        lambda beam: tf.reduce_sum(tf.cast(beam, tf.int32)),
                        beam_finished,
                        dtype=tf.int32)
                    # Note: we shift using beam_finished rather than new_beam_finished,
                    # so a beam that finishes at this very step keeps its real log prob
                    # instead of writing -inf into final_log_probs
                    for i in range(batch_size):
                        num_shift = num_shifts[i]
                        ids_arr.append(
                            shift(predicted_ids[i], num_shift, eos_vocab_id))
                        probs_arr.append(
                            shift(new_log_probs[i], num_shift, -np.inf))
                        parents_arr.append(
                            shift(parent_indexs[i], num_shift, -1))
                    valid_shape = tf.shape(beam_finished)
                    predicted_ids = tf.stack(ids_arr)
                    predicted_ids = tf.reshape(predicted_ids, valid_shape)
                    new_log_probs = tf.stack(probs_arr)
                    new_log_probs = tf.reshape(new_log_probs, valid_shape)
                    parent_indexs = tf.stack(parents_arr)
                    parent_indexs = tf.reshape(parent_indexs, valid_shape)

                    new_beam_finished = tf.logical_or(
                        tf.equal(predicted_ids, eos_vocab_id), beam_finished)

                    # define next_input
                    finished = tf.reduce_all(elements_finished)
                    next_input = tuple(
                        tf.cond(
                            finished, lambda: tf.nn.embedding_lookup(
                                embedding_tgt, [eos_vocab_id] * batch_size),
                            lambda: tf.nn.embedding_lookup(
                                embedding_tgt, predicted_ids[:, i]))
                        for i in range(beam_width))

                    return elements_finished, next_input, next_cell_state, predicted_ids, new_log_probs, new_beam_finished, parent_indexs

                def time_0():
                    next_cell_state = cell_state
                    # find next_input
                    score = tf.add(tf.matmul(cell_output[0], weight_score),
                                   bias_score)
                    softmax = tf.nn.softmax(score)
                    log_prob = tf.log(softmax)
                    top_values, predicted_ids = tf.nn.top_k(
                        log_prob, beam_width,
                        sorted=True)  # [batch_size, beam_width]

                    new_beam_finished = beam_finished

                    parent_indexs = tf.fill([batch_size, beam_width], value=-1)

                    new_log_probs = top_values

                    finished = tf.reduce_all(elements_finished)
                    next_input = tuple(
                        tf.cond(
                            finished, lambda: tf.nn.embedding_lookup(
                                embedding_tgt, [eos_vocab_id] * batch_size),
                            lambda: tf.nn.embedding_lookup(
                                embedding_tgt, predicted_ids[:, i]))
                        for i in range(beam_width))

                    return elements_finished, next_input, next_cell_state, predicted_ids, new_log_probs, new_beam_finished, parent_indexs

                # Important note: we don't feed <sos> to every beam at step 0, because that
                # would make all beams identical; instead we feed the top-k predictions
                # generated from feeding <sos> once. Other returns are passed through unchanged.
                elements_finished, next_input, next_cell_state, predicted_ids, new_log_probs, new_beam_finished, parent_indexs = tf.cond(
                    tf.equal(time, 0), time_0, not_time_0)

            return elements_finished, next_input, next_cell_state, predicted_ids, new_log_probs, new_beam_finished, parent_indexs

        predicted_ids_ta, parent_ids_ta, penalty_lengths, final_log_probs = raw_rnn_for_beam_search(
            attention_cell, loop_fn)
        translation_ta = extract_from_tree(predicted_ids_ta, parent_ids_ta,
                                           batch_size, beam_width)
        outputs = translation_ta.stack()  # [time, batch, beam]
        # choose the translation with the maximum length-normalized log probability
        normalize_log_probs = final_log_probs / penalty_lengths
        chosen_translations = tf.argmax(normalize_log_probs,
                                        axis=-1,
                                        output_type=tf.int32)  # [batch]
        transpose_outputs = tf.transpose(
            outputs, perm=[1, 2, 0])  # transpose to [batch, beam, time]
        final_output = get_word_ids(tf.expand_dims(chosen_translations, -1),
                                    transpose_outputs, batch_size)
        final_output = tf.stack(final_output)  # [batch, 1, time]
        final_output = tf.reshape(final_output,
                                  [batch_size, -1])  # [batch, time]

        #################### train ########################
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, model_path)
            sess.run(train_iter.initializer)
            references = []
            # Note: references must be 3-D to pass into the compute_bleu function:
            # the first dimension is batch size, the second is the number of
            # references per translation, the third is sentence length (may vary)
            translation = []
            while True:
                # for i in range(10):
                #     print(i)
                try:
                    predictions, labels = sess.run([final_output, y_batch])
                    # trim trailing <eos>/padding ids so overlapping padding does not inflate the BLEU score
                    predictions = [
                        np.trim_zeros(predict, 'b') for predict in predictions
                    ]
                    labels = [np.trim_zeros(lb, 'b') for lb in labels]
                    # # convert ids to words
                    # predictions = [embeddingHandler.ids_to_words(predict, vocab_tgt) for predict in predictions]
                    # labels = [embeddingHandler.ids_to_words(lb, vocab_tgt) for lb in labels]
                    references.extend(labels)
                    translation.extend(predictions)
                except tf.errors.OutOfRangeError:
                    break

            # compute bleu score
            reshaped_references = [[ref] for ref in references]
            bleu_score, *_ = bleu.compute_bleu(reshaped_references,
                                               translation,
                                               max_order=4,
                                               smooth=False)
            return bleu_score
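
The final selection step above divides each beam's accumulated log probability by a length penalty before taking the argmax, so longer hypotheses are not penalized merely for summing more negative terms. A self-contained sketch of that normalization; the names mirror the variables above, but the numbers are purely illustrative:

import numpy as np

# Hypothetical example of length-normalized beam selection for one batch
# element with three beams.
final_log_probs = np.array([[-4.5, -2.8, -5.6]])  # [batch, beam]
penalty_lengths = np.array([[6.0, 4.0, 7.0]])     # [batch, beam]
normalized = final_log_probs / penalty_lengths    # higher is better
chosen = np.argmax(normalized, axis=-1)           # beam index per batch element
print(chosen)  # [1]: beam 1 wins once scores are normalized by length
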

Example #3
# Note: same imports and module-level helpers as Example #2.
def train_model():
    print('Loading word embeddings...')
    data_path = 'data/'  # path of data folder
    embeddingHandler = embedding.Embedding()

    ############### load embedding for source language ###############
    src_input_path = data_path + 'train.vi'  # path to the source training file fed to the encoder
    src_embedding_output_path = data_path + 'embedding.vi'  # path to the word-embedding file
    src_vocab_path = data_path + 'vocab.vi'  # path to the vocabulary file

    vocab_src, dic_src = embeddingHandler.load_vocab(src_vocab_path)
    sentences_src = embeddingHandler.load_sentences(src_input_path)
    if not os.path.exists(src_embedding_output_path):
        word2vec_src = embeddingHandler.create_embedding(
            sentences_src, vocab_src, src_embedding_output_path)
    else:
        word2vec_src = embeddingHandler.load_embedding(
            src_embedding_output_path)
    embedding_src = embeddingHandler.parse_embedding_to_list_from_vocab(
        word2vec_src, vocab_src)
    embedding_src = tf.constant(embedding_src)

    ################ load embedding for target language ####################
    tgt_input_path = data_path + 'train.en'
    tgt_embedding_output_path = data_path + 'embedding.en'
    tgt_vocab_path = data_path + 'vocab.en'

    vocab_tgt, dic_tgt = embeddingHandler.load_vocab(tgt_vocab_path)
    sentences_tgt = embeddingHandler.load_sentences(tgt_input_path)
    if not os.path.exists(tgt_embedding_output_path):
        word2vec_tgt = embeddingHandler.create_embedding(
            sentences_tgt, vocab_tgt, tgt_embedding_output_path)
    else:
        word2vec_tgt = embeddingHandler.load_embedding(
            tgt_embedding_output_path)
    embedding_tgt = embeddingHandler.parse_embedding_to_list_from_vocab(
        word2vec_tgt, vocab_tgt)
    embedding_tgt = tf.constant(embedding_tgt)

    if word2vec_src.vector_size != word2vec_tgt.vector_size:
        print('Word2Vec dimensions do not match')
        exit(1)
    if len(sentences_src) != len(sentences_tgt):
        print('Source and target data do not have the same number of lines')
        exit(1)
    word2vec_dim = word2vec_src.vector_size  # dimensionality of each word vector
    training_size = len(sentences_src)
    print('Word2Vec dimension: ', word2vec_dim)
    print('-------------------------------')

    ################## create dataset ######################
    batch_size = 64
    num_epochs = 12
    print('Creating dataset...')
    print('Number of training examples: ', training_size)

    # create training set for encoder (source)
    sentences_src_as_ids = embeddingHandler.convert_sentences_to_ids(
        dic_src, sentences_src)
    for sentence in sentences_src_as_ids:  # add <eos>
        sentence.append(eos_vocab_id)
    train_set_src = create_dataset(sentences_src_as_ids)
    train_set_src_len = create_dataset([[len(s)]
                                        for s in sentences_src_as_ids])

    # create training set for decoder (target)
    sentences_tgt_as_ids = embeddingHandler.convert_sentences_to_ids(
        dic_tgt, sentences_tgt)
    # for sentence_as_ids in sentences_tgt_as_ids:  # add </s> id to the end of each sentence of target language
    #     sentence_as_ids.append(eos_vocab_id)
    train_set_tgt = create_dataset(sentences_tgt_as_ids)
    train_set_tgt_len = create_dataset([[len(sentence) + 1]
                                        for sentence in sentences_tgt_as_ids])
    # Note: [len(sentence)+1] for later <sos>/<eos>
    train_set_tgt_padding = create_dataset([
        np.ones(len(sentence) + 1, np.float32)
        for sentence in sentences_tgt_as_ids
    ])
    ## padding matrix
    # target_weights = create_dataset([np.ones(len(sentence) + 1) for sentence in sentences_tgt_as_ids])

    # create a dataset that zips the previous training sets
    train_dataset = tf.data.Dataset.zip(
        (train_set_src, train_set_tgt, train_set_src_len, train_set_tgt_len,
         train_set_tgt_padding))
    train_dataset = train_dataset.shuffle(buffer_size=training_size, seed=9)
    # train_dataset = train_dataset.shuffle(buffer_size=training_size)
    train_dataset = train_dataset.apply(
        tf.contrib.data.padded_batch_and_drop_remainder(
            batch_size, ([None], [None], [1], [1], [None])))
    train_iter = train_dataset.make_initializable_iterator()
    x_batch, y_batch, len_xs, len_ys, padding_mask = train_iter.get_next()
    # Note: len_xs and len_ys have shape [batch_size, 1]
    print('-------------------------------')
    #################### build graph ##########################
    hidden_size = word2vec_dim  # number of hidden units
    print('Building graph...')
    encode_seq_lens = tf.reshape(len_xs, shape=[batch_size])
    # ---------encoder first layer
    enc_1st_outputs, enc_1st_states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=tf.nn.rnn_cell.BasicLSTMCell(hidden_size),
        cell_bw=tf.nn.rnn_cell.BasicLSTMCell(hidden_size),
        inputs=tf.nn.embedding_lookup(embedding_src, x_batch),
        sequence_length=encode_seq_lens,
        swap_memory=True,
        time_major=False,
        dtype=tf.float32)  # [batch, time, hid]
    fw_enc_1st_hid_states, bw_enc_1st_hid_states = enc_1st_outputs
    # fw_enc_1st_last_hid, bw_enc_1st_last_hid = enc_1st_states

    # ----------encoder second layer
    num_layers = 2
    # Note: build a distinct cell per layer; passing the same cell object
    # twice would share weights or raise an error in TF >= 1.2
    stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([
        tf.nn.rnn_cell.BasicLSTMCell(hidden_size * 2)
        for _ in range(num_layers)
    ])
    enc_2nd_outputs, enc_2nd_states = tf.nn.dynamic_rnn(
        cell=stacked_lstm,
        inputs=tf.concat([fw_enc_1st_hid_states, bw_enc_1st_hid_states],
                         axis=-1),
        sequence_length=encode_seq_lens,
        dtype=tf.float32,
        swap_memory=True,
        time_major=False)

    # ----------decoder
    encode_output_size = hidden_size * 2
    decode_seq_lens = tf.reshape(len_ys, shape=[batch_size])
    attention_output_size = 256
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        num_units=encode_output_size,
        memory=enc_2nd_outputs,  # require [batch, time, ...]
        memory_sequence_length=encode_seq_lens,
        dtype=tf.float32)
    attention_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=encode_output_size)
    attention_cell = tf.contrib.seq2seq.AttentionWrapper(
        attention_cell,
        attention_mechanism,
        attention_layer_size=attention_output_size)
    add_sos = tf.concat(
        [tf.reshape([sos_vocab_id] * batch_size, [batch_size, 1]), y_batch],
        axis=-1)
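    # prepend <sos> to every target sentence to form the decoder input;
    # the matching <eos> is appended to the labels further below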
    state_to_clone = attention_cell.zero_state(dtype=tf.float32,
                                               batch_size=batch_size)
    decoder_initial_state = tf.contrib.seq2seq.AttentionWrapperState(
        cell_state=tf.nn.rnn_cell.LSTMStateTuple(c=tf.zeros_like(
            enc_2nd_states[-1].c, dtype=tf.float32),
                                                 h=enc_2nd_states[-1].h),
        attention=state_to_clone.attention,
        time=state_to_clone.time,
        alignments=state_to_clone.alignments,
        alignment_history=state_to_clone.alignment_history,
        attention_state=state_to_clone.attention_state)
    dec_outputs, _ = tf.nn.dynamic_rnn(cell=attention_cell,
                                       inputs=tf.nn.embedding_lookup(
                                           embedding_tgt,
                                           tf.transpose(add_sos)),
                                       initial_state=decoder_initial_state,
                                       sequence_length=decode_seq_lens,
                                       dtype=tf.float32,
                                       swap_memory=True,
                                       time_major=True)

    # -----------calculate score
    tgt_vocab_size = len(vocab_tgt)
    weight_score = tf.Variable(
        tf.random_uniform(shape=[attention_output_size, tgt_vocab_size],
                          minval=-0.1,
                          maxval=0.1))
    bias_score = tf.Variable(tf.zeros([batch_size, tgt_vocab_size]))
    dec_outputs_len = tf.shape(dec_outputs)[0]

    def cond(i, *_):
        return tf.less(i, dec_outputs_len)

    def body(i, _logits):
        score = tf.add(tf.matmul(dec_outputs[i], weight_score), bias_score)
        return i + 1, _logits.write(i, score)

    _, logits = tf.while_loop(cond,
                              body,
                              loop_vars=[
                                  0,
                                  tf.TensorArray(tf.float32,
                                                 size=dec_outputs_len,
                                                 clear_after_read=True)
                              ],
                              swap_memory=True)
    labels = tf.transpose(
        tf.concat([
            y_batch,
            tf.reshape([eos_vocab_id] * batch_size, [batch_size, 1])
        ],
                  axis=-1))

    # ----------loss
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits.stack())  # [time,batch]
    apply_penalty = cross_entropy * tf.transpose(
        tf.cast(padding_mask, tf.float32))
    loss = tf.reduce_sum(apply_penalty) / batch_size
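    # padding_mask zeroes the loss on padded target positions, and dividing
    # by batch_size averages the per-sentence loss sums over the batch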

    # ----------optimizer
    global_step = tf.Variable(0, trainable=False, name='global_step')
    params = tf.trainable_variables()
    gradients = tf.gradients(loss, params)  # derivation of loss by params
    max_gradient_norm = 5
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
    starting_rate = 1.0
    decay_epochs = 4  # decay the learning rate every n epochs, after the first n epochs
    decay_step = (training_size //
                  batch_size) * decay_epochs  # num_step_in_single_epoch * n
    learning_rate = tf.train.exponential_decay(learning_rate=starting_rate,
                                               global_step=global_step,
                                               decay_steps=decay_step,
                                               decay_rate=0.1,
                                               staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    optimizer = optimizer.apply_gradients(zip(clipped_gradients, params),
                                          global_step=global_step)

    #################### train ########################
    log_frequency = 100
    model_path = "./checkpoint_v2/model"
    checkpoint_path = "./checkpoint_v2"
    loss_epochs = tf.TensorArray(tf.float32,
                                 size=num_epochs,
                                 dynamic_size=True)
    training_epoch = tf.Variable(0, trainable=False, name='training_epoch')
    saver = tf.train.Saver()
    with tf.Session() as sess:
        try:
            saver.restore(
                sess,
                tf.train.latest_checkpoint(checkpoint_dir=checkpoint_path))
            print('...............Restored from checkpoint_v2')
        except Exception:  # no usable checkpoint; start from scratch
            sess.run(tf.global_variables_initializer())
        start_epoch = sess.run(training_epoch)
        for epoch in range(start_epoch, num_epochs):
            print('Training epoch', epoch + 1)
            start_time = time.time()
            total_loss = 0
            sess.run(train_iter.initializer)
            while True:
                try:
                    _, l, lr, step = sess.run(
                        [optimizer, loss, learning_rate, global_step])
                    total_loss += l
                    if np.isnan(l):  # abort on divergence
                        return False
                    # print('Step {0}: loss={1} lr={2}'.format(step, l, lr))
                    if step % log_frequency == 0:
                        print('Step {0}: loss={1} lr={2}'.format(step, l, lr))
                except tf.errors.OutOfRangeError:
                    avg_loss = total_loss / (training_size // batch_size)
                    loss_epochs = loss_epochs.write(
                        epoch,
                        tf.cast(avg_loss,
                                tf.float32))  # write average loss of epoch
                    sess.run(training_epoch.assign(
                        epoch + 1))  # epoch to resume from if restored
                    path = saver.save(sess, model_path, epoch)
                    print('Average loss=', avg_loss)
                    bleu_score = infer_attention_model_v2.test_model(
                        path, 'tst2012.vi', 'tst2012.en')
                    print('bleu={}'.format(bleu_score * 100))
                    break

            print('Epoch {} train in {} minutes'.format(
                epoch + 1, (time.time() - start_time) / 60.0))
            print('------------------------------------')

        loss_summary = sess.run(loss_epochs.stack())
        loss_epochs.close()
        np.savetxt(checkpoint_path + '/loss_summary.txt',
                   loss_summary,
                   fmt='%10.5f')
        return True
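
train_model uses a staircase schedule that multiplies the learning rate by 0.1 every decay_step steps. A small sketch of the same arithmetic outside the graph; the step counts are illustrative:

# Hypothetical sketch of tf.train.exponential_decay with staircase=True:
# lr = starting_rate * decay_rate ** floor(global_step / decay_steps)
def staircase_lr(step, starting_rate=1.0, decay_rate=0.1, decay_steps=2000):
    return starting_rate * decay_rate ** (step // decay_steps)

for step in (0, 1999, 2000, 4000):
    print(step, staircase_lr(step))  # -> 1.0, 1.0, 0.1, 0.01 (up to float rounding)
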

Example #4
# Note: same imports and module-level helpers as Example #2.
def test_model(model_path, src_file_name, tgt_file_name):
    infer_graph = tf.Graph()
    with infer_graph.as_default():
        data_path = 'data/'  # path of data folder
        embeddingHandler = embedding.Embedding()

        ############### load embedding for source language ###############
        src_input_path = data_path + src_file_name  # path to the source input file fed to the encoder
        src_embedding_output_path = data_path + 'embedding.vi'  # path to the word-embedding file
        src_vocab_path = data_path + 'vocab.vi'  # path to the vocabulary file

        vocab_src, dic_src = embeddingHandler.load_vocab(src_vocab_path)
        sentences_src = embeddingHandler.load_sentences(src_input_path)
        if not os.path.exists(src_embedding_output_path):
            word2vec_src = embeddingHandler.create_embedding(
                sentences_src, vocab_src, src_embedding_output_path)
        else:
            word2vec_src = embeddingHandler.load_embedding(
                src_embedding_output_path)
        embedding_src = embeddingHandler.parse_embedding_to_list_from_vocab(
            word2vec_src, vocab_src)
        embedding_src = tf.constant(embedding_src)

        ################ load embedding for target language ####################
        tgt_input_path = data_path + tgt_file_name
        tgt_embedding_output_path = data_path + 'embedding.en'
        tgt_vocab_path = data_path + 'vocab.en'

        vocab_tgt, dic_tgt = embeddingHandler.load_vocab(tgt_vocab_path)
        sentences_tgt = embeddingHandler.load_sentences(tgt_input_path)
        if not os.path.exists(tgt_embedding_output_path):
            word2vec_tgt = embeddingHandler.create_embedding(
                sentences_tgt, vocab_tgt, tgt_embedding_output_path)
        else:
            word2vec_tgt = embeddingHandler.load_embedding(
                tgt_embedding_output_path)
        embedding_tgt = embeddingHandler.parse_embedding_to_list_from_vocab(
            word2vec_tgt, vocab_tgt)
        embedding_tgt = tf.constant(embedding_tgt)

        word2vec_dim = word2vec_src.vector_size  # dimensionality of each word vector

        ################## create dataset ######################
        batch_size = 64

        # create test set for encoder (source)
        sentences_src_as_ids = embeddingHandler.convert_sentences_to_ids(
            dic_src, sentences_src)
        for sentence in sentences_src_as_ids:  # add <eos>
            sentence.append(eos_vocab_id)
        test_set_src = create_dataset(sentences_src_as_ids)
        test_set_src_len = create_dataset([[len(s)]
                                           for s in sentences_src_as_ids])

        # create test set for decoder (target)
        sentences_tgt_as_ids = embeddingHandler.convert_sentences_to_ids(
            dic_tgt, sentences_tgt)
        # for sentence_as_ids in sentences_tgt_as_ids:  # add </s> id to the end of each sentence of target language
        #     sentence_as_ids.append(eos_vocab_id)
        test_set_tgt = create_dataset(sentences_tgt_as_ids)
        test_set_tgt_len = create_dataset([[len(sentence) + 1]
                                           for sentence in sentences_tgt_as_ids
                                           ])
        # Note: [len(sentence)+1] for later <sos>/<eos>
        test_set_tgt_padding = create_dataset([
            np.ones(len(sentence) + 1, np.float32)
            for sentence in sentences_tgt_as_ids
        ])
        ## padding matrix
        # target_weights = create_dataset([np.ones(len(sentence) + 1) for sentence in sentences_tgt_as_ids])

        # create a dataset that zips the previous test sets
        train_dataset = tf.data.Dataset.zip(
            (test_set_src, test_set_tgt, test_set_src_len, test_set_tgt_len,
             test_set_tgt_padding))
        # train_dataset = train_dataset.shuffle(buffer_size=training_size, seed=9)
        # train_dataset = train_dataset.shuffle(buffer_size=training_size)
        train_dataset = train_dataset.apply(
            tf.contrib.data.padded_batch_and_drop_remainder(
                batch_size, ([None], [None], [1], [1], [None])))
        train_iter = train_dataset.make_initializable_iterator()
        x_batch, y_batch, len_xs, len_ys, padding_mask = train_iter.get_next()
        # Note: len_xs and len_ys have shape [batch_size, 1]
        #################### build graph ##########################
        hidden_size = word2vec_dim  # number of hidden units
        encode_seq_lens = tf.reshape(len_xs, shape=[batch_size])
        # ---------encoder first layer
        enc_1st_outputs, enc_1st_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=tf.nn.rnn_cell.BasicLSTMCell(hidden_size),
            cell_bw=tf.nn.rnn_cell.BasicLSTMCell(hidden_size),
            inputs=tf.nn.embedding_lookup(embedding_src, x_batch),
            sequence_length=encode_seq_lens,
            swap_memory=True,
            time_major=False,
            dtype=tf.float32)  # [batch, time, hid]
        fw_enc_1st_hid_states, bw_enc_1st_hid_states = enc_1st_outputs
        # fw_enc_1st_last_hid, bw_enc_1st_last_hid = enc_1st_states

        # ----------encoder second layer
        num_layers = 2
        # Note: build a distinct cell per layer; passing the same cell object
        # twice would share weights or raise an error in TF >= 1.2
        stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([
            tf.nn.rnn_cell.BasicLSTMCell(hidden_size * 2)
            for _ in range(num_layers)
        ])
        enc_2nd_outputs, enc_2nd_states = tf.nn.dynamic_rnn(
            cell=stacked_lstm,
            inputs=tf.concat([fw_enc_1st_hid_states, bw_enc_1st_hid_states],
                             axis=-1),
            sequence_length=encode_seq_lens,
            dtype=tf.float32,
            swap_memory=True,
            time_major=False)

        # ----------decoder
        encode_output_size = hidden_size * 2
        # decode_seq_lens = tf.reshape(len_ys, shape=[batch_size])
        decode_seq_lens = encode_seq_lens * 2  # maximum decode iterations: twice the source length
        attention_output_size = 256
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            num_units=encode_output_size,
            memory=enc_2nd_outputs,  # require [batch, time, ...]
            memory_sequence_length=encode_seq_lens,
            dtype=tf.float32)
        attention_cell = tf.nn.rnn_cell.BasicLSTMCell(
            num_units=encode_output_size)
        attention_cell = tf.contrib.seq2seq.AttentionWrapper(
            attention_cell,
            attention_mechanism,
            attention_layer_size=attention_output_size)
        # add_sos = tf.concat([tf.reshape([sos_vocab_id]*batch_size, [batch_size, 1]), y_batch], axis=-1)
        decoder_initial_state = attention_cell.zero_state(
            dtype=tf.float32, batch_size=batch_size)
        decoder_initial_state = decoder_initial_state.clone(
            cell_state=enc_2nd_states[-1])

        # projection
        tgt_vocab_size = len(vocab_tgt)
        weight_score = tf.Variable(
            tf.random_uniform(shape=[attention_output_size, tgt_vocab_size],
                              minval=-0.1,
                              maxval=0.1))
        bias_score = tf.Variable(tf.zeros([batch_size, tgt_vocab_size]))

        # infer
        def loop_fn(time, cell_output, cell_state, loop_state):
            elements_finished = time >= decode_seq_lens  # finish by sentence length
            if cell_output is None:  # time = 0
                next_cell_state = decoder_initial_state
                next_input = tf.nn.embedding_lookup(
                    embedding_tgt, [sos_vocab_id] * batch_size)
                emit_output = tf.constant(
                    0)  # https://github.com/hanxiao/hanxiao.github.io/issues/8
            else:
                next_cell_state = cell_state
                score = tf.add(tf.matmul(cell_output, weight_score),
                               bias_score)
                softmax = tf.nn.softmax(score)
                predict = tf.argmax(softmax, axis=-1, output_type=tf.int32)
                elements_finished = tf.logical_or(
                    elements_finished,
                    tf.equal(predict,
                             eos_vocab_id))  # or finish by generated <eos>
                finished = tf.reduce_all(elements_finished)
                next_input = tf.cond(
                    finished, lambda: tf.nn.embedding_lookup(
                        embedding_tgt, [eos_vocab_id] * batch_size),
                    lambda: tf.nn.embedding_lookup(embedding_tgt, predict))
                emit_output = predict

            next_loop_state = None
            return elements_finished, next_input, next_cell_state, emit_output, next_loop_state

        outputs_ta, final_state, _ = tf.nn.raw_rnn(attention_cell, loop_fn)
        outputs = outputs_ta.stack()  # [time, batch]
        outputs = tf.transpose(outputs)  # reshape to [batch, time]

        #################### infer ########################
        saver = tf.train.Saver()
        with tf.Session() as sess:
            # saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir=checkpoint_path))
            saver.restore(sess, model_path)
            sess.run(train_iter.initializer)
            references = []
            # Note: references must be 3-D to pass into the compute_bleu function:
            # the first dimension is batch size, the second is the number of
            # references per translation, the third is sentence length (may vary)
            translation = []
            while True:
                # for i in range(10):
                #     print(i)
                try:
                    predictions, labels = sess.run([outputs, y_batch])
                    # trim trailing <eos>/padding ids so overlapping padding does not inflate the BLEU score
                    predictions = [
                        np.trim_zeros(predict, 'b') for predict in predictions
                    ]
                    labels = [np.trim_zeros(lb, 'b') for lb in labels]
                    # # convert ids to words
                    # predictions = [embeddingHandler.ids_to_words(predict, vocab_tgt) for predict in predictions]
                    # labels = [embeddingHandler.ids_to_words(lb, vocab_tgt) for lb in labels]
                    references.extend(labels)
                    translation.extend(predictions)
                except tf.errors.OutOfRangeError:
                    break

            # compute bleu score
            reshaped_references = [[ref] for ref in references]
            bleu_score, *_ = bleu.compute_bleu(reshaped_references,
                                               translation,
                                               max_order=4,
                                               smooth=False)
            return bleu_score
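
Both test_model variants trim trailing pad ids before scoring, so overlapping padding cannot inflate n-gram counts, and they nest each reference in a singleton list because compute_bleu expects a list of references per translation. A minimal sketch of that preparation with toy ids, not project data:

import numpy as np

# Hypothetical example of the BLEU input preparation used above.
predictions = [np.array([5, 9, 2, 0, 0]), np.array([7, 3, 0])]
labels = [np.array([5, 9, 0, 0, 0]), np.array([7, 3, 0])]
predictions = [np.trim_zeros(p, 'b') for p in predictions]  # drop trailing pad ids
references = [[np.trim_zeros(lb, 'b')] for lb in labels]    # [batch][refs][tokens]
print(predictions)  # [array([5, 9, 2]), array([7, 3])]
print(references)   # [[array([5, 9])], [array([7, 3])]]
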