Example #1
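
# Shared imports for the examples below (a sketch: inferred from usage, not
# shown in the original snippets). The project-specific helpers --
# write_config, construct_training_data_batches, load_pretrained_embedding,
# load_vocab, src_data -- and the EncoderDecoder class come from the
# surrounding codebase. The code targets TensorFlow 1.x
# (tf.Session, tf.ConfigProto, tf.train.Saver).
import math
import os
import random
import sys

import tensorflow as tf
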
def train(config):
    # --------- configurations --------- #
    batch_size = config['batch_size']
    save_path = config['save']  # path to store model
    saved_model = config['load']  # None or path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # ---------------------------------- #
    write_config(save_path + '/config.txt', config)

    # random seed
    random.seed(config['random_seed'])
    # np.random.seed(config['random_seed'])

    batches, vocab_size, src_word2id, tgt_word2id = construct_training_data_batches(
        config)

    # relies on dict insertion order: assumes word ids are 0..N-1 in vocab order
    tgt_id2word = list(tgt_word2id.keys())

    params = {
        'vocab_src_size': vocab_size['src'],
        'vocab_tgt_size': vocab_size['tgt'],
        'go_id': tgt_word2id['<go>'],
        'eos_id': tgt_word2id['</s>']
    }

    model = EncoderDecoder(config, params)
    model.build_network()

    learning_rate = config['learning_rate']
    decay_rate = config['decay_rate']

    tf_variables = tf.trainable_variables()
    for var in tf_variables:
        print(var)

    # save & restore model
    saver = tf.train.Saver(max_to_keep=1)

    if config['use_gpu']:
        if 'X_SGE_CUDA_DEVICE' in os.environ:
            print('running on the stack...')
            cuda_device = os.environ['X_SGE_CUDA_DEVICE']
            print('X_SGE_CUDA_DEVICE is set to {}'.format(cuda_device))
            os.environ['CUDA_VISIBLE_DEVICES'] = cuda_device

        else:  # development only e.g. air202
            print('running locally...')
            os.environ['CUDA_VISIBLE_DEVICES'] = '1'  # choose the device (GPU) here

        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True  # let GPU memory grow on demand instead of pre-allocating
        sess_config.gpu_options.per_process_gpu_memory_fraction = 0.95  # cap on the fraction of GPU memory this process may claim

    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        sess_config = tf.ConfigProto()

    with tf.Session(config=sess_config) as sess:

        if saved_model is None:
            sess.run(tf.global_variables_initializer())
            # ------------ load pre-trained embeddings ------------ #
            if config['load_embedding_src'] is not None:
                src_embedding = sess.run(model.src_word_embeddings)
                src_embedding_matrix = load_pretrained_embedding(
                    src_word2id, src_embedding, config['load_embedding_src'])
                sess.run(
                    model.src_word_embeddings.assign(src_embedding_matrix))
            if config['load_embedding_tgt'] is not None:
                if config['load_embedding_tgt'] == config['load_embedding_src']:
                    sess.run(
                        model.tgt_word_embeddings.assign(src_embedding_matrix))
                else:
                    tgt_embedding = sess.run(model.tgt_word_embeddings)
                    tgt_embedding_matrix = load_pretrained_embedding(
                        tgt_word2id, tgt_embedding,
                        config['load_embedding_tgt'])
                    sess.run(
                        model.tgt_word_embeddings.assign(tgt_embedding_matrix))
            # ----------------------------------------------------- #
        else:
            new_saver = tf.train.import_meta_graph(saved_model + '.meta')
            new_saver.restore(sess, saved_model)
            print('loaded model...', saved_model)

        # ------------ TensorBoard ------------ #
        # summary_writer = tf.summary.FileWriter(save_path + '/tfboard/', graph_def=sess.graph_def)
        # ------------------------------------- #

        # --- probe sentences, decoded every 500 batches to eyeball output --- #
        my_sentences = [
            'this is test . </s>',
            'this is confirm my reservation at hotel . </s>',
            'playing tennis good for you . </s>',
            'when talking about successful longterm business relationships customer services are important element </s>'
        ]
        my_sent_ids = []
        for my_sentence in my_sentences:
            ids = []
            for word in my_sentence.split():
                if word in src_word2id:
                    ids.append(src_word2id[word])
                else:
                    ids.append(src_word2id['<unk>'])
            my_sent_ids.append(ids)
        my_sent_len = [len(my_sent) for my_sent in my_sent_ids]
        my_sent_ids = [
            ids + [src_word2id['</s>']] *
            (config['max_sentence_length'] - len(ids)) for ids in my_sent_ids
        ]
        infer_dict = {
            model.src_word_ids: my_sent_ids,
            model.src_sentence_lengths: my_sent_len,
            model.dropout: 0.0,
            model.learning_rate: learning_rate
        }
        # ---------------------------------------------------------- #

        num_epochs = config['num_epochs']
        for epoch in range(num_epochs):
            print("num_batches = ", len(batches))

            random.shuffle(batches)

            epoch_loss = 0

            for i, batch in enumerate(batches):

                feed_dict = {
                    model.src_word_ids: batch['src_word_ids'],
                    model.tgt_word_ids: batch['tgt_word_ids'],
                    model.src_sentence_lengths: batch['src_sentence_lengths'],
                    model.tgt_sentence_lengths: batch['tgt_sentence_lengths'],
                    model.dropout: config['dropout'],
                    model.learning_rate: learning_rate
                }

                [_, loss] = sess.run([model.train_op, model.train_loss],
                                     feed_dict=feed_dict)
                epoch_loss += loss

                if i % 100 == 0:
                    # to print out training status

                    # if config['decoding_method'] != 'beamsearch':
                    # [train_loss, infer_loss] = sess.run([model.train_loss, model.infer_loss], feed_dict=feed_dict)
                    # print("batch: {} --- train_loss: {:.5f} | inf_loss: {:.5f}".format(i, train_loss, infer_loss))

                    # else:
                    # --- beam search --- #
                    # [train_loss] = sess.run([model.train_loss], feed_dict=feed_dict)
                    # print("BEAMSEARCH - batch: {} --- train_loss: {:.5f}".format(i, train_loss))

                    print("batch: {} --- avg train loss: {:.5f}".format(
                        i, epoch_loss / (i + 1)))

                    sys.stdout.flush()

                if i % 500 == 0:

                    [my_translations] = sess.run([model.translations],
                                                 feed_dict=infer_dict)
                    # pdb.set_trace()
                    for my_sent in my_translations:
                        my_words = [tgt_id2word[word_id] for word_id in my_sent]
                        print(' '.join(my_words))

            model.increment_counter()
            learning_rate *= decay_rate

            print("---------------------------------------------------")
            print("epoch {} done".format(epoch + 1))
            print("total training loss = {}".format(epoch_loss))
            print("---------------------------------------------------")

            if math.isnan(epoch_loss):
                print("stop training - loss/gradient exploded")
                break

            saver.save(sess, save_path + '/model', global_step=epoch)
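
# Usage sketch (illustrative; values are placeholders): the config keys that
# train() reads directly. construct_training_data_batches() and EncoderDecoder
# consume further keys (data paths, vocab paths, network sizes) not shown here.
example_train_config = {
    'batch_size': 64,
    'save': 'save/v1',             # checkpoint directory, created if missing
    'load': None,                  # None = train from scratch, else path to a saved model
    'random_seed': 25,
    'learning_rate': 0.001,
    'decay_rate': 0.9,             # applied once per epoch: learning_rate *= decay_rate
    'use_gpu': True,
    'load_embedding_src': None,    # optional path to pre-trained source embeddings
    'load_embedding_tgt': None,    # optional path to pre-trained target embeddings
    'max_sentence_length': 32,
    'dropout': 0.2,
    'num_epochs': 10,
}
# train(example_train_config)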
Example #2
def translate(config):
    if 'X_SGE_CUDA_DEVICE' in os.environ:
        print('running on the stack...')
        cuda_device = os.environ['X_SGE_CUDA_DEVICE']
        print('X_SGE_CUDA_DEVICE is set to {}'.format(cuda_device))
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_device

    else:  # development only e.g. air202
        print('running locally...')
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # choose the device (GPU) here

    sess_config = tf.ConfigProto()

    vocab_paths = {
        'vocab_src': config['vocab_src'],
        'vocab_tgt': config['vocab_tgt']
    }
    src_word2id, tgt_word2id = load_vocab(vocab_paths)

    tgt_id2word = list(tgt_word2id.keys())

    params = {
        'vocab_src_size': len(src_word2id),
        'vocab_tgt_size': len(tgt_word2id),
        'go_id': tgt_word2id['<go>'],
        'eos_id': tgt_word2id['</s>']
    }

    # build the model
    model = EncoderDecoder(config, params)
    model.build_network()

    # save & restore model
    saver = tf.train.Saver()
    save_path = config['load']
    model_number = config['model_number'] if config['model_number'] is not None else config['num_epochs'] - 1
    full_save_path_to_model = save_path + '/model-' + str(model_number)

    # ------ PPL Parser for Fluency Score ------ #
    # parser = PplParser()
    # rnnlm_model = "/home/alta/BLTSpeaking/ged-pm574/gec-lm/train-rnnlm/rnnlms/v3/one-billion/RNN_weight.OOS.cuedrnnlm.rnnlm.300.300/train_LM.wgt.iter9"
    # # test_file = "/home/alta/BLTSpeaking/ged-pm574/nmt-exp/tmp/translate_ppl.txt"
    # test_file = config['tgtfile']
    # intermediatefile = "tmp/trans-intermediate.txt"
    # inputwlist = "/home/alta/BLTSpeaking/ged-pm574/gec-lm/train-rnnlm/rnnlms/v3/one-billion/lib/wlists/train.lst.index"
    # outputwlist = "/home/alta/BLTSpeaking/ged-pm574/gec-lm/train-rnnlm/rnnlms/v3/one-billion/lib/wlists/train.lst.index"
    # vocabsize = "64002"
    # parser.make_cmd(rnnlm_model, test_file, inputwlist, outputwlist, vocabsize, intermediatefile)
    # ------------------------------------------ #

    with tf.Session(config=sess_config) as sess:
        # Restore variables from disk.
        saver.restore(sess, full_save_path_to_model)
        # print("Model restored")

        src_sent_ids, src_sent_len = src_data(config['srcfile'], src_word2id,
                                              config['max_sentence_length'])

        num_sentences = len(src_sent_ids)
        batch_size = 1000
        # ceil-divide so a partial final batch is included without adding an empty one
        num_batches = (num_sentences + batch_size - 1) // batch_size

        print('num_batches =', num_batches)

        beam_width = config['beam_width']

        outputs = []

        for i in range(num_batches):

            i_start = batch_size * i
            i_end = min(i_start + batch_size, num_sentences)
            translate_dict = {
                model.src_word_ids: src_sent_ids[i_start:i_end],
                model.src_sentence_lengths: src_sent_len[i_start:i_end],
                model.dropout: 0.0
            }

            predicted_ids = sess.run(model.predicted_ids,
                                     feed_dict=translate_dict)

            for sentence in predicted_ids:
                beam = []
                for k in range(beam_width):
                    translation = sentence[:, k]
                    words = []
                    for word_id in translation:
                        if word_id == params['eos_id']:
                            break

                        words.append(tgt_id2word[word_id])

                    beam.append(words)

                outputs.append(beam)

            print('#', end='')
            sys.stdout.flush()

        print("num outputs: ", len(outputs))
        # for i in range(len(outputs)):
        #     if len(outputs[i]) != 10:
        #         pdb.set_trace()
        # print("no problem!")
        # pdb.set_trace()

        with open(config['tgtfile'], 'w', encoding="utf8") as file:
            for output in outputs:
                for beam in output:
                    x = "<s> " + " ".join(beam[:-1]).upper() + " </s>\n"
                    file.write(x)
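
# Usage sketch (illustrative; values are placeholders): the config keys that
# this translate() reads directly; load_vocab(), src_data() and EncoderDecoder
# may require further keys.
example_translate_config = {
    'vocab_src': 'lib/vocab.src',  # hypothetical vocab paths
    'vocab_tgt': 'lib/vocab.tgt',
    'load': 'save/v1',             # directory containing model-<N> checkpoints
    'model_number': None,          # None = default to checkpoint num_epochs - 1
    'num_epochs': 10,
    'max_sentence_length': 32,
    'beam_width': 10,
    'srcfile': 'data/test.src',    # input, one sentence per line
    'tgtfile': 'data/test.hyp',    # output: beam_width "<s> ... </s>" lines per sentence
}
# translate(example_translate_config)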
Example #3
def adapt(config):
    if 'X_SGE_CUDA_DEVICE' in os.environ:
        print('running on the stack...')
        cuda_device = os.environ['X_SGE_CUDA_DEVICE']
        print('X_SGE_CUDA_DEVICE is set to {}'.format(cuda_device))
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_device

    else:  # development only e.g. air202
        print('running locally...')
        os.environ['CUDA_VISIBLE_DEVICES'] = '3'  # choose the device (GPU) here

    sess_config = tf.ConfigProto()

    batches, vocab_size, src_word2id, tgt_word2id = construct_training_data_batches(
        config)

    tgt_id2word = list(tgt_word2id.keys())

    params = {
        'vocab_src_size': len(src_word2id),
        'vocab_tgt_size': len(tgt_word2id),
        'go_id': tgt_word2id['<go>'],
        'eos_id': tgt_word2id['</s>']
    }

    # build the model
    model = EncoderDecoder(config, params)
    model.build_network()

    # -------- Adaption work -------- #
    bias_name = 'decoder/decode_with_shared_attention/decoder/dense/bias:0'
    weight_name = 'decoder/decode_with_shared_attention/decoder/dense/kernel:0'
    param_names = [bias_name, weight_name]
    # param_names = [var.name for var in tf.trainable_variables()]
    model.adapt_weights(param_names)
    # ------------------------------- #

    new_save_path = config['save']
    if not os.path.exists(new_save_path):
        os.makedirs(new_save_path)
    write_config(new_save_path + '/config.txt', config)

    # save & restore model
    saver = tf.train.Saver(max_to_keep=1)
    save_path = config['load']
    model_number = config['model_number'] if config['model_number'] is not None else config['num_epochs'] - 1
    full_save_path_to_model = save_path + '/model-' + str(model_number)

    with tf.Session(config=sess_config) as sess:
        # Restore variables from disk.
        saver.restore(sess, full_save_path_to_model)

        for epoch in range(10):  # number of adaptation epochs is hard-coded here
            print("num_batches = ", len(batches))

            random.shuffle(batches)

            for i, batch in enumerate(batches):
                feed_dict = {
                    model.src_word_ids: batch['src_word_ids'],
                    model.tgt_word_ids: batch['tgt_word_ids'],
                    model.src_sentence_lengths: batch['src_sentence_lengths'],
                    model.tgt_sentence_lengths: batch['tgt_sentence_lengths'],
                    model.dropout: config['dropout']
                }

                _ = sess.run([model.adapt_op], feed_dict=feed_dict)

                if i % 100 == 0:
                    # to print out training status

                    if config['decoding_method'] != 'beamsearch':
                        [train_loss, infer_loss
                         ] = sess.run([model.train_loss, model.infer_loss],
                                      feed_dict=feed_dict)
                        print(
                            "batch: {} --- train_loss: {:.5f} | inf_loss: {:.5f}"
                            .format(i, train_loss, infer_loss))

                    else:
                        # --- beam search --- #
                        [train_loss] = sess.run([model.train_loss],
                                                feed_dict=feed_dict)
                        print("BEAMSEARCH - batch: {} --- train_loss: {:.5f}".
                              format(i, train_loss))

                    sys.stdout.flush()

            model.increment_counter()
            print("################## EPOCH {} done ##################".format(
                epoch))
            saver.save(sess, new_save_path + '/model', global_step=epoch)
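
# Usage sketch (illustrative; values are placeholders): adapt() restores the
# model from config['load'] and re-trains only the parameters in param_names
# (the decoder's output-projection kernel and bias), saving to config['save'].
example_adapt_config = {
    'save': 'save/v1-adapted',     # where adapted checkpoints and config.txt go
    'load': 'save/v1',             # directory with the base model-<N> checkpoints
    'model_number': None,          # None = default to checkpoint num_epochs - 1
    'num_epochs': 10,
    'dropout': 0.2,
    'decoding_method': 'greedy',   # any value other than 'beamsearch' also reports infer_loss
}
# adapt(example_adapt_config)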
Example #4
def translate(config):
    if 'X_SGE_CUDA_DEVICE' in os.environ:
        print('running on the stack...')
        cuda_device = os.environ['X_SGE_CUDA_DEVICE']
        print('X_SGE_CUDA_DEVICE is set to {}'.format(cuda_device))
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_device

    else: # development only e.g. air202
        print('running locally...')
        os.environ['CUDA_VISIBLE_DEVICES'] = '' # choose the device (GPU) here

    sess_config = tf.ConfigProto()

    vocab_paths = {'vocab_src': config['vocab_src'], 'vocab_tgt': config['vocab_tgt']}
    src_word2id, tgt_word2id = load_vocab(vocab_paths)

    tgt_id2word = list(tgt_word2id.keys())

    params = {'vocab_src_size': len(src_word2id),
            'vocab_tgt_size': len(tgt_word2id),
            'go_id':  tgt_word2id['<go>'],
            'eos_id':  tgt_word2id['</s>']}

    # build the model
    model = EncoderDecoder(config, params)
    model.build_network()

    # save & restore model
    saver = tf.train.Saver()
    save_path = config['load']
    model_number = config['model_number'] if config['model_number'] is not None else config['num_epochs'] - 1
    full_save_path_to_model = save_path + '/model-' + str(model_number)

    with tf.Session(config=sess_config) as sess:
        # Restore variables from disk.
        saver.restore(sess, full_save_path_to_model)
        # print("Model restored")

        src_sent_ids, src_sent_len = src_data(config['srcfile'], src_word2id,
                                            config['max_sentence_length'], config['spellcheck'])

        num_sentences = len(src_sent_ids)
        # batch_size = config['batch_size'] # maybe too small (inefficient) - but should be not too large
        batch_size = 100  # decoding needs far less memory than training, so this batch size is fine
        # ceil-divide so a partial final batch is included without adding an empty one
        num_batches = (num_sentences + batch_size - 1) // batch_size

        tgt_lines = []
        print('num_batches =', num_batches)

        for i in range(num_batches):

            i_start = batch_size * i
            i_end = min(i_start + batch_size, num_sentences)
            translate_dict = {
                model.src_word_ids: src_sent_ids[i_start:i_end],
                model.src_sentence_lengths: src_sent_len[i_start:i_end],
                model.dropout: 0.0
            }

            [translations] = sess.run([model.translations], feed_dict=translate_dict)

            for translation in translations:
                words = []
                for word_id in translation:
                    if word_id == params['eos_id']:
                        break
                    words.append(tgt_id2word[word_id])

                # print(' '.join(words))
                tgt_lines.append(' '.join(words))

            print('#', end='')  # progress marker, one '#' per batch
            sys.stdout.flush()

        with open(config['tgtfile'], 'w') as file:
            for line in tgt_lines:
                file.write(line + '\n')
        print('translation done!')
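
# Note: compared with the translate() in Example #2, this variant fetches
# model.translations (one best hypothesis per sentence) instead of
# model.predicted_ids (all beam_width hypotheses), passes config['spellcheck']
# through to src_data(), and writes plain sentences rather than uppercased
# "<s> ... </s>" n-best lines.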