def train():
    source_data_path = "D:\\nlp语料\\各类中文对话语料\\qingyun-11w\\sources.txt"
    target_data_path = "D:\\nlp语料\\各类中文对话语料\\qingyun-11w\\targets.txt"
    src_encoding = "utf-8"
    tgt_encoding = "utf-8"
    source_split_char = " "
    target_split_char = " "
    model_path = "../modelFile/nmt/attention/model.ckpt"
    batch_size = 80
    epochs = 40
    smwf = 2  # minimum source word frequency
    tmwf = 2  # minimum target word frequency
    dataInfoObj, gen = load_data(source_data_path,
                                 target_data_path,
                                 source_split_char,
                                 target_split_char,
                                 source_minimum_word_frequency=smwf,
                                 target_minimum_word_frequency=tmwf,
                                 batch_size=batch_size,
                                 epochs=epochs,
                                 source_encoding=src_encoding,
                                 target_encoding=tgt_encoding)
    # Save dataset metadata
    f = open("../modelFile/nmt/attention/model.dataInfoObj", "wb")
    pickle.dump(dataInfoObj, f)
    f.close()
    src_embedding_size = 200
    tgt_embedding_size = 200
    is_encoder_bidirectional = True
    rnn_layer_size = 4
    rnn_num_units = 256
    cell_type = "LSTM"
    lr = 0.001
    decoding_method = "beamSearch"
    attention_mechanism = "normed_bahdanau"
    model = AttentionSeq2SeqModel(
        src_vocab_size=dataInfoObj.source_vocab_size,
        tgt_time_step=dataInfoObj.target_max_len,
        tgt_vocab_size=dataInfoObj.target_vocab_size,
        start_token_id=dataInfoObj.target_token_2_id['<s>'],
        end_toekn_id=dataInfoObj.target_token_2_id['</s>'],
        attention_mechanism=attention_mechanism,
        batch_size=batch_size)
    model.train(model_path,
                gen,
                src_embedding_size,
                tgt_embedding_size,
                is_encoder_bidirectional,
                rnn_layer_size,
                rnn_num_units,
                cell_type,
                lr,
                decoding_method=decoding_method,
                beam_width=10)
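The attention_mechanism string above ("normed_bahdanau"; "scaled_luong" appears in the next example) follows the naming used by the TensorFlow NMT tutorial. As a rough illustration of what AttentionSeq2SeqModel presumably does with it internally, here is a minimal TF 1.x sketch; the helper name build_attention is hypothetical:

import tensorflow as tf

def build_attention(name, num_units, memory):
    # Hypothetical helper: maps the string options used in these examples
    # to tf.contrib.seq2seq attention constructors (TF 1.x).
    if name == "normed_bahdanau":
        # Bahdanau (additive) attention with weight normalization
        return tf.contrib.seq2seq.BahdanauAttention(num_units, memory, normalize=True)
    if name == "scaled_luong":
        # Luong (multiplicative) attention with a learned scale factor
        return tf.contrib.seq2seq.LuongAttention(num_units, memory, scale=True)
    raise ValueError("unknown attention mechanism: %s" % name)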
Example 2
def train():
    source_data_path = "../data/letters_source2.txt"
    target_data_path = "../data/letters_target2.txt"
    #model_path = "../modelFile/testAttentionSeq2Seq/model_beam_search.ckpt"
    model_path = "../modelFile/testAttentionSeq2Seq/model_greedy.ckpt"
    batch_size = 128
    epochs = 20
    dataInfoObj, gen = load_data(source_data_path,
                                 target_data_path,
                                 None,
                                 None,
                                 source_minimum_word_frequency=1,
                                 target_minimum_word_frequency=1,
                                 batch_size=batch_size,
                                 epochs=epochs)
    # Save dataset metadata
    f = open("../modelFile/testAttentionSeq2Seq/model.dataInfoObj","wb")
    pickle.dump(dataInfoObj,f)
    f.close()
    # Hyperparameters
    src_embedding_size = 15
    tgt_embedding_size = 15
    '''
    Whether the encoder is bidirectional.
    Note: with a bidirectional encoder, the encoder RNN's num_units becomes
    half of the decoder's, so that the shapes of encoder_states line up with
    the input the decoder expects (see the sketch after this example).
    '''
    is_encoder_bidirectional = True
    rnn_layer_size = 2
    rnn_num_units = 128
    cell_type = "LSTM"
    lr = 0.001
    decoding_method = "greedy"
    attention_mechanism = "scaled_luong"
    # Train
    model = AttentionSeq2SeqModel(
        src_vocab_size=dataInfoObj.source_vocab_size,
        tgt_time_step=dataInfoObj.target_max_len,
        tgt_vocab_size=dataInfoObj.target_vocab_size,
        start_token_id=dataInfoObj.target_token_2_id['<s>'],
        end_toekn_id=dataInfoObj.target_token_2_id['</s>'],
        attention_mechanism=attention_mechanism,
        batch_size=batch_size)
    model.train(model_path,
                gen,
                src_embedding_size,
                tgt_embedding_size,
                is_encoder_bidirectional,
                rnn_layer_size,
                rnn_num_units,
                cell_type,
                lr,
                decoding_method=decoding_method,
                beam_width=10)
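To make the bidirectional note above concrete: each direction gets half of num_units, and concatenating the forward and backward outputs restores the width the decoder expects. A minimal, self-contained TF 1.x sketch (the embedding size of 15 matches this example; everything else is illustrative):

import tensorflow as tf

num_units = 128        # decoder width, as in the example above
embedding_size = 15
inputs = tf.placeholder(tf.float32, [None, None, embedding_size])

# Each direction gets num_units // 2 so that the concatenated
# forward/backward outputs again have width num_units.
cell_fw = tf.nn.rnn_cell.LSTMCell(num_units // 2)
cell_bw = tf.nn.rnn_cell.LSTMCell(num_units // 2)
(out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
    cell_fw, cell_bw, inputs, dtype=tf.float32)
encoder_outputs = tf.concat([out_fw, out_bw], axis=-1)  # [batch, time, num_units]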
Example 3
def train():
    source_data_path = "D:\\nlp语料\\各类中文对话语料\\qingyun-11w\\sources.txt"
    target_data_path = "D:\\nlp语料\\各类中文对话语料\\qingyun-11w\\targets.txt"
    model_path = "../modelFile/chatbot/model_basic/model.ckpt"
    src_encoding = "utf-8"
    tgt_encoding = "utf-8"
    source_split_char = " "
    target_split_char = " "
    smwf = 2  # minimum source word frequency
    tmwf = 2  # minimum target word frequency
    batch_size = 50
    epochs = 40
    dataInfoObj, gen = load_data(source_data_path,
                                 target_data_path,
                                 source_split_char,
                                 target_split_char,
                                 source_minimum_word_frequency=smwf,
                                 target_minimum_word_frequency=tmwf,
                                 batch_size=batch_size,
                                 epochs=epochs,
                                 source_encoding=src_encoding,
                                 target_encoding=tgt_encoding)
    # Save dataset metadata
    f = open("../modelFile/chatbot/model_basic/model.dataInfoObj", "wb")
    pickle.dump(dataInfoObj, f)
    f.close()
    # Hyperparameters
    src_embedding_size = 200
    tgt_embedding_size = 200
    '''
    Whether the encoder is bidirectional.
    Note: with a bidirectional encoder, the encoder RNN's num_units becomes
    half of the decoder's, so that the shapes of encoder_states line up with
    the input the decoder expects (see the sketch after Example 2 above).
    '''
    is_encoder_bidirectional = True
    rnn_layer_size = 4
    rnn_num_units = 256
    cell_type = "LSTM"
    lr = 0.001
    decoding_method = "beamSearch"
    # Train
    model = BasicSeq2SeqModel(
        src_vocab_size=dataInfoObj.source_vocab_size,
        tgt_time_step=dataInfoObj.target_max_len,
        tgt_vocab_size=dataInfoObj.target_vocab_size,
        start_token_id=dataInfoObj.target_token_2_id['<s>'],
        end_toekn_id=dataInfoObj.target_token_2_id['</s>'])
    model.train(model_path,
                gen,
                src_embedding_size,
                tgt_embedding_size,
                is_encoder_bidirectional,
                rnn_layer_size,
                rnn_num_units,
                cell_type,
                lr,
                decoding_method=decoding_method,
                beam_width=10)
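Examples 1 and 3 both pass decoding_method = "beamSearch" with beam_width=10. The model classes hide the decoding graph, but a self-contained TF 1.x sketch of beam-search decoding could look like the following; all sizes and the zero encoder state are stand-ins rather than values taken from this repository:

import tensorflow as tf

batch_size, num_units, vocab_size = 50, 256, 10000
embedding_size, beam_width, max_len = 200, 10, 20
start_token_id, end_token_id = 1, 2

embedding = tf.get_variable("tgt_embedding", [vocab_size, embedding_size])
decoder_cell = tf.nn.rnn_cell.LSTMCell(num_units)
# Stand-in for the real encoder's final state:
encoder_state = decoder_cell.zero_state(batch_size, tf.float32)
output_layer = tf.layers.Dense(vocab_size)

# Each beam needs its own copy of the initial state.
tiled_state = tf.contrib.seq2seq.tile_batch(encoder_state, multiplier=beam_width)
decoder = tf.contrib.seq2seq.BeamSearchDecoder(
    cell=decoder_cell,
    embedding=embedding,
    start_tokens=tf.fill([batch_size], start_token_id),
    end_token=end_token_id,
    initial_state=tiled_state,
    beam_width=beam_width,
    output_layer=output_layer)
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
    decoder, maximum_iterations=max_len)
predicted_ids = outputs.predicted_ids  # [batch, time, beam_width]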
Example 4
import tensorflow as tf
from tensorflow.python.keras.layers.core import Dense
from utils.dataPreprocessing import load_data
import numpy as np
import pickle
'''
The data is loaded directly via utils.dataPreprocessing.
'''
source_data_path = "../data/letters_source2.txt"
target_data_path = "../data/letters_target2.txt"
batch_size = 128
epochs = 60
dataInfoObj, gen = load_data(source_data_path,
                             target_data_path,
                             None,
                             None,
                             source_minimum_word_frequency=1,
                             target_minimum_word_frequency=1,
                             batch_size=batch_size,
                             epochs=epochs)
# Save dataset metadata
f = open("./model.dataInfoObj", "wb")
pickle.dump(dataInfoObj, f)
f.close()
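# (Sketch) At inference time the same object can be restored with:
#     with open("./model.dataInfoObj", "rb") as fh:
#         dataInfoObj = pickle.load(fh)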
# Pull frequently used attributes off dataInfoObj
num_batch = dataInfoObj.num_samples // batch_size
source_max_len = dataInfoObj.source_max_len
source_token_list = dataInfoObj.source_token_list
source_token_2_id = dataInfoObj.source_token_2_id
target_max_len = dataInfoObj.target_max_len
target_token_list = dataInfoObj.target_token_list
target_token_2_id = dataInfoObj.target_token_2_id
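With these attributes extracted, turning a raw source string into a fixed-length id sequence takes only a few lines. A small sketch; the pad and unk token names are assumptions, since the actual vocabulary specials are defined in utils.dataPreprocessing:

def source_to_ids(sentence, pad_token="<pad>", unk_token="<unk>"):
    # Hypothetical helper; the real pad/unk token names depend on
    # utils.dataPreprocessing, so '<pad>'/'<unk>' are assumptions here.
    tokens = list(sentence)  # split_char=None above suggests char-level input
    unk_id = source_token_2_id.get(unk_token, 0)
    ids = [source_token_2_id.get(t, unk_id) for t in tokens]
    pad_id = source_token_2_id.get(pad_token, 0)
    # Pad or truncate to the fixed encoder length.
    return (ids + [pad_id] * source_max_len)[:source_max_len]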