def train(config):
    """ model training """
    config.vocab_size = len(open(config.vocab_path).readlines())
    bow_loss, kl_loss, nll_loss, final_loss = knowledge_seq2seq(config)

    bow_loss.persistable = True
    kl_loss.persistable = True
    nll_loss.persistable = True
    final_loss.persistable = True

    main_program = fluid.default_main_program()
    inference_program = fluid.default_main_program().clone(for_test=True)

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=config.grad_clip))

    optimizer = fluid.optimizer.Adam(learning_rate=config.lr)

    if config.stage == 0:
        print("stage 0")
        optimizer.minimize(bow_loss)
    else:
        print("stage 1")
        optimizer.minimize(final_loss)

    opt_var_name_list = optimizer.get_opti_var_name_list()

    if config.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    exe = Executor(place)
    exe.run(framework.default_startup_program())

    param_list = main_program.block(0).all_parameters()
    param_name_list = [p.name for p in param_list]

    init_model(config, param_name_list, place)

    processors = KnowledgeCorpus(
        data_dir=config.data_dir,
        data_prefix=config.data_prefix,
        vocab_path=config.vocab_path,
        min_len=config.min_len,
        max_len=config.max_len)
    train_generator = processors.data_generator(
        batch_size=config.batch_size,
        phase="train",
        shuffle=True)
    valid_generator = processors.data_generator(
        batch_size=config.batch_size,
        phase="dev",
        shuffle=False)

    model_handle = [exe, place, bow_loss, kl_loss, nll_loss, final_loss]

    train_loop(config,
               train_generator, valid_generator,
               main_program, inference_program,
               model_handle, param_name_list, opt_var_name_list)
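# A minimal sketch of how train() might be driven; this is an illustrative
# assumption, not the project's actual entry point. It reuses model_config(),
# which the load() and main() snippets below also use to build their config.
if __name__ == "__main__":
    config = model_config()
    # stage 0 pretrains with the bag-of-words loss only;
    # any other stage optimizes the full final_loss
    train(config)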
def __init__(self):
    """
    Init whatever you need here
    """
    vocab_file = 'data/vocab.txt'
    with codecs.open(vocab_file, 'r', 'utf-8') as f:
        vocab = [i.strip() for i in f.readlines() if len(i.strip()) != 0]
    self.vocab = vocab
    self.freqs = dict(zip(self.vocab[::-1], range(len(self.vocab))))

    # Our code is as follows
    config = Config()
    torch.cuda.set_device(device=config.gpu)
    self.config = config

    # Data definition
    self.corpus = KnowledgeCorpus(data_dir=config.data_dir, data_prefix=config.data_prefix,
                                  min_freq=0, max_vocab_size=config.max_vocab_size,
                                  vocab_file=config.vocab_file,
                                  min_len=config.min_len, max_len=config.max_len,
                                  embed_file=config.embed_file, share_vocab=config.share_vocab)
    # Model definition
    self.model = Seq2Seq(src_vocab_size=self.corpus.SRC.vocab_size,
                         tgt_vocab_size=self.corpus.TGT.vocab_size,
                         embed_size=config.embed_size, hidden_size=config.hidden_size,
                         padding_idx=self.corpus.padding_idx,
                         num_layers=config.num_layers, bidirectional=config.bidirectional,
                         attn_mode=config.attn, with_bridge=config.with_bridge,
                         tie_embedding=config.tie_embedding, dropout=config.dropout,
                         use_gpu=config.use_gpu)
    print(self.model)
    self.model.load(config.ckpt)

    # Generator definition
    self.generator = TopKGenerator(model=self.model,
                                   src_field=self.corpus.SRC, tgt_field=self.corpus.TGT,
                                   cue_field=self.corpus.CUE, beam_size=config.beam_size,
                                   max_length=config.max_dec_len, ignore_unk=config.ignore_unk,
                                   length_average=config.length_average, use_gpu=config.use_gpu)
    self.BOS = self.generator.BOS
    self.EOS = self.generator.EOS
    self.stoi = self.corpus.SRC.stoi
    self.itos = self.corpus.SRC.itos
def load():
    """ load model for predict """
    config = model_config()
    config.vocab_size = len(open(config.vocab_path).readlines())
    final_score, final_ids, final_index = knowledge_seq2seq(config)

    final_score.persistable = True
    final_ids.persistable = True
    final_index.persistable = True

    main_program = fluid.default_main_program()

    if config.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    exe = Executor(place)
    exe.run(framework.default_startup_program())

    fluid.io.load_params(executor=exe, dirname=config.model_path,
                         main_program=main_program)

    processors = KnowledgeCorpus(
        data_dir=config.data_dir,
        data_prefix=config.data_prefix,
        vocab_path=config.vocab_path,
        min_len=config.min_len,
        max_len=config.max_len)

    # load dict
    id_dict_array = load_id2str_dict(config.vocab_path)

    model_handle = [exe, place, final_score, final_ids, final_index,
                    processors, id_dict_array]
    return model_handle
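# Hedged usage sketch: the handle returned by load() is intended to be passed to
# predict() (shown, truncated, at the end of this section) together with raw input
# text. The example string is made up for illustration.
model_handle = load()
response = predict(model_handle, "example input text")
print(response)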
def main():
    config = model_config()
    if config.check:
        config.save_dir = "./tmp/"
    config.use_gpu = torch.cuda.is_available() and config.gpu >= 0
    device = config.gpu
    torch.cuda.set_device(device)

    # Data definition
    corpus = KnowledgeCorpus(data_dir=config.data_dir, data_prefix=config.data_prefix,
                             min_freq=3, max_vocab_size=config.max_vocab_size,
                             min_len=config.min_len, max_len=config.max_len,
                             embed_file=config.embed_file, with_label=config.with_label,
                             share_vocab=config.share_vocab)
    corpus.load()
    if config.test and config.ckpt:
        corpus.reload(data_type='test')
    train_iter = corpus.create_batches(
        config.batch_size, "train", shuffle=True, device=device)
    valid_iter = corpus.create_batches(
        config.batch_size, "valid", shuffle=False, device=device)
    test_iter = corpus.create_batches(
        config.batch_size, "test", shuffle=False, device=device)

    # Model definition
    model = KnowledgeSeq2Seq(src_vocab_size=corpus.SRC.vocab_size,
                             tgt_vocab_size=corpus.TGT.vocab_size,
                             embed_size=config.embed_size, hidden_size=config.hidden_size,
                             padding_idx=corpus.padding_idx,
                             num_layers=config.num_layers, bidirectional=config.bidirectional,
                             attn_mode=config.attn, with_bridge=config.with_bridge,
                             tie_embedding=config.tie_embedding, dropout=config.dropout,
                             use_gpu=config.use_gpu,
                             use_bow=config.use_bow, use_dssm=config.use_dssm,
                             use_pg=config.use_pg, use_gs=config.use_gs,
                             pretrain_epoch=config.pretrain_epoch,
                             use_posterior=config.use_posterior,
                             weight_control=config.weight_control,
                             concat=config.decode_concat)
    model_name = model.__class__.__name__

    # Generator definition
    generator = TopKGenerator(model=model,
                              src_field=corpus.SRC, tgt_field=corpus.TGT,
                              cue_field=corpus.CUE, max_length=config.max_dec_len,
                              ignore_unk=config.ignore_unk,
                              length_average=config.length_average,
                              use_gpu=config.use_gpu)

    # Interactive generation testing
    if config.interact and config.ckpt:
        model.load(config.ckpt)
        return generator
    # Testing
    elif config.test and config.ckpt:
        print(model)
        model.load(config.ckpt)
        print("Testing ...")
        metrics, scores = evaluate(model, test_iter)
        print(metrics.report_cum())
        print("Generating ...")
        evaluate_generation(generator, test_iter,
                            save_file=config.gen_file, verbos=True)
    else:
        # Load word embeddings
        if config.use_embed and config.embed_file is not None:
            model.encoder.embedder.load_embeddings(
                corpus.SRC.embeddings, scale=0.03)
            model.decoder.embedder.load_embeddings(
                corpus.TGT.embeddings, scale=0.03)

        # Optimizer definition
        optimizer = getattr(torch.optim, config.optimizer)(
            model.parameters(), lr=config.lr)

        # Learning rate scheduler
        if config.lr_decay is not None and 0 < config.lr_decay < 1.0:
            lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer=optimizer, factor=config.lr_decay,
                patience=1, verbose=True, min_lr=1e-5)
        else:
            lr_scheduler = None

        # Save directory
        date_str, time_str = datetime.now().strftime("%Y%m%d-%H%M%S").split("-")
        result_str = "{}-{}".format(model_name, time_str)
        if not os.path.exists(config.save_dir):
            os.makedirs(config.save_dir)

        # Logger definition
        logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.DEBUG, format="%(message)s")
        fh = logging.FileHandler(os.path.join(config.save_dir, "train.log"))
        logger.addHandler(fh)

        # Save config
        params_file = os.path.join(config.save_dir, "params.json")
        with open(params_file, 'w') as fp:
            json.dump(config.__dict__, fp, indent=4, sort_keys=True)
        print("Saved params to '{}'".format(params_file))
        logger.info(model)

        # Train
        logger.info("Training starts ...")
        trainer = Trainer(model=model, optimizer=optimizer, train_iter=train_iter,
                          valid_iter=valid_iter, logger=logger, generator=generator,
                          valid_metric_name="-loss", num_epochs=config.num_epochs,
                          save_dir=config.save_dir, log_steps=config.log_steps,
                          valid_steps=config.valid_steps, grad_clip=config.grad_clip,
                          lr_scheduler=lr_scheduler, save_summary=False)
        if config.ckpt is not None:
            trainer.load(file_prefix=config.ckpt)
        trainer.train()
        logger.info("Training done!")

        # Test
        logger.info("")
        trainer.load(os.path.join(config.save_dir, "best"))
        logger.info("Testing starts ...")
        metrics, scores = evaluate(model, test_iter)
        logger.info(metrics.report_cum())
        logger.info("Generation starts ...")
        test_gen_file = os.path.join(config.save_dir, "test.result")
        evaluate_generation(generator, test_iter,
                            save_file=test_gen_file, verbos=True)
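# Sketch of the scheduler behaviour configured above: ReduceLROnPlateau lowers the
# learning rate by `factor` once the monitored validation loss has not improved for
# more than `patience` epochs. The tiny model and the hard-coded loss values below
# are placeholders, not the project's Trainer; they only show the stepping convention.
import torch

toy_model = torch.nn.Linear(4, 1)
toy_optimizer = torch.optim.Adam(toy_model.parameters(), lr=2e-3)
toy_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=toy_optimizer, factor=0.5, patience=1, min_lr=1e-5)

for epoch, val_loss in enumerate([1.0, 0.9, 0.91, 0.92, 0.93]):
    # ... one epoch of training would run here ...
    toy_scheduler.step(val_loss)  # the scheduler monitors the validation loss
    print(epoch, toy_optimizer.param_groups[0]["lr"])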
def train(config):
    """ model training """
    config.vocab_size = len(open(config.vocab_path).readlines())

    # Build the network: Bi-GRU utterance encoder + Bi-GRU knowledge encoder + hierarchical GRU decoder
    bow_loss, kl_loss, nll_loss, final_loss = knowledge_seq2seq(config)

    # Persistable variables are not deleted at the end of each iteration
    bow_loss.persistable = True
    kl_loss.persistable = True
    nll_loss.persistable = True
    final_loss.persistable = True

    # Ops and variables created through the fluid.layers API are stored in the default main program
    main_program = fluid.default_main_program()
    inference_program = fluid.default_main_program().clone(for_test=True)

    # Apply gradient clipping to the specified parameters
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=config.grad_clip))

    optimizer = fluid.optimizer.Adam(learning_rate=config.lr)

    if config.stage == 0:
        print("stage 0")
        optimizer.minimize(bow_loss)
    else:
        print("stage 1")
        optimizer.minimize(final_loss)

    # Variables the optimizer uses during training, e.g. the learning rate
    opt_var_name_list = optimizer.get_opti_var_name_list()

    if config.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    exe = Executor(place)
    # Initialization: default_startup_program() returns the default/global startup program
    exe.run(framework.default_startup_program())

    # Block 0 is the outermost block of the program
    param_list = main_program.block(0).all_parameters()
    param_name_list = [p.name for p in param_list]

    # TODO: document what init_model covers
    init_model(config, param_name_list, place)

    processors = KnowledgeCorpus(
        data_dir=config.data_dir,
        data_prefix=config.data_prefix,
        vocab_path=config.vocab_path,
        min_len=config.min_len,
        max_len=config.max_len)

    # train_generator is a yield-based generator function. It:
    #   reads the stream record file;
    #   tokenizes the text and filters examples by max/min length;
    #   converts the src/tgt/cue strings (chat path + knowledge + ":" + history /
    #   response / KG cue) into id sequences using the vocabulary;
    #   pads each batch and returns the padded ids together with the original lengths.
    train_generator = processors.data_generator(
        batch_size=config.batch_size,
        phase="train",
        shuffle=True)
    valid_generator = processors.data_generator(
        batch_size=config.batch_size,
        phase="dev",
        shuffle=False)

    model_handle = [exe, place, bow_loss, kl_loss, nll_loss, final_loss]

    # During training, run validation every config.valid_steps batches (steps)
    # and save the best model so far
    train_loop(config,
               train_generator, valid_generator,
               main_program, inference_program,
               model_handle, param_name_list, opt_var_name_list)
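# A minimal sketch of the padding step described in the comments above (assumed
# behaviour for illustration, not the project's KnowledgeCorpus code): pad every id
# sequence in a batch to the batch maximum and also return the original lengths.
def pad_batch_sketch(batch_ids, pad_id=0):
    lengths = [len(seq) for seq in batch_ids]
    max_len = max(lengths)
    padded = [seq + [pad_id] * (max_len - len(seq)) for seq in batch_ids]
    return padded, lengths

# e.g. pad_batch_sketch([[5, 7, 9], [4, 2]]) -> ([[5, 7, 9], [4, 2, 0]], [3, 2])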
def test(config):
    """ test """
    batch_size = config.batch_size
    config.vocab_size = len(open(config.vocab_path).readlines())
    final_score, final_ids, final_index = knowledge_seq2seq(config)

    final_score.persistable = True
    final_ids.persistable = True
    final_index.persistable = True

    main_program = fluid.default_main_program()

    if config.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    exe = Executor(place)
    exe.run(framework.default_startup_program())

    fluid.io.load_params(executor=exe, dirname=config.model_path,
                         main_program=main_program)
    print("load params finished")

    # test data generator
    processors = KnowledgeCorpus(
        data_dir=config.data_dir,
        data_prefix=config.data_prefix,
        vocab_path=config.vocab_path,
        min_len=config.min_len,
        max_len=config.max_len)
    test_generator = processors.data_generator(
        batch_size=config.batch_size,
        phase="test",
        shuffle=False)

    # load dict
    id_dict_array = load_id2str_dict(config.vocab_path)

    out_file = config.output
    fout = open(out_file, 'w')
    for batch_id, data in enumerate(test_generator()):
        data_feed, sent_num = build_data_feed(data, place, batch_size=batch_size)

        if data_feed is None:
            break

        out = exe.run(feed=data_feed,
                      fetch_list=[final_score.name, final_ids.name, final_index.name])

        batch_score = out[0]
        batch_ids = out[1]
        batch_pre_index = out[2]

        batch_score_arr = np.split(batch_score, batch_size, axis=1)
        batch_ids_arr = np.split(batch_ids, batch_size, axis=1)
        batch_pre_index_arr = np.split(batch_pre_index, batch_size, axis=1)

        index = 0
        for (score, ids, pre_index) in zip(batch_score_arr, batch_ids_arr, batch_pre_index_arr):
            trace_ids, trace_score = trace_fianl_result(score, ids, pre_index, topk=1, EOS=3)
            fout.write(id_to_text(trace_ids[0][:-1], id_dict_array))
            fout.write('\n')

            index += 1
            if index >= sent_num:
                break

    fout.close()
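# Plausible equivalents of the vocabulary helpers used in test() (hypothetical
# re-implementations, not the project's own load_id2str_dict / id_to_text): the
# vocab file holds one token per line, the line number is the token id, and
# decoding simply joins the tokens back into a sentence.
def load_id2str_dict_sketch(vocab_path):
    with open(vocab_path, encoding="utf-8") as f:
        return [line.strip() for line in f]

def id_to_text_sketch(ids, id_dict_array):
    return " ".join(id_dict_array[int(i)] for i in ids)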
class Model:
    """
    This is an example model. It reads a predefined dictionary and predicts a fixed distribution.
    For a correct evaluation, each team should implement 3 functions:
    next_word_probability
    gen_response
    """
    def __init__(self):
        """
        Init whatever you need here
        """
        vocab_file = 'data/vocab.txt'
        with codecs.open(vocab_file, 'r', 'utf-8') as f:
            vocab = [i.strip() for i in f.readlines() if len(i.strip()) != 0]
        self.vocab = vocab
        self.freqs = dict(zip(self.vocab[::-1], range(len(self.vocab))))

        # Our code is as follows
        config = Config()
        torch.cuda.set_device(device=config.gpu)
        self.config = config

        # Data definition
        self.corpus = KnowledgeCorpus(data_dir=config.data_dir, data_prefix=config.data_prefix,
                                      min_freq=0, max_vocab_size=config.max_vocab_size,
                                      vocab_file=config.vocab_file,
                                      min_len=config.min_len, max_len=config.max_len,
                                      embed_file=config.embed_file, share_vocab=config.share_vocab)
        # Model definition
        self.model = Seq2Seq(src_vocab_size=self.corpus.SRC.vocab_size,
                             tgt_vocab_size=self.corpus.TGT.vocab_size,
                             embed_size=config.embed_size, hidden_size=config.hidden_size,
                             padding_idx=self.corpus.padding_idx,
                             num_layers=config.num_layers, bidirectional=config.bidirectional,
                             attn_mode=config.attn, with_bridge=config.with_bridge,
                             tie_embedding=config.tie_embedding, dropout=config.dropout,
                             use_gpu=config.use_gpu)
        print(self.model)
        self.model.load(config.ckpt)

        # Generator definition
        self.generator = TopKGenerator(model=self.model,
                                       src_field=self.corpus.SRC, tgt_field=self.corpus.TGT,
                                       cue_field=self.corpus.CUE, beam_size=config.beam_size,
                                       max_length=config.max_dec_len, ignore_unk=config.ignore_unk,
                                       length_average=config.length_average, use_gpu=config.use_gpu)
        self.BOS = self.generator.BOS
        self.EOS = self.generator.EOS
        self.stoi = self.corpus.SRC.stoi
        self.itos = self.corpus.SRC.itos

    def next_word_probability(self, context, partial_out):
        """
        Return a probability distribution over the next words given a partial true output.
        This is used to calculate the per-word perplexity.

        :param context: dict, context containing the dialogue history and the personal
                        profile of each speaker; it has the following keys:

                        context['dialog']: a list of strings, the dialogue history (tokens in
                                           each utterance are separated with spaces).
                        context['uid']: a list of ints, indices into the speakers' profiles
                        context['profile']: a list of dicts, the personal profile of each speaker
                        context['responder_profile']: dict, the personal profile of the responder

        :param partial_out: list, previous "true" words

        :return: the first element is a dict, where each key is a word and each value is the
                 probability score for that word; unset keys are assumed to have probability zero.
                 The second element is the probability of the EOS token.

        e.g.
        context:
        { "dialog": [ ["How are you ?"], ["I am fine , thank you . And you ?"] ],
          "uid": [0, 1],
          "profile": [
            { "loc": "Beijing", "gender": "male", "tag": "" },
            { "loc": "Shanghai", "gender": "female", "tag": "" } ],
          "responder_profile": { "loc": "Beijing", "gender": "male", "tag": "" }
        }

        partial_out:
        ['I', 'am']

        ==>  {'fine': 0.9}, 0.1
        """
        test_raw = self.read_data(context)
        test_data = self.corpus.build_examples(test_raw, data_type='test')
        dataset = Dataset(test_data)
        data_iter = dataset.create_batches(batch_size=1, shuffle=False, device=self.config.gpu)
        inputs = None
        for batch in data_iter:
            inputs = batch
            break
        partial_out_idx = [
            self.stoi[s] if s in self.stoi.keys() else self.stoi['<unk>']
            for s in partial_out
        ]

        # switch the model to evaluate mode
        self.model.eval()
        with torch.no_grad():
            enc_outputs, dec_init_state = self.model.encode(inputs)
            long_tensor_type = torch.cuda.LongTensor if self.config.use_gpu else torch.LongTensor

            # Initialize the input vector
            input_var = long_tensor_type([self.BOS] * 1)

            # Inflate the initial hidden states to be of size: (1, H)
            dec_state = dec_init_state.inflate(1)

            for t in range(len(partial_out_idx)):
                # Run the RNN one step forward
                output, dec_state, attn = self.model.decode(input_var, dec_state)
                input_var = long_tensor_type([partial_out_idx[t]])

            output, dec_state, attn = self.model.decode(input_var, dec_state)
            log_softmax_output = output.squeeze(1)
            log_softmax_output = log_softmax_output.cpu().numpy()
            prob_output = [math.exp(i) for i in log_softmax_output[0]]

        # The first 4 tokens are: '<pad>' '<unk>' '<bos>' '<eos>'
        freq_dict = {}
        for i in range(4, len(self.itos)):
            freq_dict[self.itos[i]] = prob_output[i]
        eos_prob = prob_output[3]

        return freq_dict, eos_prob

    def gen_response(self, contexts):
        """
        Return a list of responses to each context.

        :param contexts: list, a list of contexts; each context is a dict that contains the
                         dialogue history and the personal profile of each speaker, with the
                         following keys:

                         context['dialog']: a list of strings, the dialogue history (tokens in
                                            each utterance are separated with spaces).
                         context['uid']: a list of ints, indices into the speakers' profiles
                         context['profile']: a list of dicts, the personal profile of each speaker
                         context['responder_profile']: dict, the personal profile of the responder

        :return: list, responses for each context; each response is a list of tokens.

        e.g.
        contexts:
        [{ "dialog": [ ["How are you ?"], ["I am fine , thank you . And you ?"] ],
           "uid": [0, 1],
           "profile": [
             { "loc": "Beijing", "gender": "male", "tag": "" },
             { "loc": "Shanghai", "gender": "female", "tag": "" } ],
           "responder_profile": { "loc": "Beijing", "gender": "male", "tag": "" }
        }]

        ==>  [['I', 'am', 'fine', 'too', '!']]
        """
        test_raw = self.read_data(contexts[0])
        test_data = self.corpus.build_examples(test_raw, data_type='test')
        dataset = Dataset(test_data)
        data_iter = dataset.create_batches(batch_size=1, shuffle=False, device=self.config.gpu)
        results = self.generator.generate(batch_iter=data_iter)
        res = [result.preds[0].split(" ") for result in results]
        return res

    @staticmethod
    def read_data(dialog):
        history = dialog["dialog"]
        uid = [int(i) for i in dialog["uid"]]
        if "responder_profile" in dialog.keys():
            responder_profile = dialog["responder_profile"]
        elif "response_profile" in dialog.keys():
            responder_profile = dialog["response_profile"]
        else:
            raise ValueError("No responder_profile or response_profile!")

        src = ""
        for idx, sent in zip(uid, history):
            sent_content = sent[0]
            src += sent_content
            src += ' '
        src = src.strip()
        tgt = "NULL"

        filter_knowledge = []
        if type(responder_profile["tag"]) is list:
            filter_knowledge.append(' '.join(responder_profile["tag"][0].split(';')))
        else:
            filter_knowledge.append(' '.join(responder_profile["tag"].split(';')))
        filter_knowledge.append(responder_profile["loc"])
        data = [{'src': src, 'tgt': tgt, 'cue': filter_knowledge}]
        return data
def train(config):
    """ model training """
    config.vocab_size = len(open(config.vocab_path).readlines())  # count the lines, i.e. the vocabulary size

    bow_loss, kl_loss, nll_loss, final_loss = knowledge_seq2seq(config)

    bow_loss.persistable = True
    kl_loss.persistable = True
    nll_loss.persistable = True
    final_loss.persistable = True

    main_program = fluid.default_main_program()
    inference_program = fluid.default_main_program().clone(for_test=True)
    # A Program is Paddle Fluid's static description of a computation graph.
    # 1. Program.clone() does not clone data-loading parts such as a DataLoader, so those parts may be missing from the clone.
    # 2. With for_test=True this API prunes some ops and variables. To avoid pruning the wrong ones, call clone(for_test=True)
    #    before append_backward and before running the optimizer.
    # With for_test=True it creates a new Program that contains only the forward pass of the current one;
    # otherwise it creates a new Program identical to the current one.

    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=config.grad_clip))

    optimizer = fluid.optimizer.Adam(learning_rate=config.lr)
    # set_gradient_clip applies gradient clipping to the specified parameters.
    # The API is position-sensitive: it must be called after the network is built and before minimize(),
    # so it may be removed in a future release and is not recommended. Prefer configuring gradient clipping
    # when the optimizer is created. There are three clipping strategies: GradientClipByGlobalNorm,
    # GradientClipByNorm and GradientClipByValue. If clipping is already configured on the optimizer,
    # a later set_gradient_clip call has no effect.
    # Back-propagating through the chain rule is not absolutely safe: gradients can vanish or explode.
    # Gradient clipping is the common remedy for exploding gradients; it keeps gradient values from
    # growing too large during an iteration.

    if config.stage == 0:
        print("stage 0")
        optimizer.minimize(bow_loss)
    else:
        print("stage 1")
        optimizer.minimize(final_loss)  # specify the loss the optimizer minimizes

    opt_var_name_list = optimizer.get_opti_var_name_list()

    if config.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()  # GPU or CPU

    exe = Executor(place)
    exe.run(framework.default_startup_program())  # run the startup (initialization) program

    param_list = main_program.block(0).all_parameters()
    # A Block is the scoping concept from high-level languages: a pair of braces holding local variable
    # definitions and a sequence of instructions or operators. Control-flow structures such as if-else
    # and for have deep-learning equivalents: a Block in Fluid describes a group of Operators executed
    # sequentially, conditionally or in a loop, together with the Tensors those Operators act on.
    param_name_list = [p.name for p in param_list]  # list of parameter names

    init_model(config, param_name_list, place)

    processors = KnowledgeCorpus(
        data_dir=config.data_dir,
        data_prefix=config.data_prefix,
        vocab_path=config.vocab_path,
        min_len=config.min_len,
        max_len=config.max_len)  # knowledge corpus object
    train_generator = processors.data_generator(
        batch_size=config.batch_size,
        phase="train",
        shuffle=True)  # training data generator
    valid_generator = processors.data_generator(
        batch_size=config.batch_size,
        phase="dev",
        shuffle=False)  # validation data generator

    model_handle = [exe, place, bow_loss, kl_loss, nll_loss, final_loss]

    train_loop(config,
               train_generator, valid_generator,
               main_program, inference_program,
               model_handle, param_name_list, opt_var_name_list)
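# The comments above recommend configuring gradient clipping on the optimizer rather
# than via set_gradient_clip. A sketch of that alternative, assuming a Paddle Fluid
# version (>= 1.7) whose optimizers accept a grad_clip argument:
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=config.grad_clip)
optimizer = fluid.optimizer.Adam(learning_rate=config.lr, grad_clip=clip)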
    if config.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    exe = Executor(place)
    exe.run(framework.default_startup_program())  # as in C, this is what actually runs the program; the name exe (executor) says it all

    fluid.io.load_params(executor=exe, dirname=config.model_path,
                         main_program=main_program)
    # Arguments:
    # executor (Executor) – the executor that loads the model parameters (see the execution engine docs).
    # dirname (str) – the directory where the model parameters are stored.
    # main_program (Program, optional) – the Program used to select which parameter variables to load
    #   (see the basic concepts docs). If None, the global default_main_program is used. Default: None.
    # filename (str, optional) – leave as None if the parameters are stored as separate files under dirname;
    #   otherwise filename must name the single file that stores all parameters. Default: None.

    processors = KnowledgeCorpus(
        data_dir=config.data_dir,
        data_prefix=config.data_prefix,
        vocab_path=config.vocab_path,
        min_len=config.min_len,
        max_len=config.max_len)

    # load dict
    id_dict_array = load_id2str_dict(config.vocab_path)

    model_handle = [exe, place, final_score, final_ids, final_index,
                    processors, id_dict_array]
    # return a list containing the components above
    return model_handle


def predict(model_handle, text):
    """ predict for text by model_handle """
    batch_size = 1