# Relies on module-level imports (os, logging, utils) and a global `config`
# object holding the command-line flags.
def init():
    path = config.data_path
    config.embedding_file = os.path.join(path, config.embedding_file)
    config.train_file = os.path.join(path, config.train_file)
    config.dev_file = os.path.join(path, config.dev_file)
    config.test_file = os.path.join(path, config.test_file)

    dim = utils.get_dim(config.embedding_file)
    config.embedding_size = dim

    # Configure logging
    if config.log_file is None:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(message)s', datefmt='%m-%d %H:%M')
    else:
        logging.basicConfig(filename=config.log_file, filemode='w',
                            level=logging.DEBUG,
                            format='%(asctime)s %(message)s', datefmt='%m-%d %H:%M')

    # Load data
    logging.info('-' * 50)
    logging.info('Load data files..')
    if config.debug:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(config.train_file, 1000)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(config.dev_file, 100)
    else:
        logging.info('*' * 10 + ' Train')
        train_examples = utils.load_data(config.train_file)
        logging.info('*' * 10 + ' Dev')
        dev_examples = utils.load_data(config.dev_file)

    config.num_train = len(train_examples[0])
    config.num_dev = len(dev_examples[0])

    # Build dictionary
    logging.info('-' * 50)
    logging.info('Build dictionary..')
    word_dict = utils.build_dict(train_examples[0] + train_examples[1])
    entity_markers = list(set(
        [w for w in word_dict.keys() if w.startswith('@entity')] + train_examples[2]
    ))
    entity_markers = ['<unk_entity>'] + entity_markers
    entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
    logging.info('Entity markers: %d' % len(entity_dict))
    config.num_labels = len(entity_dict)

    logging.info('-' * 50)
    logging.info('Load embedding file..')
    embeddings = utils.gen_embeddings(word_dict, config.embedding_size,
                                      config.embedding_file)
    (config.vocab_size, config.embedding_size) = embeddings.shape

    # Log parameters
    flags = config.__dict__['__flags']
    flag_str = "\n"
    for k in flags:
        flag_str += "\t%s:\t%s\n" % (k, flags[k])
    logging.info(flag_str)

    # Vectorize dev data (also serves as the eval set in test_only mode)
    logging.info('-' * 50)
    logging.info('Vectorize dev data..')
    # d: document, q: question, a: answer
    # l: whether the entity label occurs in the document
    dev_d, dev_q, dev_l, dev_a = utils.vectorize(dev_examples, word_dict, entity_dict)
    assert len(dev_d) == config.num_dev
    all_dev = utils.gen_examples(dev_d, dev_q, dev_l, dev_a, config.batch_size)
    if config.test_only:
        return embeddings, all_dev, None

    # Vectorize training data
    logging.info('-' * 50)
    logging.info('Vectorize training data..')
    train_d, train_q, train_l, train_a = utils.vectorize(train_examples, word_dict, entity_dict)
    assert len(train_d) == config.num_train
    all_train = utils.gen_examples(train_d, train_q, train_l, train_a, config.batch_size)
    return embeddings, all_dev, all_train
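
# A minimal sketch of what a `utils.build_dict` helper such as the one used in
# init() could look like (hypothetical: the real helper may cap the vocabulary
# differently or reserve other special indices). It counts word frequencies
# over whitespace-tokenized strings and maps each word to an integer id,
# most frequent first, reserving index 0 for <unk>.
from collections import Counter

def build_dict_sketch(sentences, max_words=50000):
    word_count = Counter()
    for sent in sentences:
        for w in sent.split(' '):
            word_count[w] += 1
    ls = word_count.most_common(max_words)
    # index 0 is reserved for <unk>, so real words start at 1
    return {w: index + 1 for (index, (w, _)) in enumerate(ls)}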
# Relies on module-level imports (os, pickle, numpy as np, torch,
# torch.optim as optim, torch.autograd.Variable, tqdm, utils) and the
# EncoderDecoderModel class.
def main(args):
    # 1. Load the data: parallel English / Chinese sentence pairs
    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    # store the dataset sizes on args
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    # 2. Build the vocabularies (word -> index), cached on disk
    if os.path.isfile(args.vocab_file):
        en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(
            open(args.vocab_file, "rb"))
    else:
        en_dict, en_total_words = utils.build_dict(train_en)
        cn_dict, cn_total_words = utils.build_dict(train_cn)
        pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words],
                    open(args.vocab_file, "wb"))
    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words

    # inverted dictionaries: index -> word
    inv_en_dict = {v: k for k, v in en_dict.items()}
    inv_cn_dict = {v: k for k, v in cn_dict.items()}

    # encode the sentences: word -> index
    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)

    # convert to numpy tensors, grouped into mini-batches
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

    # initialize the model, or reload a previously saved one
    if os.path.isfile(args.model_file):
        model = torch.load(args.model_file)
    elif args.model == "EncoderDecoderModel":
        model = EncoderDecoderModel(args)
    else:
        raise ValueError("unknown model: %s" % args.model)
    if args.use_cuda:
        model = model.cuda()

    # masked cross-entropy loss
    crit = utils.LanguageModelCriterion()

    # baseline evaluation on the dev set before training
    print("start evaluating on dev...")
    correct_count, loss, num_words = eval(model, dev_data, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("dev loss %s" % (loss))
    print("dev accuracy %f" % (acc))
    print("dev total number of words %f" % (num_words))
    best_acc = acc

    # learning rate and optimizer
    learning_rate = args.learning_rate
    optimizer = getattr(optim, args.optimizer)(model.parameters(), lr=learning_rate)

    total_num_sentences = 0.
    total_time = 0.
    for epoch in range(args.num_epoches):
        np.random.shuffle(train_data)
        total_train_loss = 0.
        total_num_words = 0.
        # iterate over mini-batches of the training data
        for idx, (mb_x, mb_x_mask, mb_y, mb_y_mask) in tqdm(enumerate(train_data)):
            batch_size = mb_x.shape[0]
            total_num_sentences += batch_size

            # wrap the numpy arrays as torch Variables
            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
            # fresh LSTM hidden state for this batch
            hidden = model.init_hidden(batch_size)
            # decoder input: the target sentence without its last token
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            # decoder target: the target sentence shifted left by one token
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:]))

            if args.use_cuda:
                mb_x = mb_x.cuda()
                mb_x_mask = mb_x_mask.cuda()
                mb_input = mb_input.cuda()
                mb_out = mb_out.cuda()
                mb_out_mask = mb_out_mask.cuda()

            # forward pass
            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)

            # masked cross-entropy between predictions and targets
            loss = crit(mb_pred, mb_out, mb_out_mask)
            num_words = torch.sum(mb_out_mask).data[0]
            total_train_loss += loss.data[0] * num_words
            total_num_words += num_words

            # update: clear old gradients, backprop, then take an optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("training loss: %f" % (total_train_loss / total_num_words))

        # periodic evaluation on the dev set
        if (epoch + 1) % args.eval_epoch == 0:
            print("start evaluating on dev...")
            correct_count, loss, num_words = eval(model, dev_data, args, crit)
            loss = loss / num_words
            acc = correct_count / num_words
            print("dev loss %s" % (loss))
            print("dev accuracy %f" % (acc))
            print("dev total number of words %f" % (num_words))

            # keep the model with the best dev accuracy;
            # otherwise halve the learning rate and rebuild the optimizer
            if acc >= best_acc:
                torch.save(model, args.model_file)
                best_acc = acc
                print("model saved...")
            else:
                learning_rate *= 0.5
                optimizer = getattr(optim, args.optimizer)(model.parameters(),
                                                           lr=learning_rate)
            print("best dev accuracy: %f" % best_acc)
            print("#" * 60)

    # final evaluation on the test set
    test_en, test_cn = utils.load_data(args.test_file)
    args.num_test = len(test_en)
    test_en, test_cn = utils.encode(test_en, test_cn, en_dict, cn_dict)
    test_data = utils.gen_examples(test_en, test_cn, args.batch_size)

    correct_count, loss, num_words = eval(model, test_data, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("test loss %s" % (loss))
    print("test accuracy %f" % (acc))
    print("test total number of words %f" % (num_words))

    # evaluation on the training set
    correct_count, loss, num_words = eval(model, train_data, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("train loss %s" % (loss))
    print("train accuracy %f" % (acc))
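
# A minimal sketch of a masked cross-entropy criterion consistent with how
# `crit = utils.LanguageModelCriterion()` is used above (hypothetical: it
# assumes the model returns per-token log-probabilities of shape
# (batch, seq_len, vocab_size); the project's actual class may differ).
import torch
import torch.nn as nn

class MaskedCrossEntropy(nn.Module):
    def forward(self, log_probs, target, mask):
        # flatten to (batch * seq_len, vocab_size) and (batch * seq_len, 1)
        log_probs = log_probs.contiguous().view(-1, log_probs.size(2))
        target = target.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1).float()
        # negative log-likelihood of each gold token, zeroed at padding
        nll = -log_probs.gather(1, target) * mask
        # average over the real (non-padded) tokens, matching the
        # `loss * num_words` bookkeeping in the training loop
        return torch.sum(nll) / torch.sum(mask)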