def evaluate(sess, model, name, data, id_to_tag, logger): """ evaluate F1 based on dev dataset and test dataset """ logger.info("evaluate:{}".format(name)) # name: dev/test ner_results = model.evaluate( sess, data, id_to_tag) # ner_results dimension: eval/test样本数量*每句实际长度 # 每个节点是字符串: 字符 正确标签 预测标签 eval_lines = test_ner(ner_results, FLAGS.result_path) for line in eval_lines: logger.info(line) f1 = float(eval_lines[1].strip().split()[-1]) if name == "dev": best_test_f1 = model.best_dev_f1.eval() if f1 > best_test_f1: tf.assign(model.best_dev_f1, f1).eval() logger.info("new best dev f1 score:{:>.3f}".format(f1)) return f1 > best_test_f1 elif name == "test": best_test_f1 = model.best_test_f1.eval() if f1 > best_test_f1: tf.assign(model.best_test_f1, f1).eval() logger.info("new best test f1 score:{:>.3f}".format(f1)) return f1 > best_test_f1
def evaluate(sess, model, name, data, id_to_tag, logger): """ evaluate the model. :param sess: tf.Session() :param model: model :param name: "dev" or "test" :param data: dev data or test data :param id_to_tag: :param logger: :return: """ logger.info("evaluate:{}".format(name)) ner_results = model.evaluate(sess, data, id_to_tag) # get the result eval_lines = test_ner(ner_results, FLAGS.result_path) # run the conlleval for line in eval_lines: logger.info(line) # get the F1 value f1 = float(eval_lines[1].strip().split()[-1]) if name == "dev": best_test_f1 = model.best_dev_f1.eval() if f1 > best_test_f1: tf.assign(model.best_dev_f1, f1).eval() logger.info("new best dev f1 score:{:>.3f}".format(f1)) return f1 > best_test_f1, f1 elif name == "test": best_test_f1 = model.best_test_f1.eval() if f1 > best_test_f1: tf.assign(model.best_test_f1, f1).eval() logger.info("new best test f1 score:{:>.3f}".format(f1)) return f1 > best_test_f1, f1
def eval(self, sess, mode, data_manager):
    logger.info("=> Evaluate mode: {}".format(mode))
    # evaluate and score the predictions
    ner_res = self.model.evaluate(sess, data_manager, self.id_2_tag)
    report = utils.test_ner(ner_res, FLAGS.result_path)
    for line in report:
        logger.info(line)

    # track the best-scoring model on the dev set
    f1 = float(report[1].strip().split()[-1])
    if mode == "dev":
        best_test_f1 = self.model.best_dev_f1.eval()
        if f1 > best_test_f1:
            tf.assign(self.model.best_dev_f1, f1).eval()
            logger.info(">>> new best dev f1 score:{:>.3f}".format(f1))
            self.model.save_model(sess, FLAGS.ckpt_path, name="best_score.ckpt")
            logger.info(">>> best model saved.")
        return f1 > best_test_f1
    elif mode == "test":
        best_test_f1 = self.model.best_test_f1.eval()
        if f1 > best_test_f1:
            tf.assign(self.model.best_test_f1, f1).eval()
            logger.info(">>> !!! Test f1 score:{:>.3f}".format(f1))
        return f1 > best_test_f1

def CRF_eval(data, test_index, y_pred, path, self_eval):
    # select the sentences that belong to the test split
    test_char = [data[i] for i in test_index]
    if self_eval:
        # keep the token, its gold label, and the prediction
        datawpred = [[[tok[0], tok[-1]] + [pred]
                      for tok, pred in zip(test_char[j], y_pred[j])]
                     for j in range(len(y_pred))]
    else:
        # no gold labels available: pair each token with its prediction only
        datawpred = [[[tok, pred]
                      for tok, pred in zip(test_char[j], y_pred[j])]
                     for j in range(len(y_pred))]
    # write the predictions in CoNLL format
    with open(path + "pred{}.conll".format(self_eval != True), 'w', encoding='utf-8') as f:
        write_conll(f, input_data_transform(datawpred))
    if self_eval:
        # score the predictions against the gold labels
        test_ner(path)

def evaluate(sess, model, name, data, id_tag):
    # sentences in this evaluation batch, with the gold and predicted tag for each character
    ner_results = model.evaluate(sess, data, id_tag)
    # write the results (ner_results) to a file, then compute F1 and the other quality metrics
    eval_lines = test_ner(ner_results, FLAGS.result_path)
    # extract the F1 score
    f1 = float(eval_lines[1].strip().split()[-1])
    return f1

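# Hedged sketch of why `eval_lines[1].strip().split()[-1]` yields the F1.
# Assuming test_ner returns the report of a conlleval-style scorer, the second
# report line is an overall summary whose last whitespace-separated token is FB1.
# The numbers below are purely illustrative.
example_line = "accuracy:  97.62%; precision:  90.12%; recall:  88.34%; FB1:  89.22"
f1 = float(example_line.strip().split()[-1])
print(f1)  # 89.22
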
def evaluate(sess, model, name, data_manager, id_to_tag, logger, config):
    logger.info("evaluate:{}".format(name))
    ner_results = evaluate_(sess, model, data_manager, id_to_tag, config)
    eval_lines = test_ner(ner_results, config.result_path)
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])

    if name == "dev":
        best_test_f1 = model.best_dev_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_dev_f1, f1).eval()
            logger.info("new best dev f1 score:{:>.3f}".format(f1))
        return f1 > best_test_f1

def eval_model(id_to_char, id_to_tag, test_manager, device, model_name=None):
    print("Eval ......")
    if not model_name:
        model_name = args.log_name
    old_weights = np.random.rand(len(id_to_char), args.word_embed_dim)
    pre_word_embed = load_word2vec("100.utf8", id_to_char, args.word_embed_dim, old_weights)
    e_model = Model(args, id_to_tag, device, pre_word_embed).to(device)
    e_model.load_state_dict(torch.load("./models/" + model_name + ".pkl"))
    print("model loaded ...")
    e_model.eval()

    all_results = []
    for batch in test_manager.iter_batch():
        strs, lens, chars, segs, subtypes, tags, adj, dep = batch
        chars = torch.LongTensor(chars).to(device)
        _lens = torch.LongTensor(lens).to(device)
        subtypes = torch.LongTensor(subtypes).to(device)
        tags = torch.LongTensor(tags).to(device)
        adj = torch.FloatTensor(adj).to(device)
        dep = torch.LongTensor(dep).to(device)
        logits, _ = e_model(chars, _lens, subtypes, adj, dep)

        """ Evaluate """
        # Decode
        batch_paths = []
        for index in range(len(logits)):
            length = lens[index]
            score = logits[index][:length]        # [seq, dim]
            probs = F.softmax(score, dim=-1)      # [seq, dim]
            path = torch.argmax(probs, dim=-1)    # [seq]
            batch_paths.append(path)

        for i in range(len(strs)):
            result = []
            string = strs[i][:lens[i]]
            gold = iobes_iob([id_to_tag[int(x)] for x in tags[i][:lens[i]]])
            pred = iobes_iob([id_to_tag[int(x)] for x in batch_paths[i][:lens[i]]])
            for char, gold, pred in zip(string, gold, pred):
                result.append(" ".join([char, gold, pred]))
            all_results.append(result)

    all_eval_lines = test_ner(all_results, args.result_path, args.log_name)
    res_info = all_eval_lines[1].strip()
    f1 = float(res_info.split()[-1])
    print("eval: f1: {}".format(f1))
    return f1, res_info

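# Hedged note on the decode step above: softmax is monotonic per position, so this
# greedy (non-CRF) decode would pick the same path by taking argmax over the raw
# logits, without computing probabilities. A minimal, self-contained sketch:
import torch
import torch.nn.functional as F

logits = torch.randn(7, 5)                                        # [seq, num_tags], illustrative
path_via_probs = torch.argmax(F.softmax(logits, dim=-1), dim=-1)  # decode via probabilities
path_via_logits = torch.argmax(logits, dim=-1)                    # decode directly from logits
assert torch.equal(path_via_probs, path_via_logits)
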
def evaluate_ner(sess, model, name, data, id_to_tag_ner, logger):
    logger.info("evaluate_ner:{}".format(name))
    ner_results_ner = model.evaluate_ner(sess, data, id_to_tag_ner)
    eval_lines = test_ner(ner_results_ner, FLAGS.result_path)
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])

    if name == "dev_ner":
        best_test_f1 = model.best_dev_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_dev_f1, f1).eval()
            logger.info("new best dev_ner f1 score:{:>.3f}".format(f1))
        return f1 > best_test_f1
    elif name == "test_ner":
        best_test_f1 = model.best_test_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_test_f1, f1).eval()
            logger.info("new best test_ner f1 score:{:>.3f}".format(f1))
        return f1 > best_test_f1

def evaluate(sess, model, name, data, id_to_tag, logger):
    logger.info("evaluate:{}".format(name))
    ner_results = model.evaluate(sess, data, id_to_tag)
    eval_lines = test_ner(ner_results, FLAGS.result_path)
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])

    if name == "dev":
        best_test_f1 = model.best_dev_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_dev_f1, f1).eval()
            logger.info("new best dev f1 score:{:>.3f}".format(f1))
        return f1 > best_test_f1
    elif name == "test":
        best_test_f1 = model.best_test_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_test_f1, f1).eval()
            logger.info("new best test f1 score:{:>.3f}".format(f1))
        return f1 > best_test_f1

def evaluate(sess, model, name, data, id_to_tag, logger): logger.info("evaluate:{}".format(name)) #ner_result[0]: ['14313 B-c B-c', '10243 I-c I-c', '19167 I-c I-c', '19936 O O', '15274 O O'] ner_results = model.evaluate(sess, data, id_to_tag) eval_lines = test_ner(ner_results, FLAGS.result_path) for line in eval_lines: logger.info(line) f1 = float(eval_lines[1].strip().split()[-1]) if name == "dev": best_test_f1 = model.best_dev_f1.eval() if f1 > best_test_f1: tf.assign(model.best_dev_f1, f1).eval() logger.info("new best dev f1 score:{:>.3f}".format(f1)) return f1 > best_test_f1 elif name == "test": best_test_f1 = model.best_test_f1.eval() if f1 > best_test_f1: tf.assign(model.best_test_f1, f1).eval() logger.info("new best test f1 score:{:>.3f}".format(f1)) return f1 > best_test_f1
def dev_epoch(epoch, model, dev_manager, id_to_tag, device):
    # dev
    model.eval()
    all_results = []
    for batch in dev_manager.iter_batch():
        strs, lens, chars, segs, subtypes, tags, adj, dep = batch
        chars = torch.LongTensor(chars).to(device)
        _lens = torch.LongTensor(lens).to(device)
        subtypes = torch.LongTensor(subtypes).to(device)
        tags = torch.LongTensor(tags).to(device)
        adj = torch.FloatTensor(adj).to(device)
        dep = torch.LongTensor(dep).to(device)
        logits, _ = model(chars, _lens, subtypes, adj, dep)  # [batch, seq, dim]

        """ Evaluate """
        # Decode
        batch_paths = []
        for index in range(len(logits)):
            length = lens[index]
            score = logits[index][:length]        # [seq, dim]
            probs = F.softmax(score, dim=-1)      # [seq, dim]
            path = torch.argmax(probs, dim=-1)    # [seq]
            batch_paths.append(path)

        for i in range(len(strs)):
            result = []
            string = strs[i][:lens[i]]
            gold = iobes_iob([id_to_tag[int(x)] for x in tags[i][:lens[i]]])
            pred = iobes_iob([id_to_tag[int(x)] for x in batch_paths[i][:lens[i]]])
            for char, gold, pred in zip(string, gold, pred):
                result.append(" ".join([char, gold, pred]))
            all_results.append(result)

    all_eval_lines = test_ner(all_results, args.result_path, args.log_name)
    log_handler.info("epoch: {}, info: {}".format(epoch + 1, all_eval_lines[1].strip()))
    f1 = float(all_eval_lines[1].strip().split()[-1])
    return f1, model

def evaluate(sess, model, name, data, id_to_tag, logger, iter_times):
    logger.info("evaluate:{}".format(name))
    ner_results = model.evaluate(sess, data, id_to_tag)
    eval_lines = test_ner(ner_results, FLAGS.result_path)
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])

    if name == "dev":
        best_test_f1 = model.best_dev_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_dev_f1, f1).eval()
            logger.info("new best dev f1 score:{:>.3f}".format(f1))
        elif iter_times == 100:
            logger.info("training finished, best F1 score: {:>.3f}".format(best_test_f1))
        return f1 > best_test_f1
    elif name == "test":
        best_test_f1 = model.best_test_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_test_f1, f1).eval()
            logger.info("new best test f1 score:{:>.3f}".format(f1))
        return f1 > best_test_f1

def evaluate(sess, model, name, data, id_to_tag, logger):
    # get the prediction results for this batch of evaluation data
    ner_results = model.evaluate(sess, data, id_to_tag)
    # write the predictions to the result files and score them
    eval_lines = test_ner(ner_results, FLAGS.result_path)
    # log the evaluation report
    for line in eval_lines:
        logger.info(line)
    # extract the F1 score
    f1 = float(eval_lines[1].strip().split()[-1])

    # update the stored best F1 if needed and return the current F1
    if name == "dev":
        best_test_f1 = model.best_dev_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_dev_f1, f1).eval()
            logger.info("new best dev f1 score:{:>.3f}".format(f1))
        return f1
    elif name == "test":
        best_test_f1 = model.best_test_f1.eval()
        if f1 > best_test_f1:
            tf.assign(model.best_test_f1, f1).eval()
            logger.info("new best test f1 score:{:>.3f}".format(f1))
        return f1

def eval_step():
    global max_a
    ner_results = []
    correct = 0
    total = 0
    with torch.no_grad():
        for x, y, seq_length in zip(VX, VY, seq_lengths_V):
            x = torch.tensor(x)
            y = torch.tensor(y)
            # model.hidden = model.init_hidden()
            _, predicted = model(x, seq_length)
            for bt in range(MB_SIZE):
                bt_list = predicted[bt].tolist()
                # effective length: strip the padding positions (label id 0)
                length = len(bt_list) - torch.t(y)[bt].numpy().tolist().count(0)
                total += length
                correct += (torch.tensor(bt_list[0:length]) == torch.t(y)[bt][0:length]).sum().item()
                block = []
                for c in range(length):
                    block.append(id2char[torch.t(x)[bt][c].tolist()] + ' ' +
                                 id2tag[bt_list[c]] + ' ' +
                                 id2tag[torch.t(y)[bt][c].tolist()])
                ner_results.append(block)
    eval_lines = test_ner(ner_results, '')
    a = float(eval_lines[1].strip().split()[-1])
    # a = 100.0 * correct / total
    print('acc = %.2f' % a)
    if a > max_a:
        # torch.save(model, 'model' + '_%.2f' % a + '.pkl')
        max_a = a

def main():
    # load data sets
    global args
    args = parser.parse_args()
    pp.pprint(vars(args))

    # running_name = 'X'
    use_cuda = cuda_model.ifUseCuda(args.gpu_id, args.multiGpu)
    # use_cuda = False

    # train_file = 'data/example.train'
    # dev_file = 'data/example.dev'
    test_file = 'data/example.test'
    # embedding_file = 'data/vec.txt'
    map_file = 'map.pkl'
    # config_file = 'config_file_pytorch'
    tag_file = 'tag.pkl'
    # embedding_easy_file = 'data/easy_embedding.npy'

    # train_sentences = load_sentences(train_file)
    # dev_sentences = load_sentences(dev_file)
    test_sentences = load_sentences(test_file)
    # train_sentences = dev_sentences
    # update_tag_scheme(train_sentences, args.tag_schema)
    update_tag_scheme(test_sentences, args.tag_schema)
    # update_tag_scheme(dev_sentences, args.tag_schema)

    if not os.path.isfile(tag_file):
        print("Tag file {:s} Not found".format(tag_file))
        sys.exit(-1)
    else:
        with open(tag_file, 'rb') as t:
            tag_to_id, id_to_tag = pickle.load(t)

    if not os.path.isfile(map_file):
        print("Map file {:s} Not found".format(map_file))
        # create dictionary for word
        # dico_chars_train = char_mapping(train_sentences)[0]
        # dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        #     dico_chars_train.copy(),
        #     embedding_file,
        #     list(itertools.chain.from_iterable(
        #         [[w[0] for w in s] for s in test_sentences]))
        # )
        # _, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        # with open(map_file, "wb") as f:
        #     pickle.dump([char_to_id, id_to_char], f)
    else:
        with open(map_file, "rb") as f:
            char_to_id, id_to_char = pickle.load(f)

    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id)
    print("{:d} sentences in test.".format(len(test_data)))
    test_manager = BatchManager(test_data, 1)

    save_places = dir_utils.save_places(args.eval)
    # log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(
        os.path.join(save_places.log_save_dir,
                     'evaluation-{:d}.txt'.format(args.fileid)))
    config = config_model(char_to_id, tag_to_id, args)
    print_config(config, logger)
    logger.info("start training")

    # Update: create model and embedding!
    model = NERModel.CNERPointer(char_dim=args.char_dim,
                                 seg_dim=args.seg_dim,
                                 hidden_dim=args.hidden_dim,
                                 max_length=15,
                                 output_classes=4,
                                 dropout=args.dropout,
                                 embedding_path=None,
                                 id_to_word=id_to_char,
                                 easy_load=None)
    print("Number of Params\t{:d}".format(
        sum([p.data.nelement() for p in model.parameters()])))

    # Update: this won't work!
    # model = cuda_model.convertModel2Cuda(model, gpu_id=args.gpu_id, multiGpu=args.multiGpu)
    if use_cuda:
        model = model.cuda()
    model.eval()

    if args.eval is not None:
        # if os.path.isfile(args.resume):
        ckpt_filename = os.path.join(
            save_places.model_save_dir,
            'checkpoint_{:04d}.pth.tar'.format(args.fileid))
        assert os.path.isfile(ckpt_filename), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(ckpt_filename, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        train_iou = checkpoint['IoU']
        print("=> loading checkpoint '{}', current iou: {:.04f}".format(
            ckpt_filename, train_iou))

    ner_results = evaluate(model, test_manager, id_to_tag, use_cuda, max_len=5)
    eval_lines = test_ner(ner_results, save_places.summary_save_dir)
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])
    return f1

def main(_):
    if not os.path.isdir(FLAGS.log_path):
        os.makedirs(FLAGS.log_path)
    if not os.path.isdir(FLAGS.model_path):
        os.makedirs(FLAGS.model_path)
    if not os.path.isdir(FLAGS.result_path):
        os.makedirs(FLAGS.result_path)

    tag_to_id = {
        "O": 0,
        "B-LOC": 1,
        "I-LOC": 2,
        "B-PER": 3,
        "I-PER": 4,
        "B-ORG": 5,
        "I-ORG": 6
    }

    # load data
    id_to_word, id_to_tag, train_data, dev_data, test_data = load_data(FLAGS, tag_to_id)
    train_manager = BatchManager(train_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.valid_batch_size)
    test_manager = BatchManager(test_data, len(id_to_tag), FLAGS.word_max_len, FLAGS.valid_batch_size)

    with tf.Session() as sess:
        model = create_model(sess, id_to_word, id_to_tag)
        loss = 0
        best_test_f1 = 0
        steps_per_epoch = len(train_data) // FLAGS.batch_size + 1
        for _ in range(FLAGS.max_epoch):
            iteration = (model.global_step.eval()) // steps_per_epoch + 1
            train_manager.shuffle()
            for batch in train_manager.iter_batch():
                global_step = model.global_step.eval()
                step = global_step % steps_per_epoch
                batch_loss = model.run_step(sess, True, batch)
                loss += batch_loss / FLAGS.steps_per_checkpoint
                if global_step % FLAGS.steps_per_checkpoint == 0:
                    model.logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step, steps_per_epoch, loss))
                    loss = 0

            model.logger.info("validating ner")
            ner_results = model.predict(sess, dev_manager)
            eval_lines = test_ner(ner_results, FLAGS.result_path)
            for line in eval_lines:
                model.logger.info(line)
            test_f1 = float(eval_lines[1].strip().split()[-1])
            if test_f1 > best_test_f1:
                best_test_f1 = test_f1
                model.logger.info("new best f1 score:{:>.3f}".format(test_f1))
                model.logger.info("saving model ...")
                checkpoint_path = os.path.join(FLAGS.model_path, "translate.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)

        # test model
        model.logger.info("testing ner")
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_path)
        model.logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        ner_results = model.predict(sess, test_manager)
        eval_lines = test_ner(ner_results, FLAGS.result_path)
        for line in eval_lines:
            model.logger.info(line)
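
# Hedged sketch (an assumption, not the actual helper from these snippets): a common
# shape for the test_ner(results, path) utility used throughout this page. It dumps
# the "token gold pred" triples to a file and returns the report lines of a
# conlleval-style scorer. `run_conlleval` and the output file name are hypothetical.
import os

def test_ner_sketch(results, path):
    output_file = os.path.join(path, "ner_predict.utf8")  # assumed file name
    with open(output_file, "w", encoding="utf8") as f:
        for sentence in results:
            f.write("\n".join(sentence))   # one "token gold pred" line per character
            f.write("\n\n")                # blank line separates sentences
    return run_conlleval(output_file)      # hypothetical CoNLL-style scorer wrapper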