def evaluate(model, criterion, data_loader, file_path, mode):
    """
    mode "eval": evaluate on the development set and compute P/R/F1; called
        periodically during training.
    mode "predict": evaluate on the development / test set, then write the
        predictions (a .json file plus its .zip) under the args.data_path dir
        for later submission or evaluation.
    """
    # Load the raw examples and the id-to-SPO mapping used for decoding.
    example_all = []
    with open(file_path, "r", encoding="utf-8") as fp:
        for line in fp:
            example_all.append(json.loads(line))
    id2spo_path = os.path.join(os.path.dirname(file_path), "id2spo.json")
    with open(id2spo_path, 'r', encoding='utf8') as fp:
        id2spo = json.load(fp)

    model.eval()
    loss_all = 0
    eval_steps = 0
    formatted_outputs = []
    current_idx = 0
    for batch in tqdm(data_loader, total=len(data_loader)):
        eval_steps += 1
        input_ids, seq_len, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
        logits = model(input_ids=input_ids)
        # Exclude special token ids 0, 1 and 2 from the loss.
        mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and(
            (input_ids != 2))
        loss = criterion(logits, labels, mask)
        loss_all += loss.numpy().item()
        probs = F.sigmoid(logits)
        logits_batch = probs.numpy()
        seq_len_batch = seq_len.numpy()
        tok_to_orig_start_index_batch = tok_to_orig_start_index.numpy()
        tok_to_orig_end_index_batch = tok_to_orig_end_index.numpy()
        # Decode this batch against the matching slice of raw examples.
        formatted_outputs.extend(
            decoding(example_all[current_idx:current_idx + len(logits)],
                     id2spo, logits_batch, seq_len_batch,
                     tok_to_orig_start_index_batch,
                     tok_to_orig_end_index_batch))
        current_idx = current_idx + len(logits)
    loss_avg = loss_all / eval_steps
    print("eval loss: %f" % (loss_avg))

    if mode == "predict":
        predict_file_path = os.path.join(args.data_path, 'predictions.json')
    else:
        predict_file_path = os.path.join(args.data_path, 'predict_eval.json')

    predict_zipfile_path = write_prediction_results(formatted_outputs,
                                                    predict_file_path)

    if mode == "eval":
        precision, recall, f1 = get_precision_recall_f1(file_path,
                                                        predict_zipfile_path)
        # Clean up the temporary prediction files after scoring.
        os.system('rm {} {}'.format(predict_file_path, predict_zipfile_path))
        return precision, recall, f1
    elif mode != "predict":
        raise Exception("wrong mode for eval func")
def evaluate(model, criterion, data_loader, test_loss, file_path, mode):
    """
    mode "eval": evaluate on the development set and compute P/R/F1; called
        periodically during training.
    mode "predict": evaluate on the development / test set, then write the
        predictions (a .json file plus its .zip) under the args.data_path dir
        for later submission or evaluation.
    """
    probs_all = None
    seq_len_all = None
    tok_to_orig_start_index_all = None
    tok_to_orig_end_index_all = None
    loss_all = 0
    eval_steps = 0
    for batch in tqdm(data_loader):
        eval_steps += 1
        input_ids, seq_len, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
        logits = model(input_ids=input_ids)
        # Exclude special token ids 0, 1 and 2 from the loss.
        mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and(
            (input_ids != 2))
        loss = criterion((logits, labels, mask))
        loss_all += test_loss(loss).result()
        # Note: unlike the other evaluate variants, no sigmoid is applied here;
        # the raw logits are accumulated and handed to decoding() as-is.
        probs = logits
        if probs_all is None:
            probs_all = probs.numpy()
            seq_len_all = seq_len.numpy()
            tok_to_orig_start_index_all = tok_to_orig_start_index.numpy()
            tok_to_orig_end_index_all = tok_to_orig_end_index.numpy()
        else:
            probs_all = np.append(probs_all, probs.numpy(), axis=0)
            seq_len_all = np.append(seq_len_all, seq_len.numpy(), axis=0)
            tok_to_orig_start_index_all = np.append(
                tok_to_orig_start_index_all,
                tok_to_orig_start_index.numpy(),
                axis=0)
            tok_to_orig_end_index_all = np.append(
                tok_to_orig_end_index_all,
                tok_to_orig_end_index.numpy(),
                axis=0)
    loss_avg = loss_all / eval_steps
    print("eval loss: %f" % (loss_avg))

    id2spo_path = os.path.join(os.path.dirname(file_path), "id2spo.json")
    with open(id2spo_path, 'r', encoding='utf8') as fp:
        id2spo = json.load(fp)

    formatted_outputs = decoding(file_path, id2spo, probs_all, seq_len_all,
                                 tok_to_orig_start_index_all,
                                 tok_to_orig_end_index_all)
    if mode == "predict":
        predict_file_path = os.path.join(args.data_path, 'predictions.json')
    else:
        predict_file_path = os.path.join(args.data_path, 'predict_eval.json')

    predict_zipfile_path = write_prediction_results(formatted_outputs,
                                                    predict_file_path)

    if mode == "eval":
        precision, recall, f1 = get_precision_recall_f1(file_path,
                                                        predict_zipfile_path)
        os.system('rm {} {}'.format(predict_file_path, predict_zipfile_path))
        return precision, recall, f1
    elif mode != "predict":
        raise Exception("wrong mode for eval func")
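# Usage sketch (hedged): how an evaluate() like the two variants above is
# typically driven from a training script. Caller-side names (dev_data_loader,
# test_data_loader, dev_file_path, test_file_path) are assumptions for
# illustration only; the calls follow the five-argument signature of the first
# variant.

# Periodic dev-set evaluation during training returns P/R/F1:
precision, recall, f1 = evaluate(model, criterion, dev_data_loader,
                                 dev_file_path, mode="eval")
print("dev P/R/F1: %.4f / %.4f / %.4f" % (precision, recall, f1))

# A final prediction pass writes predictions.json (and its .zip) under args.data_path:
evaluate(model, criterion, test_data_loader, test_file_path, mode="predict")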
def __test__(self, epoch, text_seqs, class_list):
    assert len(text_seqs) == len(class_list)

    start_time = time.time()
    step_time = time.time()

    test_steps = len(text_seqs) // config.batch_size
    if test_steps * config.batch_size < len(text_seqs):
        test_steps += 1

    # topk_list = list()
    pred_all = np.array([])
    out_confidence_all = np.array([])
    all_loss = np.zeros(1)

    for cstep in range(test_steps):
        text_seqs_mini = text_seqs[cstep * config.batch_size:min(
            (cstep + 1) * config.batch_size, len(text_seqs))]
        class_idx_mini = class_list[cstep * config.batch_size:min(
            (cstep + 1) * config.batch_size, len(text_seqs))]

        encode_seqs_id_mini, encode_seqs_mat_mini = self.prepro_encode(
            text_seqs_mini)

        # pred_mat = np.zeros([config.batch_size, len(self.class_dict)])

        test_loss, out = self.sess.run(
            [
                self.model.test_loss,
                self.model.test_net.outputs,
            ],
            feed_dict={
                self.model.encode_seqs: encode_seqs_mat_mini,
                self.model.label_logits:
                    np.array(class_idx_mini).reshape(-1, 1)
            })

        all_loss[0] += test_loss

        pred = np.array(
            [1.0 if x[0] >= self.model.threshold else 0.0 for x in out])
        out_confidence = np.array([x[0] for x in out])
        # pred = np.array([_ / np.sum(_) for _ in np.exp(out)])

        # for i in range(len(self.seen_class)):
        #     pred_mat[:, self.full_class_map2index[self.seen_class[i]]] = pred[:, i]

        # topk = self.get_pred_class_topk(pred_mat, k=1)
        # topk_list.append(topk)
        pred_all = np.concatenate((pred_all, pred), axis=0)
        out_confidence_all = np.concatenate(
            (out_confidence_all, out_confidence), axis=0)

        if cstep % config.cstep_print == 0 and cstep > 0:
            # tmp_topk = np.concatenate(topk_list, axis=0)
            # tmp_topk = self.get_one_hot_results(np.array(tmp_topk[: (cstep + 1) * config.batch_size]))
            # tmp_gt = self.get_one_hot_results(np.reshape(np.array(class_list[: (cstep + 1) * config.batch_size]), newshape=(-1, 1)))
            # tmp_stats = utils.get_statistics(tmp_topk, tmp_gt, single_label_pred=True)
            tmp_stats = utils.get_precision_recall_f1(
                pred_all,
                np.array(class_list[:len(pred_all)]),
                with_confusion_matrix=True)
            print(
                "[Test] Epoch: [%3d][%4d/%4d] time: %.4f, loss: %s, threshold: %.4f \n %s"
                % (epoch, cstep, test_steps, time.time() - step_time,
                   all_loss / (cstep + 1), self.model.threshold,
                   utils.dict_to_string_4_print(tmp_stats)))
            step_time = time.time()

    # prediction_topk = np.concatenate(topk_list, axis=0)
    # prediction_topk = self.get_one_hot_results(np.array(prediction_topk[: test_steps * config.batch_size]))
    # ground_truth = self.get_one_hot_results(np.reshape(np.array(class_list[: test_steps * config.batch_size]), newshape=(-1, 1)))

    stats = utils.get_precision_recall_f1(
        pred_all, np.array(class_list), with_confusion_matrix=True)

    print(
        "[Test Sum] Epoch: [%3d] time: %.4f, loss: %s, threshold: %.4f \n %s"
        % (epoch, time.time() - start_time, all_loss / test_steps,
           self.model.threshold, utils.dict_to_string_4_print(stats)))

    return stats, pred_all, class_list, out_confidence_all
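# Both __test__ above and the acceptance loop below report metrics through
# utils.get_precision_recall_f1(..., with_confusion_matrix=True) on binary 0/1
# arrays. The function below is a hypothetical stand-in sketching what such a
# helper computes in the binary case; it is NOT the repo's actual implementation.
import numpy as np

def binary_precision_recall_f1(pred, gt):
    """Hypothetical stand-in for utils.get_precision_recall_f1 on binary 0/1
    arrays; the repo's real helper may compute additional per-class stats."""
    tp = int(np.sum((pred == 1.0) & (gt == 1.0)))
    fp = int(np.sum((pred == 1.0) & (gt == 0.0)))
    fn = int(np.sum((pred == 0.0) & (gt == 1.0)))
    tn = int(np.sum((pred == 0.0) & (gt == 0.0)))
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) > 0 else 0.0)
    return {"precision": precision, "recall": recall, "f1": f1,
            "confusion_matrix": np.array([[tn, fp], [fn, tp]])}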
for idx, gt in enumerate(test_class_list):
    accepted_classes = [
        seen_classes[cid] for cid, x in enumerate(pred_list) if x[idx] == 1.0
    ]
    prediction.append(accepted_classes)
    if len(accepted_classes) > 0:
        accepted.append(1)
    else:
        accepted.append(0)
    if gt in seen_classes:
        gt_accepted.append(1)
    else:
        gt_accepted.append(0)

accepted_stats = utils.get_precision_recall_f1(
    np.array(accepted), np.array(gt_accepted), with_confusion_matrix=True)

pass_to_phase2.append(accepted)

avg_classifier_stat = dict()
for key in stat_list[0]:
    if key != 'texts_accepted_from_class':
        avg_classifier_stat[key] = sum(
            [stat[key] for stat in stat_list]) / len(stat_list)

summary_dict = {
    'iteration': i,
    'seen_classes': seen_classes,
    'unseen_classes': unseen_classes,
    'test_class_list': test_class_list,
    'stat_list': stat_list,
def evaluate(model, criterion, data_loader, file_path, mode, logger):
    """
    mode "eval": evaluate on the development set and compute P/R/F1; called
        periodically during training.
    mode "predict": evaluate on the development / test set, then write the
        predictions (a .json file plus its .zip) under the args.data_path dir
        for later submission or evaluation.
    """
    model.eval()
    probs_all = None
    seq_len_all = None
    tok_to_orig_start_index_all = None
    tok_to_orig_end_index_all = None
    loss_all = 0
    eval_steps = 0
    logger.info(
        "\n----------------------------------IN Evaluate func-----------------------------------\n"
    )
    for batch in tqdm(data_loader, total=len(data_loader)):
        eval_steps += 1
        input_ids, seq_len, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
        if args.device == 'cuda':
            input_ids = input_ids.cuda()
            labels = labels.cuda()
        logits = model(input_ids=input_ids)
        # Exclude special token ids 0, 1 and 2 from the loss.
        mask = (input_ids != 0) & (input_ids != 1) & (input_ids != 2)
        loss = criterion(logits, labels, mask)
        loss_all += loss.detach().cpu().numpy().item()
        probs = torch.sigmoid(logits).cpu()
        if probs_all is None:
            probs_all = probs.numpy()
            seq_len_all = seq_len.numpy()
            tok_to_orig_start_index_all = tok_to_orig_start_index.numpy()
            tok_to_orig_end_index_all = tok_to_orig_end_index.numpy()
        else:
            probs_all = np.append(probs_all, probs.numpy(), axis=0)
            seq_len_all = np.append(seq_len_all, seq_len.numpy(), axis=0)
            tok_to_orig_start_index_all = np.append(
                tok_to_orig_start_index_all,
                tok_to_orig_start_index.numpy(),
                axis=0)
            tok_to_orig_end_index_all = np.append(
                tok_to_orig_end_index_all,
                tok_to_orig_end_index.numpy(),
                axis=0)
    loss_avg = loss_all / eval_steps
    logger.info("eval loss: %f" % (loss_avg))

    id2spo_path = os.path.join(os.path.dirname(file_path), "id2spo.json")
    with open(id2spo_path, 'r', encoding='utf8') as fp:
        id2spo = json.load(fp)

    formatted_outputs = decoding(file_path, id2spo, probs_all, seq_len_all,
                                 tok_to_orig_start_index_all,
                                 tok_to_orig_end_index_all)
    if mode == "predict":
        predict_file_path = os.path.join(args.data_path, 'predictions.json')
    else:
        predict_file_path = os.path.join(args.data_path, 'predict_eval.json')

    predict_zipfile_path = write_prediction_results(formatted_outputs,
                                                    predict_file_path)

    if mode == "eval":
        precision, recall, f1 = get_precision_recall_f1(file_path,
                                                        predict_zipfile_path)
        os.system('rm {} {}'.format(predict_file_path, predict_zipfile_path))
        return precision, recall, f1
    elif mode != "predict":
        logger.debug("wrong mode for eval func")
        raise Exception("wrong mode for eval func")
    logger.info("Finish evaluating.")
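# Design note (sketch only, not this file's code): np.append above re-copies the
# accumulated arrays on every batch, which is quadratic overall. A common
# alternative is to gather per-batch arrays in Python lists and concatenate once
# after the loop. The names below simply reuse those from the loop above.
probs_chunks, seq_len_chunks = [], []
for batch in data_loader:
    input_ids, seq_len, *_ = batch
    logits = model(input_ids=input_ids)
    probs_chunks.append(torch.sigmoid(logits).detach().cpu().numpy())
    seq_len_chunks.append(seq_len.numpy())
probs_all = np.concatenate(probs_chunks, axis=0)
seq_len_all = np.concatenate(seq_len_chunks, axis=0)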
def main(args):
    train_dataset, valid_dataset, vocab, tag_vocab = load_all()
    model = bilstm_crf.BiLSTM_CRF(vocab, tag_vocab, args.embedding_dim,
                                  args.hidden_dim, args.batch_size)

    print('loading model')
    model.load_state_dict(
        torch.load(args.model_path, map_location=lambda storage, loc: storage))
    print('loaded model')

    data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        valid_dataset,
        1,
        True,
        1,
    )

    correct = 0
    total = 0
    f1 = None

    bio_removed_tags = utils.remove_bio(tag_vocab.get_all())

    # iterate over validation batches (only the first 100 or so are scored)
    for i, (x, y) in enumerate(data_loader):
        if i > 100:
            break
        model.zero_grad()
        # print(' '.join([vocab.get_word(index) for index in x[0].data.tolist()]))
        out = model(x)

        predicted = utils.remove_bio(
            [tag_vocab.get_word(index) for index in out[0]])
        expected = utils.remove_bio(
            [tag_vocab.get_word(index) for index in y[0].data.tolist()])

        total += len(predicted)
        correct += sum(
            [predicted[i] == expected[i] for i in range(len(predicted))])

        # print(predicted)
        # print(expected)

        f1_temp = utils.compute_f1(predicted, expected, bio_removed_tags)
        if f1 is None:
            f1 = f1_temp
        else:
            f1 = utils.combine_f1(f1, f1_temp)
        # break

    print('correct: {} total: {} accuracy: {}'.format(correct, total,
                                                      correct / total))

    res = utils.get_precision_recall_f1(f1)
    scores = []
    for key in res:
        scores.append(res[key]['f1'])
        print('{} f1: {}'.format(key, res[key]['f1']))

    # average F1 over tag types with a nonzero score
    avg = sum(scores) / sum([score > 0 for score in scores])
    print('Average F1: {}'.format(avg))
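# The BiLSTM-CRF evaluation above compares tags after utils.remove_bio. A
# hypothetical stand-in (NOT the project's actual helper) illustrating the
# usual idea of collapsing BIO prefixes to the base tag type:
def remove_bio_sketch(tags):
    """Strip B-/I- prefixes so 'B-PER' and 'I-PER' both compare as 'PER';
    'O' and any unprefixed tag pass through unchanged."""
    return [tag[2:] if tag.startswith(("B-", "I-")) else tag for tag in tags]

# e.g. remove_bio_sketch(["B-PER", "I-PER", "O"]) -> ["PER", "PER", "O"]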