def eval(self, dataset_name, log_output=None):
    dataset = self.datasets.get(dataset_name, None)
    if dataset is None:
        return
    results = []
    logger.info('Evaluating {} ({})'.format(self.name, dataset_name))
    set_loss = 0
    for tokens, labels, chars, seq_lens, char_lens in dataset.get_dataset(
            volatile=True, gpu=self.gpu):
        preds, loss = self.model.predict(tokens, labels, seq_lens,
                                         chars, char_lens)
        set_loss += float(loss.data[0])
        for pred, gold, seq_len, ts in zip(preds, labels, seq_lens, tokens):
            l = int(seq_len.data[0])
            pred = pred.data.tolist()[:l]
            gold = gold.data.tolist()[:l]
            ts = ts.data.tolist()[:l]
            for p, g, t in zip(pred, gold, ts):
                t = self.idx_token.get(t, 'UNK')
                results.append('{} {} {}'.format(
                    t, self.idx_label[g], self.idx_label[p]))
            results.append('')
    counts = evaluate(results)
    overall, by_type = metrics(counts)
    report(counts)
    logger.info('Loss: {:.5f}'.format(set_loss))
    return SCORES(fscore=overall.fscore, precision=overall.prec,
                  recall=overall.rec, loss=set_loss)
def conlleval_report(documents):
    """Return conlleval evaluation report for Documents as string."""
    # conlleval.py has a file-based API, so use StringIO
    counts = conlleval_evaluate(documents)
    report_string = StringIO()
    report(counts, out=report_string)
    return report_string.getvalue()
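# Usage sketch of the conlleval.py API used throughout this file
# (evaluate()/metrics()/report()); the toy lines are hypothetical.
# Input is one "token gold pred" line per token, with an empty line
# between sentences.
from io import StringIO

import conlleval

lines = [
    'John B-PER B-PER',
    'lives O O',
    'in O O',
    'Paris B-LOC O',
    '',
]
counts = conlleval.evaluate(lines)
overall, by_type = conlleval.metrics(counts)
buf = StringIO()
conlleval.report(counts, out=buf)    # human-readable report
print(buf.getvalue())
print(overall.fscore, overall.prec, overall.rec)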
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A List of which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) of a batch.
    :param idx_token: Index to token dictionary.
    :param idx_label: Index to label dictionary.
    :param writer: An object (file object) with a write() function.
        Extra output.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(
                preds_b, golds_b, len_b, tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                # Fall back to the UNK token string (not the index) for
                # out-of-vocabulary ids, so the output line stays
                # "token gold pred".
                token = idx_token.get(t, C.UNK)
                outputs.append('{} {} {}'.format(
                    token, idx_label.get(g, 0), idx_label.get(p, 0)))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
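# Hedged sketch of the `results` structure this evaluate() expects
# (hypothetical toy index maps; assumes torch and the repo's constants
# module C are importable):
import torch

idx_token = {0: 'John', 1: 'lives'}
idx_label = {0: 'O', 1: 'B-PER'}
results = [(
    torch.tensor([[1, 0]]),   # predicted label ids (batch x seq)
    torch.tensor([[1, 0]]),   # gold label ids
    torch.tensor([2]),        # sequence lengths
    torch.tensor([[0, 1]]),   # token ids
)]
fscore, prec, rec = evaluate(results, idx_token, idx_label)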
def main(argv):
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length    # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)
    train_data = process_sentences(train_words, train_tags, tokenizer, seq_len)
    test_data = process_sentences(test_words, test_tags, tokenizer, seq_len)

    label_list = get_labels(train_data.labels)
    tag_map = {l: i for i, l in enumerate(label_list)}
    inv_tag_map = {v: k for k, v in tag_map.items()}

    init_prob, trans_prob = viterbi_probabilities(train_data.labels, tag_map)

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)

    train_y, train_weights = label_encode(train_data.combined_labels,
                                          tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels,
                                        tag_map, seq_len)

    ner_model = create_ner_model(pretrained_model, len(tag_map))
    optimizer = create_optimizer(len(train_x[0]), args)

    ner_model.compile(
        optimizer,
        loss='sparse_categorical_crossentropy',
        sample_weight_mode='temporal',
        metrics=['sparse_categorical_accuracy']
    )
    ner_model.fit(
        train_x,
        train_y,
        sample_weight=train_weights,
        epochs=args.num_train_epochs,
        batch_size=args.batch_size
    )

    if args.ner_model_dir is not None:
        label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
        save_ner_model(ner_model, tokenizer, label_list, args)
        save_viterbi_probabilities(init_prob, trans_prob, inv_tag_map, args)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)

    pred_tags = []
    for i, pred in enumerate(preds):
        pred_tags.append(
            [inv_tag_map[t] for t in pred[1:len(test_data.tokens[i]) + 1]])

    lines = write_result(
        args.output_file, test_data.words, test_data.lengths,
        test_data.tokens, test_data.labels, pred_tags)

    c = conlleval.evaluate(lines)
    conlleval.report(c)
    return 0
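# Hypothetical invocation; the flag names are assumed from the args
# consumed above (argument_parser() itself is not shown here):
#
#   python run_ner.py \
#       --train_data train.conll --test_data test.conll \
#       --max_seq_length 128 --batch_size 32 --num_train_epochs 3 \
#       --ner_model_dir ner-model --output_file predictions.tsv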
def evaluate(args, data, model, id2label, all_ori_tokens):
    model.eval()
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler,
                            batch_size=args.train_batch_size)

    logger.info("***** Running eval *****")
    # logger.info(f" Num examples = {len(data)}")
    # logger.info(f" Batch size = {args.eval_batch_size}")

    pred_labels = []
    ori_labels = []
    for b_i, (input_ids, input_mask, segment_ids, label_ids, bbox,
              bbox_pos_id, bbox_num) in enumerate(
                  tqdm(dataloader, desc="Evaluating")):
        input_ids = input_ids.to(args.device)
        input_mask = input_mask.to(args.device)
        segment_ids = segment_ids.to(args.device)
        label_ids = label_ids.to(args.device)
        bbox = bbox.to(args.device)
        bbox_pos_id = bbox_pos_id.to(args.device)
        bbox_num = bbox_num.to(args.device)

        with torch.no_grad():
            logits = model.predict(input_ids, segment_ids, input_mask,
                                   bbox, bbox_pos_id, bbox_num)
            # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            # logits = logits.detach().cpu().numpy()

        for l in logits:        # logits -> List[List[int]]
            pred_labels.append([id2label[idx] for idx in l])
        for l in label_ids:     # tensor
            ori_labels.append([id2label[idx.item()] for idx in l])

    eval_list = []
    for ori_tokens, oril, prel in zip(all_ori_tokens, ori_labels, pred_labels):
        for ot, ol, pl in zip(ori_tokens, oril, prel):
            if ot in ["[CLS]", "[SEP]"]:
                continue
            if len(f"{ot} {ol} {pl}\n".split(" ")) != 3:
                continue
            eval_list.append(f"{ot} {ol} {pl}\n")
        eval_list.append("\n")

    # eval the model
    counts = conlleval.evaluate(eval_list)
    conlleval.report(counts)
    # namedtuple('Metrics', 'tp fp fn prec rec fscore')
    overall, by_type = conlleval.metrics(counts)
    return overall, by_type
def on_epoch_end(self, epoch, logs=None):
    ypred = self.model.predict(self.test_features)
    c, cmat = conll_eval_counts(ypred, self.test_ground_truth, self.labels)
    ceval.report(c, prefix=self.prefix)
    print_cm(cmat, ordered_label_keys(self.labels))
    o, b = ceval.metrics(c)
    # tensorboard requires these logs to be float64 with an item()
    # attribute, so we create them with numpy
    logs[self.prefix + "_conll_f1"] = np.float64(o.fscore)
    logs[self.prefix + "_conll_prec"] = np.float64(o.prec)
    logs[self.prefix + "_conll_rec"] = np.float64(o.rec)
def evaluate(args, task_id, data, model, id2label, all_ori_words,
             file_name=None):
    model.eval()
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler,
                            batch_size=args.train_batch_size)
    task_id = torch.tensor(task_id, dtype=torch.long).to(args.device)

    logger.info("***** Running eval *****")
    logger.info(f" Num examples = {len(data)}")

    pred_labels = []
    ori_labels = []
    for b_i, batch in enumerate(tqdm(dataloader, desc="Evaluating")):
        batch = tuple(t.to(args.device) for t in batch)
        if args.need_charcnn:
            input_word_ids, input_mask, label_ids, label_mask, char_ids = batch
        else:
            input_word_ids, input_mask, label_ids, label_mask = batch
            char_ids = None

        with torch.no_grad():
            logits = model.predict(task_id, input_word_ids, char_ids,
                                   input_mask)

        for predL, goldL, maskL in zip(logits, label_ids, label_mask):
            for p, g, mask in zip(predL, goldL, maskL):
                if mask.item() == 1:
                    pred_labels.append(id2label[p])
                    ori_labels.append(id2label[g.item()])
            # None marks a sentence boundary
            pred_labels.append(None)
            ori_labels.append(None)

    ori_words = []
    for sent in all_ori_words:
        ori_words.extend(sent + [None])

    eval_list = []
    for plabel, olabel, word in zip(pred_labels, ori_labels, ori_words):
        if plabel is not None:
            eval_list.append(f"{word} {olabel} {plabel}\n")
        else:
            eval_list.append("\n")

    if file_name is not None:
        with open(file_name, "w", encoding="utf-8") as f:
            for line in eval_list:
                f.write(line)

    # eval the model
    counts = conlleval.evaluate(eval_list)
    conlleval.report(counts)
def calculate_labeling_scores(results, report=True):
    outputs = []
    for p_b, g_b, t_b, l_b in results:
        for p_s, g_s, t_s, l_s in zip(p_b, g_b, t_b, l_b):
            p_s = p_s[:l_s]
            for p, g, t in zip(p_s, g_s, t_s):
                outputs.append('{} {} {}'.format(t, g, p))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    if report:
        conlleval.report(counts)
    return (overall.fscore * 100.0,
            overall.prec * 100.0,
            overall.rec * 100.0)
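# Minimal sketch (toy data): unlike the tensor-based evaluate() above,
# calculate_labeling_scores() takes plain Python lists, one
# (preds, golds, tokens, lengths) tuple per batch.
results = [(
    [['B-PER', 'O']],       # predicted tags
    [['B-PER', 'O']],       # gold tags
    [['John', 'lives']],    # tokens
    [2],                    # sequence lengths
)]
print(calculate_labeling_scores(results))    # (100.0, 100.0, 100.0)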
def get_output_file(all_logit, all_label, decode, out):
    # drop the highest label id (assumed padding) from the decode map
    decode.pop(len(decode) - 1)
    assert len(all_logit) == len(all_label)
    evalseq = []
    for i in range(len(all_logit)):
        evalseq.append("{} {} {}".format(
            i,
            decode[int(all_label[i])] if int(all_label[i]) in decode else "O",
            decode[int(all_logit[i])] if int(all_logit[i]) in decode else "O",
        ))
    count = conlleval.evaluate(evalseq)
    conlleval.report(count, out)
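# Hypothetical toy call: decode maps label ids to tag strings, and the
# function mutates it by popping the highest id (assumed padding).
# conlleval only needs the tag columns, so the position index stands in
# for the token.
import sys

decode = {0: 'O', 1: 'B-PER', 2: '<PAD>'}
all_label = [1, 0]
all_logit = [1, 0]
get_output_file(all_logit, all_label, decode, sys.stdout)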
def test_format():
    words = "Shyam lives in New York .".split()
    gold = "B-PER O O B-LOC I-LOC O".split()
    pred = "B-PER O O B-LOC O O".split()

    print("Testing inputting the wrong format. This should get an exception")
    try:
        evaluate([1, 2, 3])
    except Exception as e:
        print(e)

    pred = "B-PER O O B-LOC I-MISC O".split()
    print("This should be 40% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.4
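# Why test_format() asserts 0.4: conlleval scores exact entity chunks.
# Gold has chunks PER("Shyam") and LOC("New York"). In the prediction,
# B-LOC followed by I-MISC closes the LOC chunk at "New" and opens a
# MISC chunk at "York", so the prediction has three chunks of which one
# (PER) is correct: precision = 1/3, recall = 1/2, and
# F1 = 2*(1/3)*(1/2) / (1/3 + 1/2) = 0.4.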
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A List of which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) of a batch.
    :param idx_token: Index to token dictionary.
    :param idx_label: Index to label dictionary.
    :param writer: An object (file object) with a write() function.
        Extra output.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    # each batch unpacks as: predicted label ids, gold label ids,
    # sequence lengths, and token ids
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(
                preds_b, golds_b, len_b, tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK)
                outputs.append('{} {} {}'.format(
                    token, idx_label.get(g, 0), idx_label.get(p, 0)))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
def test_entities_at_the_end():
    words = "Shyam lives in New York".split()
    gold = "B-PER O O B-LOC I-LOC".split()
    pred = "B-PER O O B-LOC O".split()

    print("Input gold. This should be perfect.")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 1.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, pred, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0
def compare(gold_toks, gold_tags, pred_toks, pred_tags):
    if len(gold_toks) != len(pred_toks):
        raise ValueError('sentence count mismatch: {} in gold, {} in pred'.
                         format(len(gold_toks), len(pred_toks)))
    lines = []
    for g_toks, g_tags, p_toks, p_tags in zip(gold_toks, gold_tags,
                                              pred_toks, pred_tags):
        if g_toks != p_toks:
            raise ValueError('text mismatch: gold "{}", pred "{}"'.
                             format(g_toks, p_toks))
        for (g_tok, g_tag, p_tag) in zip(g_toks, g_tags, p_tags):
            lines.append('{}\t{}\t{}'.format(g_tok, g_tag, p_tag))
    return conlleval.report(conlleval.evaluate(lines))
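# Hypothetical usage of compare(): parallel lists of sentences, each a
# list of tokens / tags; gold and pred must share the same token
# sequences. Prints a conlleval report for the single toy sentence.
gold_toks = [['John', 'lives', 'in', 'Paris']]
gold_tags = [['B-PER', 'O', 'O', 'B-LOC']]
pred_tags = [['B-PER', 'O', 'O', 'O']]
compare(gold_toks, gold_tags, gold_toks, pred_tags)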
def predict_test_file(fname, input_dim, timesteps, nlabels, labels):
    print('loading data from file ', fname)
    df = pd.read_csv(fname, sep=' ', header=0)
    X = extract_features(df, timesteps, input_dim)
    y = extract_labels(df, timesteps, nlabels)
    print('X temporal reshape: ', X.shape)
    print('y temporal reshape: ', y.shape)
    print('#samples: ', len(X))
    print('#labels: ', len(y))

    # average the output probabilities over all models, then take the max
    # (a stray `break` after the first model has been removed; it
    # contradicted both this comment and the division by len(models))
    m_preds = np.zeros((X.shape[0], timesteps, nlabels))
    for model in models:
        m_preds = m_preds + model.predict(X)
    m_preds = m_preds / len(models)

    # just count and report and we are done
    counts, conf_matrix = conll_eval_counts(m_preds, y, labels)
    print('file: ', fname)
    ceval.report(counts)
    print_cm(conf_matrix, ordered_label_keys(labels))
# imports assumed from the enclosing module:
# import sys
# from collections import Counter, defaultdict
# from typing import Counter as CounterT, DefaultDict, List
# import conlleval
def biobert_metrics(model: NERInferenceSession, input_path: str,
                    output_path: str):
    with open(input_path, "r") as f:
        data = f.readlines()

    total = 0
    for i in data:
        if i == "\n":
            total += 1
    print("Running over " + str(total) + " sentences")

    confusion_matrix: CounterT[str] = Counter()
    token_matrix: DefaultDict[str, DefaultDict[str, int]] = defaultdict(
        lambda: defaultdict(int))

    gs_labels: List[str] = []
    sequence = ""
    line_list = list()
    counter = 0
    for line in data:
        if line == "\n":
            counter += 1
            sys.stdout.write("Predicted {}/{} sentences so far.\r".format(
                counter, total))
            sys.stdout.flush()

            pred_pairs = model.predict(sequence.strip())
            tokens = sequence.strip().split()
            # The tokenization label X and special labels hold no more value
            pred_labels = [
                label[1] for label in pred_pairs
                if label[1] != 'X' and label[0] != '[CLS]'
                and label[0] != '[SEP]'
            ]

            cm, tm = sentence_metrics(pred_labels, gs_labels)
            confusion_matrix.update(cm)
            for gs_label in tm:
                for pred_label in tm[gs_label]:
                    token_matrix[gs_label][pred_label] += tm[gs_label][pred_label]

            line_list = line_list + list(
                map(lambda token, gs, pred: token + " TK " + gs + " " + pred,
                    tokens, gs_labels, pred_labels))

            gs_labels = []
            sequence = ""
            continue

        columns = line.split("\t")
        sequence += columns[0] + " "
        gs_labels.append(columns[1].strip())

    # assumes a conlleval.report variant that returns the report string;
    # the stock conlleval.py report() only prints
    conlleval_res = conlleval.report(conlleval.evaluate(line_list))
    print(conlleval_res)

    # token-level confusion matrix (CM) scores
    cm_r = confusion_matrix["true_positive"] / (
        confusion_matrix["true_positive"] + confusion_matrix["false_negative"])
    cm_p = confusion_matrix["true_positive"] / (
        confusion_matrix["true_positive"] + confusion_matrix["false_positive"])
    cm_f1 = 2 * cm_r * cm_p / (cm_r + cm_p)

    # token matrix (TM) per-tag scores
    b_r = token_matrix["B"]["B"] / (token_matrix["B"]["B"]
                                    + token_matrix["B"]["I"]
                                    + token_matrix["B"]["O"])
    b_p = token_matrix["B"]["B"] / (token_matrix["B"]["B"]
                                    + token_matrix["I"]["B"]
                                    + token_matrix["O"]["B"])
    b_f1 = 2 * b_r * b_p / (b_r + b_p)
    i_r = token_matrix["I"]["I"] / (token_matrix["I"]["B"]
                                    + token_matrix["I"]["I"]
                                    + token_matrix["I"]["O"])
    i_p = token_matrix["I"]["I"] / (token_matrix["B"]["I"]
                                    + token_matrix["I"]["I"]
                                    + token_matrix["O"]["I"])
    i_f1 = 2 * i_r * i_p / (i_r + i_p)
    o_r = token_matrix["O"]["O"] / (token_matrix["O"]["B"]
                                    + token_matrix["O"]["I"]
                                    + token_matrix["O"]["O"])
    o_p = token_matrix["O"]["O"] / (token_matrix["B"]["O"]
                                    + token_matrix["I"]["O"]
                                    + token_matrix["O"]["O"])
    o_f1 = 2 * o_r * o_p / (o_r + o_p)

    with open(output_path, "a+") as out_f:
        out_f.write("\nConlleval results:\n" + conlleval_res)
        out_f.write("\nToken-Level Confusion Matrix:\n"
                    + "True Positive:\t" + str(confusion_matrix["true_positive"])
                    + "\nTrue Negative:\t" + str(confusion_matrix["true_negative"])
                    + "\nFalse Positive:\t" + str(confusion_matrix["false_positive"])
                    + "\nFalse Negative:\t" + str(confusion_matrix["false_negative"])
                    + "\nRecall:\t\t" + str(cm_r)
                    + "\nPrecision:\t" + str(cm_p)
                    + "\nF1-score:\t" + str(cm_f1))
        out_f.write("\n\nToken Matrix (true\\predicted):\n\tB\tI\tO\n"
                    + "B\t" + str(token_matrix["B"]["B"])
                    + "\t" + str(token_matrix["B"]["I"])
                    + "\t" + str(token_matrix["B"]["O"])
                    + "\nI\t" + str(token_matrix["I"]["B"])
                    + "\t" + str(token_matrix["I"]["I"])
                    + "\t" + str(token_matrix["I"]["O"])
                    + "\nO\t" + str(token_matrix["O"]["B"])
                    + "\t" + str(token_matrix["O"]["I"])
                    + "\t" + str(token_matrix["O"]["O"])
                    + "\nB_Recall:\t" + str(b_r)
                    + "\nB_Precision:\t" + str(b_p)
                    + "\nB_F1:\t\t" + str(b_f1)
                    + "\nI_Recall:\t" + str(i_r)
                    + "\nI_Precision:\t" + str(i_p)
                    + "\nI_F1:\t\t" + str(i_f1)
                    + "\nO_Recall:\t" + str(o_r)
                    + "\nO_Precision:\t" + str(o_p)
                    + "\nO_F1:\t\t" + str(o_f1) + "\n")

    print("Confusion matrix:")
    print({**confusion_matrix})
    print("Recall: " + str(cm_r))
    print("Precision: " + str(cm_p))
    print()
    print("Token matrix:")
    print({**token_matrix})
    print()
    # (fragment: the block below runs inside a loop over dataset items;
    # the loop header is not shown)
    words, infer_tags, unknown_tokens = ner.infer(sentence=item["sentence"],
                                                  true_tags=item["labels"])
    cm, tm, em = sentence_metrics(infer_tags, item["labels"])
    confusion_matrix.update(cm)
    entity_matrix.update(em)
    for gs_label in tm:
        for pred_label in tm[gs_label]:
            token_matrix[gs_label][pred_label] += tm[gs_label][pred_label]
    line_list = line_list + list(
        map(lambda token, gs, pred: token + " TK " + gs + " " + pred,
            item["sentence"].split(), item["labels"], infer_tags))

conlleval_res = conlleval.report(conlleval.evaluate(line_list))
print(conlleval_res)

# CM
cm_r = confusion_matrix["true_positive"] / (confusion_matrix["true_positive"]
                                            + confusion_matrix["false_negative"])
cm_p = confusion_matrix["true_positive"] / (confusion_matrix["true_positive"]
                                            + confusion_matrix["false_positive"])
cm_f1 = 2 * cm_r * cm_p / (cm_r + cm_p)

# EM
em_r = entity_matrix["true_positive"] / (entity_matrix["true_positive"]
                                         + entity_matrix["false_negative"])
em_p = entity_matrix["true_positive"] / (entity_matrix["true_positive"]
                                         + entity_matrix["false_positive"])
em_f1 = 2 * em_r * em_p / (em_r + em_p)
def main(argv):
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length    # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)

    print(args.no_context)
    if args.no_context:
        train_data = process_no_context(train_words, train_tags,
                                        tokenizer, seq_len)
        test_data = process_no_context(test_words, test_tags,
                                       tokenizer, seq_len)
    elif args.documentwise:
        tr_docs, tr_doc_tags, tr_line_ids = split_to_documents(train_words,
                                                               train_tags)
        te_docs, te_doc_tags, te_line_ids = split_to_documents(test_words,
                                                               test_tags)
        train_data = process_docs(tr_docs, tr_doc_tags, tr_line_ids,
                                  tokenizer, seq_len)
        test_data = process_docs(te_docs, te_doc_tags, te_line_ids,
                                 tokenizer, seq_len)
    else:
        train_data = process_sentences(train_words, train_tags, tokenizer,
                                       seq_len, args.predict_position)
        test_data = process_sentences(test_words, test_tags, tokenizer,
                                      seq_len, args.predict_position)

    label_list = get_labels(train_data.labels)
    tag_map = {l: i for i, l in enumerate(label_list)}
    inv_tag_map = {v: k for k, v in tag_map.items()}

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)

    train_y, train_weights = label_encode(train_data.combined_labels,
                                          tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels,
                                        tag_map, seq_len)

    if args.use_ner_model and (args.ner_model_dir is not None):
        ner_model, tokenizer, labels, config = load_ner_model(
            args.ner_model_dir)
    else:
        optimizer = create_optimizer(len(train_x[0]), args)
        model = create_ner_model(pretrained_model, len(tag_map))
        if args.num_gpus > 1:
            ner_model = multi_gpu_model(model, args.num_gpus)
        else:
            ner_model = model
        ner_model.compile(
            optimizer,
            loss='sparse_categorical_crossentropy',
            sample_weight_mode='temporal',
            metrics=['sparse_categorical_accuracy']
        )
        ner_model.fit(
            train_x,
            train_y,
            sample_weight=train_weights,
            epochs=args.num_train_epochs,
            batch_size=args.batch_size
        )

    if args.ner_model_dir is not None:
        label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
        save_ner_model(ner_model, tokenizer, label_list, args)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)

    results = []
    m_names = []

    if args.no_context:
        pr_ensemble, pr_test_first = get_predictions(
            preds, test_data.tokens, test_data.sentence_numbers)
        output_file = "output/{}-NC.tsv".format(args.output_file)
        m_names.append('NC')
        ensemble = []
        for i, pred in enumerate(pr_test_first):
            ensemble.append([inv_tag_map[t] for t in pred])
        lines_ensemble, sentences_ensemble = write_result(
            output_file, test_data.words, test_data.lengths,
            test_data.tokens, test_data.labels, ensemble
        )
        c = conlleval.evaluate(lines_ensemble)
        conlleval.report(c)
        results.append([conlleval.metrics(c)[0].prec,
                        conlleval.metrics(c)[0].rec,
                        conlleval.metrics(c)[0].fscore])
    else:
        # First tag, then vote
        pr_ensemble, pr_test_first = get_predictions(
            preds, test_data.tokens, test_data.sentence_numbers)
        # Accumulate probabilities, then vote
        prob_ensemble, prob_test_first = get_predictions2(
            probs, test_data.tokens, test_data.sentence_numbers)
        ens = [pr_ensemble, prob_ensemble, pr_test_first, prob_test_first]
        if args.documentwise:
            # D-CMV: documentwise CMV
            # D-CMVP: documentwise CMV, probs summed, argmax after that
            # D-F: documentwise First
            # D-FP: documentwise First, probs summed
            method_names = ['D-CMV', 'D-CMVP', 'D-F', 'D-FP']
        else:
            method_names = ['CMV', 'CMVP', 'F', 'FP']
        for i, ensem in enumerate(ens):
            ensemble = []
            for j, pred in enumerate(ensem):
                ensemble.append([inv_tag_map[t] for t in pred])
            output_file = "output/{}-{}.tsv".format(args.output_file,
                                                    method_names[i])
            lines_ensemble, sentences_ensemble = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, ensemble)
            print("Model trained: ", args.ner_model_dir)
            print("Seq-len: ", args.max_seq_length)
            print("Learning rate: ", args.learning_rate)
            print("Batch Size: ", args.batch_size)
            print("Epochs: ", args.num_train_epochs)
            print("Training data: ", args.train_data)
            print("Testing data: ", args.test_data)
            print("")
            print("Results with {}".format(method_names[i]))
            c = conlleval.evaluate(lines_ensemble)
            print("")
            conlleval.report(c)
            results.append([conlleval.metrics(c)[0].prec,
                            conlleval.metrics(c)[0].rec,
                            conlleval.metrics(c)[0].fscore])
        m_names.extend(method_names)

    if args.sentence_in_context:
        starting_pos = np.arange(0, seq_len + 1, 32)
        starting_pos[0] = 1
        m_names.extend(starting_pos)
        for start_p in starting_pos:
            tt_lines, tt_tags, line_nos, line_starts = combine_sentences2(
                test_data.tokens, test_data.labels, seq_len - 1, start_p - 1)
            tt_x = encode(tt_lines, tokenizer, seq_len)
            tt_y, train_weights = label_encode(tt_tags, tag_map, seq_len)
            probs = ner_model.predict(tt_x, batch_size=args.batch_size)
            preds = np.argmax(probs, axis=-1)
            pred_tags = []
            for i, pred in enumerate(preds):
                idx = line_nos[i].index(i)
                pred_tags.append([
                    inv_tag_map[t]
                    for t in pred[line_starts[i][idx] + 1:
                                  line_starts[i][idx]
                                  + len(test_data.tokens[i]) + 1]
                ])
            output_file = "output/{}-{}.tsv".format(args.output_file, start_p)
            lines_first, sentences_first = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, pred_tags
            )
            print("")
            print("Results with prediction starting position ", start_p)
            c = conlleval.evaluate(lines_first)
            conlleval.report(c)
            results.append([conlleval.metrics(c)[0].prec,
                            conlleval.metrics(c)[0].rec,
                            conlleval.metrics(c)[0].fscore])

    result_file = "./results/results-{}.csv".format(args.output_file)
    with open(result_file, 'w+') as f:
        for i, line in enumerate(results):
            params = "{},{},{},{},{},{},{},{},{}".format(
                args.output_file, args.max_seq_length, args.bert_config_file,
                args.num_train_epochs, args.learning_rate, args.batch_size,
                args.predict_position, args.train_data, args.test_data)
            f.write(params)
            f.write(",{}".format(m_names[i]))
            for item in line:
                f.write(",{}".format(item))
            f.write('\n')
    for i in results:
        print(i)
    return 0
def evaluate_conlleval_string(self, conlleval_string):
    counts = conlleval.evaluate(conlleval_string.split('\n'),
                                {'delimiter': self.separator})
    full_report = conlleval.report(counts)
    overall, per_label = conlleval.metrics(counts)
    return overall, per_label, full_report
        histogram_freq=1)
    ]
    model.fit(xtr, ytr,
              batch_size=batch_size,
              epochs=nb_epoch,
              verbose=1,
              validation_data=(xte, yte),
              callbacks=callbacks)

    print('loading the currently best model for final evaluation...')
    model = load_model(checkPointPath)

    print('--------------------------------------------------')
    print('Fold ', currentFold, ' performance')
    counts, cmat = conll_eval_counts(model.predict(xte), yte, labels)
    overall, byType = ceval.metrics(counts)
    ceval.report(counts)
    print_cm(cmat, ordered_label_keys(labels))
    foldScores.append(overall.fscore)
    print('\n')
    print('avg f1 fold scores so far: ', np.mean(foldScores))
    currentFold += 1
    # we clear the tensorflow session after each fold to not leak resources
    K.clear_session()

print('f1 fold scores: ', foldScores)
print('final avg f1 fold scores: ', np.mean(foldScores))
def evaluate_evaluation_string(self, connl_evaluation_string):
    counts = conlleval.evaluate(connl_evaluation_string.split('\n'),
                                {'delimiter': self.separator})
    return conlleval.report(counts)
def fit(self, X, y, X_dev, y_dev, num_epoch=10, batch_size=32, seed=1):
    random.seed(seed)
    # materialize the zips so they can be len()'d and sampled (Python 3)
    trainset = list(zip(X, [self._onehot(l, self.labels) for l in y]))
    devset = list(zip(X_dev, [self._onehot(l, self.labels) for l in y_dev]))

    print("Target labels: {}".format(self.labels))

    train_split = trainset
    valid_split = devset
    print("{}/{} in training/validation set".format(len(train_split),
                                                    len(valid_split)))

    trainsp = random.sample(train_split, min(len(X) // 2, 200))
    trainfd = self.compiler.build_feed_dict(trainsp)
    valfd = self.compiler.build_feed_dict(valid_split)

    best_epoch = 0
    best_model = None
    best_score = 0
    epochs_since_best = 0
    for i in range(1, num_epoch + 1):
        estart = time.time()
        batchpool = random.sample(train_split, len(train_split))
        minibatches = []
        for k in range(0, len(batchpool), batch_size):
            pool = batchpool[k:k + batch_size]
            minibatches.append(self.compiler.build_feed_dict(pool))
        self._train_minibatches(minibatches)
        self.sess.run(self.epoch_step_op)

        loss, yt_pred, yt_true = self.sess.run(
            [self.y_loss, self.y_pred_idx, self.y_true_idx], trainfd)
        f1, precision, recall = self.fscore(yt_pred, yt_true)

        yv_pred, yv_true = self.sess.run(
            [self.y_pred_idx, self.y_true_idx], valfd)
        vf1, vprecision, vrecall = self.fscore(yv_pred, yv_true)

        pred_dev = self.predict(X_dev)
        output_dev = []
        # y_sent avoids shadowing the y parameter
        for (x, y_sent, z) in zip(X_dev, y_dev, pred_dev):
            for token, y_true, y_pred in zip(x, y_sent, z):
                output_dev.append('{} {} {}'.format(token, y_true, y_pred))
            output_dev.append('')
        # assumes a conlleval.report variant that returns the overall
        # F-score; the stock conlleval.py report() only prints
        vfb1 = conlleval.report(conlleval.evaluate(output_dev))

        save_marker = ''
        if vfb1 >= best_score:
            best_model = '/tmp/model-{}-e{}-s{}.ckpt'.format(
                type(self).__name__.lower(), i, seed)
            best_epoch, best_score = i, vfb1
            self.saver.save(self.sess, best_model)
            save_marker = '*'
            epochs_since_best = 0
        else:
            epochs_since_best += 1

        elapsed = int(time.time() - estart)
        emin, esec = elapsed // 60, elapsed % 60
        print("epoch {} loss {} fit {:.2f} val {:.2f}/{:.2f}/{:.2f} @ {:.2f}"
              " [{}m{}s] {}".format(i, loss, f1, vf1, vprecision, vrecall,
                                    vfb1, emin, esec, save_marker))
        if epochs_since_best > 10:
            print("Stopping early from lack of improvements..")
            break

    if best_model is None:
        print("WARNING: NO GOOD FIT")
    self.saver.restore(self.sess, best_model)
    print("Fitted to model from epoch {} with score {} at {}".format(
        best_epoch, best_score, best_model))