def test_token_level():
    text = 'A B C D.'
    gold_a = [Annotation('B C', 2, 5, 'PER')]
    gold_b = [Annotation('A', 0, 1, 'ORG'), Annotation('B', 2, 3, 'PER')]
    pred_a = [Annotation('B', 2, 3, 'PER'), Annotation('C', 4, 5, 'PER')]
    pred_b = [Annotation('A', 0, 1, 'ORG'), Annotation('B', 2, 3, 'ORG')]

    gold = [
        Document(name='doc_a', text=text, annotations=gold_a),
        Document(name='doc_b', text=text, annotations=gold_b)
    ]
    predicted = [
        Document(name='doc_a', text=text, annotations=pred_a),
        Document(name='doc_b', text=text, annotations=pred_b)
    ]

    evaluator = Evaluator(gold, predicted)
    scores = evaluator.token_level()

    assert scores.precision('PER') == 1
    assert scores.recall('PER') == 0.6667
    assert scores.f_score('PER') == 0.8
    assert scores.precision('ORG') == 0.5
    assert scores.recall('ORG') == 1
    assert scores.f_score('ORG') == 0.6667
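
# Sketch of the arithmetic behind the expected values above (illustrative only, not the
# Evaluator internals; _prf is a hypothetical helper). PER has 3 gold tokens ('B', 'C' in
# doc_a and 'B' in doc_b) and 2 predicted tokens, both correct; ORG has 1 gold token and
# 2 predicted tokens, of which 1 is correct.
def _prf(tp, fp, fn):
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * precision * recall / (precision + recall)
    return round(precision, 4), round(recall, 4), round(f_score, 4)

assert _prf(tp=2, fp=0, fn=1) == (1.0, 0.6667, 0.8)   # PER
assert _prf(tp=1, fp=1, fn=0) == (0.5, 1.0, 0.6667)   # ORG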
def test_entity_level():
    gold = [
        Document(name='doc_a', text='', annotations=[Annotation('', 3, 6, 'MISC')]),
        Document(name='doc_b', text='', annotations=[Annotation('', 0, 2, 'PER')])
    ]
    predicted = [
        Document(name='doc_a', text='', annotations=[Annotation('', 2, 6, 'MISC')]),
        Document(name='doc_b', text='', annotations=[Annotation('', 0, 2, 'PER')])
    ]

    evaluator = Evaluator(gold, predicted)
    scores = evaluator.entity_level()

    assert scores.micro_avg_f_score() == 0.5
    assert scores.macro_avg_f_score() == 0.5
    assert scores.f_score('PER') == 1
    assert scores.f_score('MISC') == 0
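
# Sketch of how the expected entity-level numbers arise (illustrative arithmetic only).
# Entity-level scoring requires an exact span-and-tag match: the PER entity (0, 2) matches
# exactly (F=1), while the predicted MISC span (2, 6) differs from the gold span (3, 6) (F=0).
# Macro average: mean of per-tag F-scores -> (1 + 0) / 2 = 0.5.
# Micro average: pooled counts over both tags -> TP=1, FP=1, FN=1, so P=R=F=0.5.
macro_f = (1 + 0) / 2
tp, fp, fn = 1, 1, 1
micro_p = tp / (tp + fp)
micro_r = tp / (tp + fn)
micro_f = 2 * micro_p * micro_r / (micro_p + micro_r)
assert macro_f == 0.5 and micro_f == 0.5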
def test_token_annotations():
    evaluator = Evaluator(gold=(), predicted=())

    doc = Document(name='doc_a', text='A B C D.', annotations=[
        Annotation('B C', 2, 5, 'PER'),
        Annotation('D.', 6, 8, 'ORG')
    ])

    assert evaluator.token_annotations(doc) == ['O', 'PER', 'PER', 'ORG']
    assert evaluator.token_annotations(doc, tag_blind=True) == ['O', 'ENT', 'ENT', 'ENT']
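
# Minimal sketch of the assumed tagging logic (not the actual implementation; attribute
# names on Annotation are assumed): a token receives the tag of the annotation whose
# character span covers it, and 'O' otherwise. With tag_blind=True every non-'O' tag
# collapses to 'ENT', which evaluates span detection independently of the tag set.
def _tag_for_token(start, end, annotations, tag_blind=False):
    for ann in annotations:
        if start >= ann.start and end <= ann.end:
            return 'ENT' if tag_blind else ann.tag
    return 'O'

_tokens = [(0, 1), (2, 3), (4, 5), (6, 8)]  # character spans of 'A', 'B', 'C', 'D.'
_anns = [Annotation('B C', 2, 5, 'PER'), Annotation('D.', 6, 8, 'ORG')]
assert [_tag_for_token(s, e, _anns) for s, e in _tokens] == ['O', 'PER', 'PER', 'ORG']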
def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("language", help="Language to use for tokenizer",
                        choices=Evaluator.supported_languages())
    parser.add_argument("documents_path", help="Path to *.txt files")
    parser.add_argument("gold_path", help="Path to gold *.ann files")
    parser.add_argument("pred_path", help="Path to predicted *.ann files")
    return parser.parse_args()
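
# Example invocation (the script name and paths are placeholders; the positional arguments
# map one-to-one to the parser definition above, and 'language' must be one of
# Evaluator.supported_languages()):
#
#   python evaluate.py nl path/to/txt-documents path/to/gold-ann path/to/predicted-ann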
def main(args, model_dir):
    logger.info('Args = {}'.format(args))

    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = flair_utils.standoff_to_flair_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, _ = flair_utils.standoff_to_flair_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test, tokenizer,
                                                                verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(lambda sent: not _ignore_sentence(sent), train_sents))

    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
    logger.info('Train with fraction of training data: {} sents out of {} sentences ({}%)',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    # We need to pass some dev data, otherwise flair raises a ZeroDivisionError
    # See: https://github.com/zalandoresearch/flair/issues/1139
    # We just split the training sample in half and instruct Flair to train_with_dev (see below).
    half = len(train_sents_sample) // 2
    flair_corpus = flair_utils.FilteredCorpus(train=train_sents_sample[:half],
                                              dev=train_sents_sample[half:],
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    logger.info('Train model...')
    tagger = run_bilstmcrf.get_model(flair_corpus,
                                     corpus_name=args.corpus,
                                     embedding_lang=args.embedding_lang,
                                     pooled_contextual_embeddings=True)

    trainer = ModelTrainer(tagger, flair_corpus)
    trainer.train(join(model_dir, 'flair'),
                  max_epochs=150,
                  monitor_train=False,
                  train_with_dev=True,
                  save_final_model=args.save_final_model)

    logger.info('Make predictions...')
    run_bilstmcrf.make_predictions(tagger, flair_corpus)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=flair_utils.flair_sents_to_standoff(test_sents, test_docs))

    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')
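
# Note on the half/half split above: with train_with_dev=True, Flair's ModelTrainer folds the
# dev split back into the training data, so the full train_sents_sample is still used for
# training; the artificial dev split only exists to avoid the ZeroDivisionError referenced
# in the linked issue.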
def main(args, model_dir):
    logger.info('Args = {}'.format(args))

    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = tagging_utils.standoff_to_sents(corpus.train, tokenizer, verbose=True)
    dev_sents, _ = tagging_utils.standoff_to_sents(corpus.dev, tokenizer, verbose=True)
    test_sents, test_docs = tagging_utils.standoff_to_sents(corpus.test, tokenizer, verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(_is_not_meta_sentence, train_sents))

    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False, size=sample_size).tolist()
    logger.info('Train with fraction of training data: {} sents out of {} sentences ({}%)',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = crf_util.FEATURE_EXTRACTOR[args.feature_extractor]

    X_train, y_train = crf_labeler.sents_to_features_and_labels(train_sents_sample,
                                                                feature_extractor)
    X_test, _ = crf_labeler.sents_to_features_and_labels(test_sents, feature_extractor)

    logger.info('len(X_train) = {}'.format(len(X_train)))
    logger.info('len(y_train) = {}'.format(len(y_train)))
    logger.info('len(X_test) = {}'.format(len(X_test)))

    crf = crf_labeler.SentenceFilterCRF(
        ignore_sentence=meta_sentence_filter,
        ignored_label='O',
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )

    logger.info('Start training... {}'.format(crf))
    crf.fit(X_train, y_train)
    logger.info('CRF classes: {}'.format(crf.classes_))

    logger.info('Make predictions...')
    y_pred_test = crf.predict(X_test)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=tagging_utils.sents_to_standoff(y_pred_test, test_docs))

    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir, 'scores_token_blind.csv'))
    logger.info('Done.')
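
# Illustrative sketch of the input format assumed for crf.fit above (sklearn-crfsuite style:
# one list of per-token feature dicts and one list of per-token labels per sentence; the
# concrete feature names and label set depend on the selected feature_extractor and corpus,
# so everything below is a placeholder):
#
#   X_train = [
#       [{'bias': 1.0, 'word.lower()': 'jan', 'word.istitle()': True},
#        {'bias': 1.0, 'word.lower()': 'woont', 'word.istitle()': False}],
#       ...
#   ]
#   y_train = [
#       ['B-PER', 'O'],
#       ...
#   ]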
def evaluate_documents(gold_docs, pred_docs, language='nl'):
    return Evaluator(gold_docs, pred_docs, language=language)
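
# Minimal usage sketch (document contents and the output path are placeholders):
#
#   gold = [Document(name='doc_a', text='Jan', annotations=[Annotation('Jan', 0, 3, 'PER')])]
#   pred = [Document(name='doc_a', text='Jan', annotations=[Annotation('Jan', 0, 3, 'PER')])]
#   scores = evaluate_documents(gold, pred, language='nl').entity_level()
#   scores.to_csv('scores_entity.csv')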