Пример #1
0
def test_tagged_corpus_get_tag_statistic():
    """Check the per-tag counts returned by ``Corpus._get_tag_to_count``.

    Three sentences are built: two carry "ner" tags on selected tokens and
    one carries none. The counts reported for the "ner" tag type across all
    three are then compared against the number of times each tag was added.
    """
    # (token index, tag) pairs attached to the first sentence.
    train_sentence = Sentence("Zalando Research is located in Berlin .")
    for position, label in ((0, "B-ORG"), (1, "E-ORG"), (5, "S-LOC")):
        train_sentence[position].add_tag("ner", label)

    # The second sentence is built with an explicit tokenizer.
    dev_sentence = Sentence(
        "Facebook, Inc. is a company, and Google is one as well.",
        use_tokenizer=segtok_tokenizer,
    )
    for position, label in ((0, "B-ORG"), (1, "I-ORG"), (2, "E-ORG"), (8, "S-ORG")):
        dev_sentence[position].add_tag("ner", label)

    # The third sentence gets no tags at all.
    test_sentence = Sentence("Nothing to do with companies.")

    sentences = [train_sentence, dev_sentence, test_sentence]
    tag_to_count_dict = Corpus._get_tag_to_count(sentences, "ner")

    # Expected occurrences of each tag over the tagged sentences above.
    expected_counts = (
        ("S-ORG", 1),
        ("S-LOC", 1),
        ("B-ORG", 2),
        ("E-ORG", 2),
        ("I-ORG", 1),
    )
    for label, expected in expected_counts:
        assert expected == tag_to_count_dict[label]
Пример #2
0
def test_tagged_corpus_get_tag_statistic():
    """Verify the 'ner' tag counts produced by ``Corpus._get_tag_to_count``.

    Two sentences receive 'ner' tags on chosen token positions, a third is
    left untagged, and the counts computed over all three are asserted.
    """

    def attach_ner_tags(sentence, labels_by_position):
        # Add each 'ner' label to the token at the given index, in order.
        for position, label in labels_by_position:
            sentence[position].add_tag('ner', label)

    train_sentence = Sentence('Zalando Research is located in Berlin .')
    attach_ner_tags(train_sentence, [(0, 'B-ORG'), (1, 'E-ORG'), (5, 'S-LOC')])

    dev_text = 'Facebook, Inc. is a company, and Google is one as well.'
    dev_sentence = Sentence(dev_text, use_tokenizer=True)
    attach_ner_tags(
        dev_sentence,
        [(0, 'B-ORG'), (1, 'I-ORG'), (2, 'E-ORG'), (8, 'S-ORG')])

    # This sentence carries no tags.
    test_sentence = Sentence('Nothing to do with companies.')

    tag_to_count_dict = Corpus._get_tag_to_count(
        [train_sentence, dev_sentence, test_sentence], 'ner')

    # Tag -> number of times it was added above, checked in a fixed order.
    expected_pairs = [
        ('S-ORG', 1), ('S-LOC', 1), ('B-ORG', 2), ('E-ORG', 2), ('I-ORG', 1)]
    for label, expected in expected_pairs:
        assert (expected == tag_to_count_dict[label])