예제 #1
0
def test_tagged_corpus_get_tag_statistic():
    """Verify the per-tag counts returned by ``TaggedCorpus._get_tag_to_count``.

    Three sentences are built: one tagged by hand on whitespace-split text,
    one tagged by hand after being created with ``use_tokenizer=True``, and
    one that receives no tags at all.  The helper is then asked to count the
    ``ner`` tags over all three, and every expected tally is checked.
    """
    tag_type = u'ner'

    # First sentence: tags placed on token positions 0, 1 and 5.
    train_sentence = Sentence(u'Zalando Research is located in Berlin .')
    for position, label in ((0, u'B-ORG'), (1, u'E-ORG'), (5, u'S-LOC')):
        train_sentence[position].add_tag(tag_type, label)

    # Second sentence: the indices below refer to tokens produced with
    # use_tokenizer=True, so they depend on that tokenizer's output.
    dev_sentence = Sentence(
        u'Facebook, Inc. is a company, and Google is one as well.',
        use_tokenizer=True)
    dev_labels = (
        (0, u'B-ORG'),
        (1, u'I-ORG'),
        (2, u'E-ORG'),
        (8, u'S-ORG'),
    )
    for position, label in dev_labels:
        dev_sentence[position].add_tag(tag_type, label)

    # Third sentence is deliberately left without any tags.
    test_sentence = Sentence(u'Nothing to do with companies.')

    sentences = [train_sentence, dev_sentence, test_sentence]
    tag_to_count_dict = TaggedCorpus._get_tag_to_count(sentences, tag_type)

    # Checked in a fixed order so the first failing tag is reported
    # deterministically.
    expected_counts = (
        (u'S-ORG', 1),
        (u'S-LOC', 1),
        (u'B-ORG', 2),
        (u'E-ORG', 2),
        (u'I-ORG', 1),
    )
    for label, expected in expected_counts:
        assert expected == tag_to_count_dict[label]
예제 #2
0
파일: test_data.py 프로젝트: bluesea0/ditk
def test_tagged_corpus_get_tag_statistic():
    """Check that ``TaggedCorpus._get_tag_to_count`` tallies ``ner`` tags.

    Builds one manually tagged sentence from pre-split text, one manually
    tagged sentence created with ``use_tokenizer=True``, and one untagged
    sentence, then asserts the count reported for each tag value.
    """
    tag_type = "ner"

    def apply_tags(sentence, labels_by_index):
        # Attach each label to the token at the given position, in the
        # order the mapping lists them.
        for index, label in labels_by_index.items():
            sentence[index].add_tag(tag_type, label)

    train_sentence = Sentence("Zalando Research is located in Berlin .")
    apply_tags(train_sentence, {0: "B-ORG", 1: "E-ORG", 5: "S-LOC"})

    # Positions here index into the tokens produced with use_tokenizer=True.
    dev_sentence = Sentence(
        "Facebook, Inc. is a company, and Google is one as well.",
        use_tokenizer=True,
    )
    apply_tags(dev_sentence, {0: "B-ORG", 1: "I-ORG", 2: "E-ORG", 8: "S-ORG"})

    # No tags are added to this sentence.
    test_sentence = Sentence("Nothing to do with companies.")

    tag_to_count_dict = TaggedCorpus._get_tag_to_count(
        [train_sentence, dev_sentence, test_sentence],
        tag_type,
    )

    # Insertion order of this mapping fixes the order of the checks.
    expected_by_tag = {
        "S-ORG": 1,
        "S-LOC": 1,
        "B-ORG": 2,
        "E-ORG": 2,
        "I-ORG": 1,
    }
    for tag, expected in expected_by_tag.items():
        assert expected == tag_to_count_dict[tag]