def test_tagged_corpus_statistics_multi_label():
    """Check class counts and token counts when one sentence has two labels.

    Three sentences are built; the last one carries both ``class_1`` and
    ``class_2``, so each class is expected to be counted twice.
    """
    sentences = (
        Sentence("I love Berlin.", labels=["class_1"], use_tokenizer=True),
        Sentence("The sun is shining.", labels=["class_2"], use_tokenizer=True),
        Sentence(
            "Berlin is sunny.",
            labels=["class_1", "class_2"],
            use_tokenizer=True,
        ),
    )
    labels = ("class_1", "class_2")

    # Each helper receives its own freshly built list of the sentences.
    label_counts = TaggedCorpus._get_class_to_count(list(sentences))
    for label in labels:
        assert label in label_counts
    for label in labels:
        assert 2 == label_counts[label]

    token_counts = TaggedCorpus._get_tokens_per_sentence(list(sentences))
    assert 3 == len(token_counts)
    # Expected token count per sentence, in construction order.
    for position, expected in enumerate((4, 5, 4)):
        assert expected == token_counts[position]
def test_tagged_corpus_statistics_multi_label():
    """Verify corpus statistics for sentences where one has two labels.

    Sentences are tokenized with ``use_tokenizer='segtok'``. The third
    sentence is tagged with both classes, so every class should be counted
    twice across the three sentences.
    """
    texts_and_labels = (
        ('I love Berlin.', ['class_1']),
        ('The sun is shining.', ['class_2']),
        ('Berlin is sunny.', ['class_1', 'class_2']),
    )
    # Built in the order train, dev, test.
    sentences = [
        Sentence(text, labels=labels, use_tokenizer='segtok')
        for text, labels in texts_and_labels
    ]

    # Each helper receives its own copy of the sentence list.
    counts_by_class = TaggedCorpus._get_classes_to_count(list(sentences))
    for class_name in ('class_1', 'class_2'):
        assert class_name in counts_by_class
    for class_name in ('class_1', 'class_2'):
        assert 2 == counts_by_class[class_name]

    lengths = TaggedCorpus._get_tokens_per_sentence(list(sentences))
    assert 3 == len(lengths)
    # Expected token count for each sentence, in the order built above.
    for index, expected_length in enumerate((4, 5, 4)):
        assert expected_length == lengths[index]
def test_tagged_corpus_statistics():
    """Check class counts and token counts for single-label sentences.

    Two of the three sentences are labelled ``class_1`` and one is labelled
    ``class_2``; the expected per-class counts are therefore 2 and 1.
    """
    sentences = (
        Sentence(u'I love Berlin.',
                 labels=[Label(u'class_1')], use_tokenizer=True),
        Sentence(u'The sun is shining.',
                 labels=[Label(u'class_2')], use_tokenizer=True),
        Sentence(u'Berlin is sunny.',
                 labels=[Label(u'class_1')], use_tokenizer=True),
    )
    expected_counts = ((u'class_1', 2), (u'class_2', 1))

    # Each helper receives its own freshly built list of the sentences.
    label_counts = TaggedCorpus._get_class_to_count(list(sentences))
    for label, _ in expected_counts:
        assert label in label_counts
    for label, count in expected_counts:
        assert count == label_counts[label]

    token_counts = TaggedCorpus._get_tokens_per_sentence(list(sentences))
    assert 3 == len(token_counts)
    # Expected token count per sentence, in construction order.
    for position, expected in enumerate((4, 5, 4)):
        assert expected == token_counts[position]