def test_tagged_corpus_statistics_multi_label():
    """Check label counts and token counts when one sentence has two labels.

    Three sentences are built with ``segtok_tokenizer``: two carry a single
    label each and the third carries both labels, so each label is expected
    to be counted twice.
    """
    corpus_sentences = (
        Sentence(
            "I love Berlin.",
            labels=["class_1"],
            use_tokenizer=segtok_tokenizer,
        ),
        Sentence(
            "The sun is shining.",
            labels=["class_2"],
            use_tokenizer=segtok_tokenizer,
        ),
        Sentence(
            "Berlin is sunny.",
            labels=["class_1", "class_2"],
            use_tokenizer=segtok_tokenizer,
        ),
    )

    # Each helper receives its own fresh list, as separate call sites would.
    label_counts = Corpus._get_class_to_count(list(corpus_sentences))

    expected_counts = {"class_1": 2, "class_2": 2}
    for label in expected_counts:
        assert label in label_counts
    for label, expected in expected_counts.items():
        assert label_counts[label] == expected

    token_counts = Corpus._get_tokens_per_sentence(list(corpus_sentences))

    expected_lengths = (4, 5, 4)
    assert len(token_counts) == len(expected_lengths)
    for position, expected in enumerate(expected_lengths):
        assert token_counts[position] == expected
def class_distribution(self, multiclass: bool = False, nr_classes: int = 10, savefig_file=None, **kwargs):
    """Plot how often each class label occurs in ``self.sentences``.

    Label counts come from ``Corpus._get_class_to_count`` and are sorted in
    descending order; only the ``nr_classes`` most frequent labels are
    plotted individually.

    Args:
        multiclass: When false, a pie chart is drawn and any labels beyond
            the first ``nr_classes`` are summed into a single ``'others'``
            slice. When true, a bar chart of the top labels is drawn instead.
        nr_classes: Number of most frequent labels to plot individually.
        savefig_file: If truthy, the figure is saved to
            ``self.path / savefig_file`` at 600 dpi before being shown.
        **kwargs: Forwarded to the pandas plotting call.
    """
    raw_counts = Corpus._get_class_to_count(self.sentences)
    frequency_table = pd.DataFrame.from_dict(
        raw_counts, orient='index', columns=['count']
    )
    frequency_table = frequency_table.sort_values('count', ascending=False)

    # NOTE(review): the rendered HTML is neither used nor returned in this
    # block -- confirm whether it was meant to be returned to the caller.
    html_table = frequency_table.to_html()

    # Work on a copy so adding the 'others' row leaves the full table intact.
    top_classes = frequency_table[:nr_classes].copy()

    if multiclass:
        top_classes.plot.bar(y='count', **kwargs)
        axes = plt.gca()
        axes.yaxis.grid(True, linestyle='--')
    else:
        has_remainder = nr_classes < len(frequency_table)
        if has_remainder:
            # Fold every label outside the top ``nr_classes`` into one slice.
            remainder = frequency_table[nr_classes:]
            top_classes.loc['others'] = remainder.sum()
        top_classes.plot.pie(y='count', **kwargs)
        plt.legend(
            labels=top_classes.index,
            bbox_to_anchor=(1, 0, 0.1, 1),
            loc='center right',
        )

    plt.tight_layout()

    # Save before showing the figure.
    if savefig_file:
        target = self.path / savefig_file
        plt.savefig(target, dpi=600)

    plt.show()
def test_tagged_corpus_statistics_multi_label():
    """Check label and token statistics for sentences with multiple labels.

    Sentences are tokenized via ``use_tokenizer=True``. Two sentences have a
    single label and one has both, so each label should be counted twice.
    """
    texts_and_labels = (
        ('I love Berlin.', ['class_1']),
        ('The sun is shining.', ['class_2']),
        ('Berlin is sunny.', ['class_1', 'class_2']),
    )
    sentences = [
        Sentence(text, labels=labels, use_tokenizer=True)
        for text, labels in texts_and_labels
    ]

    # Pass a fresh list to each helper, mirroring independent call sites.
    counts_by_class = Corpus._get_class_to_count(list(sentences))

    for class_name in ('class_1', 'class_2'):
        assert class_name in counts_by_class
    for class_name in ('class_1', 'class_2'):
        assert counts_by_class[class_name] == 2

    tokens_per_sentence = Corpus._get_tokens_per_sentence(list(sentences))

    assert len(tokens_per_sentence) == 3
    first, second, third = (tokens_per_sentence[i] for i in range(3))
    assert first == 4
    assert second == 5
    assert third == 4