Exemplo n.º 1
0
class TestNerDatasets(unittest.TestCase):
    """Size, type and tag-set checks for the dataset loaders.

    Exercises DDT through its CoNLL-U, simple-NER, flair and spaCy loaders,
    a small WikiAnn sample, the WordSim353Da and DSD word-pair tables, and
    the EuroparlSentiment and LccSentiment tables.
    """

    def setUp(self):
        """Store the expected DDT split sizes and load the dataset."""
        # Expected sentence counts of the predefined train/dev/test splits.
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_ddt_dataset(self):
        """The CoNLL-U loader yields ``Conll`` splits of the expected sizes."""
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        for split in (train, dev, test):
            self.assertIsInstance(split, Conll)

        self.assertEqual(
            [len(train), len(dev), len(test)],
            [self.train_len, self.dev_len, self.test_len])

        # Without predefined splits the loader returns one combined dataset.
        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset),
                         self.train_len + self.dev_len + self.test_len)

    def test_ddt_simple_ner(self):
        """The simple-NER loader yields the expected sizes and entity counts."""
        train, dev, test = self.ddt.load_as_simple_ner(predefined_splits=True)

        # Each split is indexed as (sentences, entities).
        self.assertEqual(
            [len(train[0]), len(dev[0]), len(test[0])],
            [self.train_len, self.dev_len, self.test_len])

        all_sentences, _ = self.ddt.load_as_simple_ner(
            predefined_splits=False)
        self.assertEqual(len(all_sentences),
                         self.train_len + self.dev_len + self.test_len)

        # Count one entity per "B-" tag, keyed by the type after the prefix.
        # A prefix test is used so that a type name containing the letter
        # "B" can never be miscounted.
        data = defaultdict(int)
        for entities in train[1]:
            for entity in entities:
                if entity.startswith("B-"):
                    data[entity[2:]] += 1
        self.assertDictEqual(data, {
            'ORG': 802,
            'LOC': 945,
            'PER': 1249,
            'MISC': 1007
        })

    def test_ddt_dataset_with_flair(self):
        """The flair loader yields a ``ColumnCorpus`` with the expected tags."""
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [
            len(flair_corpus.train),
            len(flair_corpus.dev),
            len(flair_corpus.test)
        ]
        self.assertEqual(flair_lens,
                         [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'B-MISC', b'I-ORG', b'I-PER',
            b'I-LOC', b'I-MISC', b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        # assertCountEqual ignores ordering, so only the tag set matters.
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        """The spaCy loader yields a ``GoldCorpus`` with the expected size."""
        # Reuse the dataset loaded in setUp instead of loading it again.
        corpus = self.ddt.load_with_spacy()

        # Check the type first so a wrong return type fails with a clear
        # assertion message rather than an attribute error below.
        self.assertIsInstance(corpus, GoldCorpus)

        # Index 1 of every training tuple holds that paragraph's sentences.
        num_sents_train = sum(
            len(paragraph[1]) for paragraph in corpus.train_tuples)
        self.assertEqual(self.train_len, num_sents_train)

    def test_wikiann_dataset(self):
        """A small WikiAnn sample loads through flair and spaCy as expected."""
        # Change to a sample of the full wikiann to ease test computation.
        # NOTE: this overwrites entries of the shared DATASETS mapping and
        # does not restore them afterwards.
        DATASETS['wikiann'][
            'url'] = "https://danlp.s3.eu-central-1.amazonaws.com/test-datasets/da.tar.gz"
        DATASETS['wikiann']['size'] = 2502
        DATASETS['wikiann'][
            'md5_checksum'] = 'd0271de38ae23f215b5117450efb9ace'

        wikiann = WikiAnn()
        # Delete the dataset directory even when an assertion below fails.
        # The path is looked up lazily, when the cleanup actually runs.
        self.addCleanup(lambda: shutil.rmtree(wikiann.dataset_dir))

        corpus = wikiann.load_with_flair()

        self.assertEqual(
            [len(corpus.train),
             len(corpus.dev),
             len(corpus.test)], [21, 2, 3])

        ner_tags = corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'I-ORG', b'I-PER', b'I-LOC', b'O',
            b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

        spacy_gold = wikiann.load_with_spacy()
        self.assertIsInstance(spacy_gold, GoldCorpus)

        # Only the first tuple of each split is inspected here.
        num_train_sents = len(list(spacy_gold.train_tuples)[0][1])
        num_dev_sents = len(list(spacy_gold.dev_tuples)[0][1])
        self.assertEqual(num_dev_sents + num_train_sents, 26)

    def test_wordsim353(self):
        """WordSim353Da loads as a 353-row frame with the expected columns."""
        ws353 = WordSim353Da()
        df = ws353.load_with_pandas()

        self.assertEqual(len(df), 353)
        self.assertListEqual(list(df.columns), ['da1', 'da2', 'Human (mean)'])
        self.assertEqual(len(ws353.words()), 424)

    def test_dsd(self):
        """DSD loads as a 99-row frame with the expected columns."""
        dsd = DSD()
        df = dsd.load_with_pandas()

        self.assertEqual(len(df), 99)
        self.assertListEqual(list(df.columns),
                             ['word1', 'word2', 'similarity'])
        self.assertEqual(len(dsd.words()), 197)

    def test_europarlsentiment(self):
        """EuroparlSentiment loads as a 184-row frame."""
        eusent = EuroparlSentiment()
        df = eusent.load_with_pandas()
        self.assertEqual(len(df), 184)

    def test_lccsentiment(self):
        """LccSentiment loads as a 499-row frame."""
        sent = LccSentiment()
        df = sent.load_with_pandas()
        self.assertEqual(len(df), 499)
Exemplo n.º 2
0
class TestNerDatasets(unittest.TestCase):
    """Size, type and tag-set checks for the NER dataset loaders.

    Exercises the simple-NER file helpers, DDT through its CoNLL-U,
    simple-NER, flair and spaCy loaders, and a small WikiAnn sample.
    """

    def setUp(self):
        """Store the expected DDT split sizes and load the dataset."""
        # Expected sentence counts of the predefined train/dev/test splits.
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_write_and_read_simple_ner_dataset(self):
        """Writing and re-reading a simple NER dataset returns the input."""
        import os
        import tempfile

        sentences = [["Jeg", "gik", "en", "tur", "i", "København"],
                     ["Alexandra", "Instituttet", "arbejder", "med", "NLP"]]

        entities = [["O", "O", "O", "O", "O", "B-LOC"],
                    ["B-ORG", "I-ORG", "O", "O", "O"]]

        # Write inside a private temporary directory: the path cannot be
        # claimed by another process and is removed when the block exits,
        # so no file is left behind.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file = os.path.join(tmp_dir, "simple_ner_dataset")
            write_simple_ner_dataset(sentences, entities, tmp_file)

            loaded_sents, loaded_ents = read_simple_ner_dataset(tmp_file)

        self.assertEqual(sentences, loaded_sents)
        self.assertEqual(entities, loaded_ents)

    def test_ddt_dataset(self):
        """The CoNLL-U loader yields ``Conll`` splits of the expected sizes."""
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        for split in (train, dev, test):
            self.assertIsInstance(split, Conll)

        self.assertEqual(
            [len(train), len(dev), len(test)],
            [self.train_len, self.dev_len, self.test_len])

        # Without predefined splits the loader returns one combined dataset.
        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset),
                         self.train_len + self.dev_len + self.test_len)

    def test_ddt_simple_ner(self):
        """The simple-NER loader yields the expected sizes and entity counts."""
        train, dev, test = self.ddt.load_as_simple_ner(predefined_splits=True)

        # Each split is indexed as (sentences, entities).
        self.assertEqual(
            [len(train[0]), len(dev[0]), len(test[0])],
            [self.train_len, self.dev_len, self.test_len])

        all_sentences, _ = self.ddt.load_as_simple_ner(
            predefined_splits=False)
        self.assertEqual(len(all_sentences),
                         self.train_len + self.dev_len + self.test_len)

        # Count one entity per "B-" tag, keyed by the type after the prefix.
        # A prefix test is used so that a type name containing the letter
        # "B" can never be miscounted.
        data = defaultdict(int)
        for entities in train[1]:
            for entity in entities:
                if entity.startswith("B-"):
                    data[entity[2:]] += 1
        self.assertDictEqual(data, {
            'ORG': 802,
            'LOC': 945,
            'PER': 1249,
            'MISC': 1007
        })

    def test_ddt_dataset_with_flair(self):
        """The flair loader yields a ``ColumnCorpus`` with the expected tags."""
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [
            len(flair_corpus.train),
            len(flair_corpus.dev),
            len(flair_corpus.test)
        ]
        self.assertEqual(flair_lens,
                         [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'B-MISC', b'I-ORG', b'I-PER',
            b'I-LOC', b'I-MISC', b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        # assertCountEqual ignores ordering, so only the tag set matters.
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        """The spaCy loader yields a ``GoldCorpus`` with the expected size."""
        # Reuse the dataset loaded in setUp instead of loading it again.
        corpus = self.ddt.load_with_spacy()

        # Check the type first so a wrong return type fails with a clear
        # assertion message rather than an attribute error below.
        self.assertIsInstance(corpus, GoldCorpus)

        # Index 1 of every training tuple holds that paragraph's sentences.
        num_sents_train = sum(
            len(paragraph[1]) for paragraph in corpus.train_tuples)
        self.assertEqual(self.train_len, num_sents_train)

    def test_wikiann_dataset(self):
        """A small WikiAnn sample loads through flair and spaCy as expected."""
        # Change to a sample of the full wikiann to ease test computation.
        # NOTE: this overwrites entries of the shared DATASETS mapping and
        # does not restore them afterwards.
        DATASETS['wikiann']['url'] = DANLP_STORAGE_URL + "/tests/da.tar.gz"
        DATASETS['wikiann']['size'] = 2502
        DATASETS['wikiann'][
            'md5_checksum'] = 'd0271de38ae23f215b5117450efb9ace'

        wikiann = WikiAnn()
        # Delete the dataset directory even when an assertion below fails.
        # The path is looked up lazily, when the cleanup actually runs.
        self.addCleanup(lambda: shutil.rmtree(wikiann.dataset_dir))

        corpus = wikiann.load_with_flair()

        self.assertEqual(
            [len(corpus.train),
             len(corpus.dev),
             len(corpus.test)], [21, 2, 3])

        ner_tags = corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'I-ORG', b'I-PER', b'I-LOC', b'O',
            b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

        spacy_gold = wikiann.load_with_spacy()
        self.assertIsInstance(spacy_gold, GoldCorpus)

        # Only the first tuple of each split is inspected here.
        num_train_sents = len(list(spacy_gold.train_tuples)[0][1])
        num_dev_sents = len(list(spacy_gold.dev_tuples)[0][1])
        self.assertEqual(num_dev_sents + num_train_sents, 26)
Exemplo n.º 3
0
class TestNerDatasets(unittest.TestCase):
    """Size, type and tag-set checks for the dataset loaders.

    Exercises DDT through its CoNLL-U, flair and spaCy loaders, a small
    WikiAnn sample, and the WordSim353Da and DSD word-pair tables.
    """

    def setUp(self):
        """Store the expected DDT split sizes and load the dataset."""
        # Expected sizes of the predefined train/dev/test splits.
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_ddt_dataset(self):
        """The CoNLL-U loader yields ``Conll`` splits of the expected sizes."""
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        for split in (train, dev, test):
            self.assertIsInstance(split, Conll)

        self.assertEqual([len(train), len(dev), len(test)], [self.train_len, self.dev_len, self.test_len])

        # Without predefined splits the loader returns one combined dataset.
        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset), self.train_len + self.dev_len + self.test_len)

    def test_ddt_dataset_with_flair(self):
        """The flair loader yields a ``ColumnCorpus`` with the expected tags."""
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [len(flair_corpus.train), len(flair_corpus.dev), len(flair_corpus.test)]
        self.assertEqual(flair_lens, [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC',
            b'I-ORG', b'I-PER', b'I-LOC',
            b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        # assertCountEqual ignores ordering, so only the tag set matters.
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        """The spaCy loader yields a ``GoldCorpus``."""
        # Reuse the dataset loaded in setUp instead of loading it again.
        corpus = self.ddt.load_with_spacy()
        self.assertIsInstance(corpus, GoldCorpus)

    def test_wikiann_dataset(self):
        """A small WikiAnn sample loads through flair and spaCy as expected."""
        # Change to a sample of the full wikiann to ease test computation.
        # NOTE: this overwrites entries of the shared DATASETS mapping and
        # does not restore them afterwards.
        DATASETS['wikiann']['url'] = "https://danlp.s3.eu-central-1.amazonaws.com/test-datasets/da.tar.gz"
        DATASETS['wikiann']['size'] = 2502
        DATASETS['wikiann']['md5_checksum'] = 'd0271de38ae23f215b5117450efb9ace'

        wikiann = WikiAnn()
        # Delete the dataset directory even when an assertion below fails.
        # The path is looked up lazily, when the cleanup actually runs.
        self.addCleanup(lambda: shutil.rmtree(wikiann.dataset_dir))

        corpus = wikiann.load_with_flair()

        self.assertEqual([len(corpus.train), len(corpus.dev), len(corpus.test)], [21, 2, 3])

        ner_tags = corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC',
            b'I-ORG', b'I-PER', b'I-LOC',
            b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

        spacy_gold = wikiann.load_with_spacy()
        self.assertIsInstance(spacy_gold, GoldCorpus)

        # Only the first tuple of each split is inspected here.
        num_train_sents = len(list(spacy_gold.train_tuples)[0][1])
        num_dev_sents = len(list(spacy_gold.dev_tuples)[0][1])
        self.assertEqual(num_dev_sents + num_train_sents, 26)

    def test_wordsim353(self):
        """WordSim353Da loads as a 353-row frame with the expected columns."""
        ws353 = WordSim353Da()
        df = ws353.load_with_pandas()

        self.assertEqual(len(df), 353)
        self.assertListEqual(list(df.columns), ['da1', 'da2', 'Human (mean)'])
        self.assertEqual(len(ws353.words()), 424)

    def test_dsd(self):
        """DSD loads as a 99-row frame with the expected columns."""
        dsd = DSD()
        df = dsd.load_with_pandas()

        self.assertEqual(len(df), 99)
        self.assertListEqual(list(df.columns), ['word1', 'word2', 'similarity'])
        self.assertEqual(len(dsd.words()), 197)
Exemplo n.º 4
0
from flair.data import Sentence, Token

from danlp.datasets import DDT
from danlp.models import load_spacy_model, load_flair_pos_model

# benchmarking polyglotmodel requires
from polyglot.tag import POSTagger
from polyglot.text import WordList

import os
import spacy

# Load the DDT dataset.
ddt = DDT()

# Gold 'pos' tag values for every sentence of the flair test split.
corpus_flair = ddt.load_with_flair()
tags_true = [
    [tok.tags['pos'].value for tok in fs]
    for fs in corpus_flair.test
]
num_sentences = len(tags_true)
num_tokens = sum(map(len, tags_true))

# Token forms of the test set, which is index 2 of the CoNLL-U splits.
ccorpus_conll = ddt.load_as_conllu(predefined_splits=True)
sentences_tokens = []
for sent in ccorpus_conll[2]:
    sentences_tokens += [[token.form for token in sent._tokens]]


def benchmark_flair_mdl():
    tagger = load_flair_pos_model()

    start = time.time()
Exemplo n.º 5
0
class TestNerDatasets(unittest.TestCase):
    def setUp(self):
        """Record the expected DDT split sizes and load the dataset."""
        # Expected sentence counts for the train, dev and test splits.
        self.train_len, self.dev_len, self.test_len = 4383, 564, 565

        # unittest runs setUp before each test, so each one gets its own DDT.
        self.ddt = DDT()

    def test_write_and_read_simple_ner_dataset(self):
        """Writing and re-reading a simple NER dataset returns the input."""
        import os
        import tempfile

        sentences = [["Jeg", "gik", "en", "tur", "i", "København"],
                     ["Alexandra", "Instituttet", "arbejder", "med", "NLP"]]

        entities = [["O", "O", "O", "O", "O", "B-LOC"],
                    ["B-ORG", "I-ORG", "O", "O", "O"]]

        # Write inside a private temporary directory: the path cannot be
        # claimed by another process and is removed when the block exits,
        # so no file is left behind.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file = os.path.join(tmp_dir, "simple_ner_dataset")
            write_simple_ner_dataset(sentences, entities, tmp_file)

            loaded_sents, loaded_ents = read_simple_ner_dataset(tmp_file)

        self.assertEqual(sentences, loaded_sents)
        self.assertEqual(entities, loaded_ents)

    def test_ddt_dataset(self):
        """The CoNLL-U loader yields ``Conll`` splits of the expected sizes."""
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        # Every predefined split must be a Conll object.
        for split in (train, dev, test):
            self.assertIsInstance(split, Conll)

        expected_lens = [self.train_len, self.dev_len, self.test_len]
        self.assertEqual([len(train), len(dev), len(test)], expected_lens)

        # Without predefined splits the loader returns one combined dataset.
        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset), sum(expected_lens))

    def test_ddt_simple_ner(self):
        """The simple-NER loader yields the expected sizes and entity counts."""
        train, dev, test = self.ddt.load_as_simple_ner(predefined_splits=True)

        # Each split is indexed as (sentences, entities).
        self.assertEqual(
            [len(train[0]), len(dev[0]), len(test[0])],
            [self.train_len, self.dev_len, self.test_len])

        all_sentences, _ = self.ddt.load_as_simple_ner(
            predefined_splits=False)
        self.assertEqual(len(all_sentences),
                         self.train_len + self.dev_len + self.test_len)

        # Count one entity per "B-" tag, keyed by the type after the prefix.
        # A prefix test is used so that a type name containing the letter
        # "B" can never be miscounted.
        data = defaultdict(int)
        for entities in train[1]:
            for entity in entities:
                if entity.startswith("B-"):
                    data[entity[2:]] += 1
        self.assertDictEqual(data, {
            'ORG': 802,
            'LOC': 945,
            'PER': 1249,
            'MISC': 1007
        })

    def test_ddt_dataset_with_flair(self):
        """The flair loader yields a ``ColumnCorpus`` with the expected tags."""
        flair_corpus = self.ddt.load_with_flair()
        self.assertIsInstance(flair_corpus, ColumnCorpus)

        # Split sizes must match the values recorded in setUp.
        train_size = len(flair_corpus.train)
        dev_size = len(flair_corpus.dev)
        test_size = len(flair_corpus.test)
        self.assertEqual([train_size, dev_size, test_size],
                         [self.train_len, self.dev_len, self.test_len])

        # assertCountEqual ignores ordering, so only the tag set matters.
        expected_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'B-MISC',
            b'I-ORG', b'I-PER', b'I-LOC', b'I-MISC',
            b'O', b'<START>', b'<STOP>', b'<unk>',
        ]
        tag_dictionary = flair_corpus.make_tag_dictionary('ner')
        self.assertCountEqual(tag_dictionary.idx2item, expected_tags)

    def test_ddt_dataset_with_spacy(self):
        """The spaCy loader yields a ``GoldCorpus`` with the expected size."""
        # Reuse the dataset loaded in setUp instead of loading it again.
        corpus = self.ddt.load_with_spacy()

        # Check the type first so a wrong return type fails with a clear
        # assertion message rather than an attribute error below.
        self.assertIsInstance(corpus, GoldCorpus)

        # Index 1 of every training tuple holds that paragraph's sentences.
        num_sents_train = sum(
            len(paragraph[1]) for paragraph in corpus.train_tuples)
        self.assertEqual(self.train_len, num_sents_train)