示例#1
0
 def test_udpipe(self):
     """Lemmatize corpus tokens with a 'Slovenian' UDPipe lemmatizer.

     After the first meta cell is set to 'sem', the first document is
     expected to yield the single lemma 'biti', and the returned corpus
     is expected to list two preprocessors.
     """
     lemmatizer = preprocess.UDPipeLemmatizer('Slovenian')
     self.corpus.metas[0, 0] = 'sem'
     lemmatized = lemmatizer(self.corpus)
     first_doc_tokens = list(lemmatized.tokens[0])
     self.assertListEqual(first_doc_tokens, ['biti'])
     applied = lemmatized.used_preprocessor.preprocessors
     self.assertEqual(len(applied), 2)
 def test_udpipe_doc(self):
     """Lemmatize a raw sentence using UDPipe's own tokenization.

     The lemmatizer is configured through its ``language`` and
     ``use_tokenizer`` attributes, then ``normalize_doc`` is called on a
     whole sentence and its result is compared with the expected lemmas.
     """
     lemmatizer = preprocess.UDPipeLemmatizer()
     lemmatizer.language = 'Slovenian'
     lemmatizer.use_tokenizer = True
     sentence = 'Gori na gori hiša gori'
     expected = ['gora', 'na', 'gora', 'hiša', 'goreti']
     self.assertListEqual(lemmatizer.normalize_doc(sentence), expected)
示例#3
0
 def test_udpipe_doc(self):
     """Run UDPipe lemmatization with its own tokenization on a corpus.

     The first meta cell is replaced with a sentence; the first
     document's tokens must equal the expected lemmas and the returned
     corpus must list exactly one preprocessor.
     """
     lemmatizer = preprocess.UDPipeLemmatizer('Slovenian', True)
     self.corpus.metas[0, 0] = 'Gori na gori hiša gori'
     result = lemmatizer(self.corpus)
     expected = ['gora', 'na', 'gora', 'hiša', 'goreti']
     self.assertListEqual(list(result.tokens[0]), expected)
     applied = result.used_preprocessor.preprocessors
     self.assertEqual(len(applied), 1)
    def test_udpipe_pickle(self):
        """Check that a lemmatizer survives a pickle round trip.

        The restored object must report the same ``language`` as the
        original and must still lemmatize a document.
        """
        original = preprocess.UDPipeLemmatizer()
        original.language = 'English'

        serialized = pickle.dumps(original)
        restored = pickle.loads(serialized)
        self.assertEqual(original.language, restored.language)
        lemmas = restored.normalize_doc('peter piper pickled')
        self.assertEqual(lemmas, ['peter', 'piper', 'pickle'])
示例#5
0
 def test_udpipe_deepcopy(self):
     """Check that a deep-copied lemmatizer keeps its settings and works.

     The private language and tokenizer attributes of the copy are
     compared with the original's, then the copy is applied to the
     corpus and the first document's tokens are checked.
     """
     source = preprocess.UDPipeLemmatizer('Slovenian', True)
     copied = copy.deepcopy(source)
     # Compare the private attributes in the same order as before.
     for attr in ('_UDPipeLemmatizer__language',
                  '_UDPipeLemmatizer__use_tokenizer'):
         self.assertEqual(getattr(source, attr), getattr(copied, attr))
     self.corpus.metas[0, 0] = 'Gori na gori hiša gori'
     expected = ['gora', 'na', 'gora', 'hiša', 'goreti']
     first_doc_tokens = list(copied(self.corpus).tokens[0])
     self.assertEqual(first_doc_tokens, expected)
示例#6
0
 def test_udpipe_pickle(self):
     """Check that a pickled-and-restored lemmatizer keeps its settings.

     The private language and tokenizer attributes of the restored
     object are compared with the original's, then the restored object
     is applied to the corpus and the first document's tokens are
     checked.
     """
     source = preprocess.UDPipeLemmatizer('Slovenian', True)
     restored = pickle.loads(pickle.dumps(source))
     # Compare the private attributes in the same order as before.
     for attr in ('_UDPipeLemmatizer__language',
                  '_UDPipeLemmatizer__use_tokenizer'):
         self.assertEqual(getattr(source, attr), getattr(restored, attr))
     self.corpus.metas[0, 0] = 'Gori na gori hiša gori'
     expected = ['gora', 'na', 'gora', 'hiša', 'goreti']
     first_doc_tokens = list(restored(self.corpus).tokens[0])
     self.assertEqual(first_doc_tokens, expected)
示例#7
0
    def test_cache(self):
        """Check the normalization cache after a run and after pickling.

        After applying the lemmatizer to the corpus, the cache must map
        'sem' to 'biti' and hold 40 entries; a pickle round trip must
        produce a lemmatizer whose cache is empty.
        """
        lemmatizer = preprocess.UDPipeLemmatizer('Slovenian')
        with self.corpus.unlocked():
            self.corpus.metas[0, 0] = 'sem'
        lemmatizer(self.corpus)
        cache = lemmatizer._normalization_cache
        self.assertEqual(cache['sem'], 'biti')
        self.assertEqual(40, len(cache))

        # The cache is expected to be dropped when pickling.
        restored = pickle.loads(pickle.dumps(lemmatizer))
        self.assertEqual(0, len(restored._normalization_cache))
示例#8
0
 def test_udpipe_pickle(self):
     """Check pickling of a lemmatizer that has already been used.

     The private language and tokenizer attributes of the restored
     object are compared with the original's, then the restored object
     is applied to the corpus and the first document's tokens are
     checked.
     """
     source = preprocess.UDPipeLemmatizer('Slovenian', True)
     # UDPipe stores its model after the first call and that model is
     # not picklable, so use the lemmatizer once before pickling it.
     source(self.corpus)
     restored = pickle.loads(pickle.dumps(source))
     for attr in ('_UDPipeLemmatizer__language',
                  '_UDPipeLemmatizer__use_tokenizer'):
         self.assertEqual(getattr(source, attr), getattr(restored, attr))
     with self.corpus.unlocked():
         self.corpus.metas[0, 0] = 'Gori na gori hiša gori'
     expected = ['gora', 'na', 'gora', 'hiša', 'goreti']
     first_doc_tokens = list(restored(self.corpus).tokens[0])
     self.assertEqual(first_doc_tokens, expected)
示例#9
0
 def test_call_UDPipe(self):
     """Apply a default UDPipe lemmatizer to a corpus without tokens.

     The input corpus must report no tokens beforehand; the returned
     corpus must report tokens and list two preprocessors.
     """
     lemmatizer = preprocess.UDPipeLemmatizer()
     self.assertFalse(self.corpus.has_tokens())
     processed = lemmatizer(self.corpus)
     self.assertTrue(processed.has_tokens())
     applied = processed.used_preprocessor.preprocessors
     self.assertEqual(len(applied), 2)
示例#10
0
 def test_udpipe(self):
     """Lemmatize a single token by calling the lemmatizer directly.

     With ``language`` set to 'Slovenian', calling the lemmatizer on
     'sem' is expected to return 'biti'.
     """
     lemmatizer = preprocess.UDPipeLemmatizer()
     lemmatizer.language = 'Slovenian'
     lemma = lemmatizer('sem')
     self.assertEqual(lemma, 'biti')