Exemplo n.º 1
0
    def test_pickle_corpus(self):
        """
        Regression test for gh-590: a corpus has to be picklable, because
        the save data widget relies on it.
        """
        corpus = Corpus.from_file('book-excerpts')

        # Pickling has to succeed for a preprocessed corpus as well, so the
        # whole pipeline is applied before serializing.
        self.pp_list = [
            preprocess.LowercaseTransformer(),
            preprocess.WordPunctTokenizer(),
            preprocess.SnowballStemmer(),
            preprocess.FrequencyFilter(),
            preprocess.StopwordsFilter()
        ]
        for preprocessor in self.pp_list:
            corpus = preprocessor(corpus)

        # No assertion needed: the test passes when dumps() does not raise.
        pickle.dumps(corpus)
Exemplo n.º 2
0
    def test_preprocess_words(self):
        """
        Check that `_preprocess_words` runs the words through the
        preprocessors that were applied to the corpus.
        """
        corpus = Corpus.from_file("book-excerpts")

        # Stage 1: text transformers only (lowercase, accents, URLs, HTML).
        raw_words = [
            "House",
            "dóctor",
            "boy",
            "way",
            "Rum https://google.com",
            "https://google.com",
            "<p>abra<b>cadabra</b><p>",
        ]
        transformers = (
            preprocess.LowercaseTransformer(),
            preprocess.StripAccentsTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer(),
        )
        for transformer in transformers:
            corpus = transformer(corpus)

        # The URL-only entry is expected to be absent from the result, so
        # the output holds one word fewer than the input.
        cleaned = _preprocess_words(corpus, raw_words, dummy_callback)
        self.assertListEqual(
            ["house", "doctor", "boy", "way", "rum", "abracadabra"],
            cleaned,
        )

        # Stage 2: additionally stem the already transformed corpus.
        raw_words = ["House", "dóctor", "boys", "way", "Rum"]
        corpus = preprocess.SnowballStemmer()(corpus)

        stemmed = _preprocess_words(corpus, raw_words, dummy_callback)
        self.assertListEqual(
            ["hous", "doctor", "boy", "way", "rum"],
            stemmed,
        )
Exemplo n.º 3
0
 def test_snowball(self):
     """The French Snowball preprocessor must agree with NLTK's stemmer."""
     french_stemmer = preprocess.SnowballStemmer('french')
     word = 'voudrais'

     actual = french_stemmer._preprocess(word)
     # Reference value comes straight from NLTK's Snowball implementation.
     expected = nltk.SnowballStemmer(language='french').stem(word)

     self.assertEqual(actual, expected)
Exemplo n.º 4
0
 def test_call_snowball(self):
     """Calling the Snowball stemmer on an untokenized corpus yields tokens."""
     stemmer = preprocess.SnowballStemmer()

     # Precondition: the fixture corpus has not been tokenized yet.
     self.assertFalse(self.corpus.has_tokens())

     stemmed = stemmer(self.corpus)
     self.assertTrue(stemmed.has_tokens())

     # NOTE(review): two preprocessors are recorded although only the stemmer
     # was called explicitly - presumably a tokenizer is applied implicitly;
     # confirm against the preprocess module.
     applied = stemmed.used_preprocessor.preprocessors
     self.assertEqual(len(applied), 2)
Exemplo n.º 5
0
        self.view.selectionModel().select(selection,
                                          QItemSelectionModel.ClearAndSelect)


if __name__ == "__main__":
    # Manual preview: feed a preprocessed corpus and a small table of words
    # into the OWScoreDocuments widget.
    from orangewidget.utils.widgetpreview import WidgetPreview

    from orangecontrib.text import preprocess

    corpus = Corpus.from_file("book-excerpts")
    # corpus.set_title_variable("Text")

    # Apply the preprocessing steps one after another.
    for step in (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.SnowballStemmer(),
    ):
        corpus = step(corpus)

    # Build a table with no regular columns and one string meta column that
    # is tagged with type "words".
    word_list = ["house", "doctor", "boy", "way", "Rum"]
    words_var = StringVariable("Words")
    words_var.attributes["type"] = "words"
    words = Table(
        Domain([], metas=[words_var]),
        np.empty((len(word_list), 0)),
        metas=np.array(word_list)[:, np.newaxis],
    )

    WidgetPreview(OWScoreDocuments).run(set_data=corpus, set_words=words)
Exemplo n.º 6
0
    def test_str(self):
        """The string form of a stemmer names its algorithm or language."""
        porter_text = str(preprocess.PorterStemmer()).lower()
        self.assertIn('porter', porter_text)

        french_text = str(preprocess.SnowballStemmer('french')).lower()
        self.assertIn('french', french_text)