Exemplo n.º 1
0
    def setUp(self) -> None:
        """Create the widget under test, a preprocessed corpus and a words table."""
        self.widget: OWScoreDocuments = self.create_widget(OWScoreDocuments)

        # Load the corpus and pass it through every preprocessor in order:
        # lowercase -> strip accents -> Snowball stemming.
        self.corpus = Corpus.from_file("book-excerpts")
        for preprocessor in (
            preprocess.LowercaseTransformer(),
            preprocess.StripAccentsTransformer(),
            preprocess.SnowballStemmer(),
        ):
            self.corpus = preprocessor(self.corpus)

        # Build the words table from a fixed list of words.
        self.words = self.create_words_table(
            ["house", "doctor", "boy", "way", "Rum"]
        )
Exemplo n.º 2
0
    def test_preprocess_words(self):
        """Check `_preprocess_words` against a corpus with preprocessors applied.

        First pass: the corpus went through lowercasing, accent stripping,
        URL removal and HTML stripping; the expected list contains the words
        in that normalized form, and the entry that was only a URL is not
        expected in the output.
        Second pass: the Snowball stemmer is applied on top, and the
        expected words are stemmed.
        """
        corpus = Corpus.from_file("book-excerpts")
        for step in (
            preprocess.LowercaseTransformer(),
            preprocess.StripAccentsTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer(),
        ):
            corpus = step(corpus)

        raw_words = [
            "House",
            "dóctor",
            "boy",
            "way",
            "Rum https://google.com",
            "https://google.com",
            "<p>abra<b>cadabra</b><p>",
        ]
        cleaned = _preprocess_words(corpus, raw_words, dummy_callback)
        self.assertListEqual(
            ["house", "doctor", "boy", "way", "rum", "abracadabra"],
            cleaned,
        )

        # Add stemming on top of the preprocessors applied above.
        corpus = preprocess.SnowballStemmer()(corpus)

        raw_words = ["House", "dóctor", "boys", "way", "Rum"]
        stemmed = _preprocess_words(corpus, raw_words, dummy_callback)
        self.assertListEqual(
            ["hous", "doctor", "boy", "way", "rum"],
            stemmed,
        )
Exemplo n.º 3
0
 def test_strip_accents(self):
     """`StripAccentsTransformer._preprocess` keeps plain text and drops accents."""
     strip = preprocess.StripAccentsTransformer()._preprocess
     # (input, expected output) pairs, checked in order.
     cases = (
         ('Abra', 'Abra'),
         ('\u00C0bra', 'Abra'),
     )
     for raw, expected in cases:
         self.assertEqual(strip(raw), expected)
Exemplo n.º 4
0
        self.view.selectionModel().select(selection,
                                          QItemSelectionModel.ClearAndSelect)


if __name__ == "__main__":
    # Manual preview: run the widget with a preprocessed corpus and a small
    # words table.
    from orangewidget.utils.widgetpreview import WidgetPreview

    from orangecontrib.text import preprocess

    preview_corpus = Corpus.from_file("book-excerpts")
    # Left disabled, as before: preview_corpus.set_title_variable("Text")

    # Apply the preprocessors one after another.
    for step in (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.SnowballStemmer(),
    ):
        preview_corpus = step(preview_corpus)

    # Single string meta column tagged with type "words".
    words_var = StringVariable("Words")
    words_var.attributes["type"] = "words"
    word_list = ["house", "doctor", "boy", "way", "Rum"]
    words_domain = Domain([], metas=[words_var])
    # No regular attributes: X has one row per word and zero columns,
    # the words themselves go into a one-column metas array.
    empty_x = np.empty((len(word_list), 0))
    word_metas = np.array(word_list).reshape((-1, 1))
    words_table = Table(words_domain, empty_x, metas=word_metas)

    WidgetPreview(OWScoreDocuments).run(
        set_data=preview_corpus, set_words=words_table
    )