Exemplo n.º 1
0
 def test_preprocess(self):
     """Check that in-place preprocessing with a POS tagger sets ``pos_tags``."""
     # Raw string: '\w' inside a plain literal is an invalid escape sequence
     # and triggers a SyntaxWarning on recent Python versions.
     pr = preprocess.Preprocessor(
         tokenizer=preprocess.RegexpTokenizer(r'\w+'),
         pos_tagger=tag.AveragedPerceptronTagger())
     corpus = Corpus.from_file('deerwester')
     # The preprocessor is applied in place, so the assertion inspects the
     # same corpus object that was passed in.
     pr(corpus, inplace=True)
     self.assertIsNotNone(corpus.pos_tags)
Exemplo n.º 2
0
    def __init__(self, parent=None):
        """Assemble the widget UI: info box, scrollable stage pipeline, buttons.

        Args:
            parent: forwarded unchanged to the base-class initializer.
        """
        super().__init__(parent)
        self.corpus = None
        # initial range of input corpus — used for inplace
        self.initial_ngram_range = None
        self.preprocessor = preprocess.Preprocessor()

        # -- INFO --
        info_panel = gui.widgetBox(self.controlArea, 'Info')
        info_panel.setFixedWidth(self.control_area_width)
        self.controlArea.layout().addStretch()
        self.info_label = gui.label(info_panel, self, '')
        self.update_info()

        # -- PIPELINE --
        # Borderless-margin frame that stacks one editor widget per stage.
        pipeline_frame = QFrame()
        pipeline_frame.setContentsMargins(0, 0, 0, 0)
        pipeline_frame.setFrameStyle(QFrame.Box)
        pipeline_frame.setStyleSheet('.QFrame { border: 1px solid #B3B3B3; }')
        pipeline_layout = QVBoxLayout()
        pipeline_layout.setContentsMargins(0, 0, 0, 0)
        pipeline_layout.setSpacing(0)
        pipeline_frame.setLayout(pipeline_layout)

        # Instantiate every class listed in ``self.preprocessors``; each
        # instance is kept in ``self.stages`` and also exposed on the widget
        # under the name given by the class's ``attribute``.
        self.stages = []
        for stage_cls in self.preprocessors:
            stage_widget = stage_cls(self)
            self.stages.append(stage_widget)
            setattr(self, stage_cls.attribute, stage_widget)
            pipeline_layout.addWidget(stage_widget)
            stage_widget.change_signal.connect(self.settings_invalidated)

        pipeline_layout.addStretch()

        # Scroll area hosting the pipeline frame (vertical bar always shown,
        # horizontal bar never shown).
        self.scroll = scroll_area = QScrollArea()
        scroll_area.setWidget(pipeline_frame)
        scroll_area.setWidgetResizable(True)
        scroll_area.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff)
        scroll_area.setVerticalScrollBarPolicy(Qt.ScrollBarAlwaysOn)
        scroll_area.resize(pipeline_layout.sizeHint())
        scroll_area.setMinimumHeight(500)
        self.set_minimal_width()
        # NOTE(review): the result of this call is discarded; it looks like a
        # no-op, but it is kept as-is — confirm before removing.
        self.mainArea.layout().sizeHint()
        self.mainArea.layout().addWidget(scroll_area)

        # Buttons area
        self.report_button.setFixedWidth(self.control_area_width)

        commit_btn = gui.auto_commit(
            self.buttonsArea, self, 'autocommit', 'Commit', box=False)
        commit_btn.setFixedWidth(self.control_area_width)

        self.buttonsArea.layout().addWidget(commit_btn)
Exemplo n.º 3
0
 def test_ngrams(self):
     """Check that 1- to 3-grams of the first document become BoW attributes."""
     vect = BowVectorizer()
     corpus = Corpus.from_file('deerwester')
     # Raw string: '\w' inside a plain literal is an invalid escape sequence
     # and triggers a SyntaxWarning on recent Python versions.
     pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                                  ngrams_range=(1, 3))
     pr(corpus, inplace=True)
     result = vect.transform(corpus)
     attrs = [attr.name for attr in result.domain.attributes]
     first_doc_tokens = corpus.tokens[0]
     # A single token, then the space-joined leading bigram and trigram,
     # must each be present among the attribute names.
     self.assertIn(first_doc_tokens[1], attrs)
     self.assertIn(' '.join(first_doc_tokens[:2]), attrs)
     self.assertIn(' '.join(first_doc_tokens[:3]), attrs)
Exemplo n.º 4
0
    def test_copy(self):
        """Check that ``copy`` yields a distinct object that compares equal.

        A copy preprocessed on its own must differ from the untouched source;
        a copy taken after the source was preprocessed must equal it.
        """
        corpus = Corpus.from_file('deerwester')

        # Raw string: '\w' and '\s' inside a plain literal are invalid escape
        # sequences; the pattern value itself is unchanged.
        p = preprocess.Preprocessor(
            tokenizer=preprocess.RegexpTokenizer(r'\w+\s}'))

        # Preprocess only the copy: same data source, different state.
        copied = corpus.copy()
        p(copied, inplace=True)
        self.assertIsNot(copied, corpus)
        self.assertNotEqual(copied, corpus)

        # Preprocess the source first, then copy: state must carry over.
        p(corpus, inplace=True)
        copied = corpus.copy()
        self.assertIsNot(copied, corpus)
        self.assertEqual(copied, corpus)
Exemplo n.º 5
0
def pre_process(path):
    """Load a corpus from *path* and run it through a preprocessing pipeline.

    The pipeline is configured with lowercase/URL/HTML transformers, a
    word-character regexp tokenizer, a Porter stemmer, and English-stopword
    plus punctuation-regexp filters.

    Args:
        path: passed as-is to ``orangecontrib.text.Corpus.from_file``.

    Returns:
        Whatever the configured ``Preprocessor`` returns when called on the
        loaded corpus.
    """
    corpus = orangecontrib.text.Corpus.from_file(path)

    # Regex literals are raw strings: sequences such as '\w', '\.' or '\(' in
    # a plain literal are invalid escapes and emit a SyntaxWarning on recent
    # Python versions. In the raw form '\'' and '\"' reach the regex engine
    # as escaped quotes, which match the same literal quote characters.
    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer()
        ],
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(
                r'\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|\’|…|\-|–|—|\$|&|\*|>|<'
            )
        ])
    return p(corpus)
Exemplo n.º 6
0
 def test_empty_corpus(self):
     """Check that fitting the model on the preprocessed corpus returns None."""
     # Presumably the pattern 'unmatchable' matches nothing in the fixture
     # corpus, leaving it without tokens — verify against the fixture.
     never_matching = preprocess.RegexpTokenizer(pattern='unmatchable')
     preprocessor = preprocess.Preprocessor(tokenizer=never_matching)
     empty_corpus = preprocessor(self.corpus)
     fit_result = self.model.fit(empty_corpus)
     self.assertIsNone(fit_result)