Example #1
    def test_tokenizer_choice(self):
        test_sentence = 'The quick #brown fox jumps over the lazy dog :-)'
        default_tokens = [
            'the', 'quick', '#', 'brown', 'fox', 'jumps', 'over', 'the',
            'lazy', 'dog', ':', '-', ')'
        ]
        no_punct_tokens = [
            'the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy',
            'dog'
        ]
        tweet_tokens = [
            'the', 'quick', '#brown', 'fox', 'jumps', 'over', 'the', 'lazy',
            'dog', ':-)'
        ]

        p = Preprocessor(tokenizer='default')
        self.assertEqual(type(p.tokenizer), type(word_tokenize))
        tokens = p(test_sentence)
        self.assertEqual(tokens[0], default_tokens)

        p = Preprocessor(tokenizer='no_punct')
        self.assertEqual(type(p.tokenizer),
                         type(RegexpTokenizer(r'\w+').tokenize))
        tokens = p(test_sentence)
        self.assertEqual(tokens[0], no_punct_tokens)

        p = Preprocessor(tokenizer='twitter')
        self.assertEqual(type(p.tokenizer), type(TweetTokenizer().tokenize))
        tokens = p(test_sentence)
        self.assertEqual(tokens[0], tweet_tokens)

        with self.assertRaises(ValueError):
            Preprocessor(tokenizer='unsupported_value')
Example #2
    def test_string_processor(self):
        p = Preprocessor(transformers=preprocess.LowercaseTransformer())
        tokens = p(self.corpus).tokens
        p2 = Preprocessor(transformers=[])
        tokens2 = p2(self.corpus).tokens

        np.testing.assert_equal(tokens,
                                [[t.lower() for t in doc] for doc in tokens2])

        self.assertRaises(TypeError, Preprocessor, string_transformers=1)
Example #3
    def test_token_normalizer(self):
        class CapTokenNormalizer(preprocess.BaseNormalizer):
            @classmethod
            def normalize(cls, token):
                return token.capitalize()
        p = Preprocessor(normalizer=CapTokenNormalizer())
        tokens = p(self.corpus).tokens
        p2 = Preprocessor(normalizer=None)
        tokens2 = p2(self.corpus).tokens

        np.testing.assert_equal(
            tokens, [[t.capitalize() for t in doc] for doc in tokens2])
Example #4
    def test_preprocess_sentence_stopwords(self):
        # No stop words.
        p = Preprocessor()
        result = p(self.TEST_STRING)
        correct = [
            [
                'human',
                'machine',
                'interface',
                'for',
                'lab',
                'abc',
                'computer',
                'applications'
            ]
        ]
        self.assertEqual(result, correct)

        # English stop words.
        p = Preprocessor(stop_words='english')
        result = p(self.TEST_STRING)
        correct = [
            [
                'human',
                'machine',
                'interface',
                'lab',
                'abc',
                'computer',
                'applications'
            ]
        ]
        self.assertEqual(result, correct)

        # Custom stop words.
        custom_stop_words = [
            'abc',
            'applications',
            'computer',
            'for',
        ]
        p = Preprocessor(stop_words=custom_stop_words)
        result = p(self.TEST_STRING)
        correct = [
            [
                'human',
                'machine',
                'interface',
                'lab'
            ]
        ]
        self.assertEqual(result, correct)
Example #5
    def test_inplace(self):
        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w'))
        corpus = p(self.corpus, inplace=True)
        self.assertIs(corpus, self.corpus)

        corpus = p(self.corpus, inplace=False)
        self.assertIsNot(corpus, self.corpus)
        self.assertEqual(corpus, self.corpus)

        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'))
        corpus = p(self.corpus, inplace=False)
        self.assertIsNot(corpus, self.corpus)
        self.assertNotEqual(corpus, self.corpus)
Example #6
    def test_tokenize(self):
        correct = [
            ['human', 'machine', 'interface', 'for', 'lab', 'abc',
             'computer', 'applications'],
            ['a', 'survey', 'of', 'user', 'opinion', 'of', 'computer',
             'system', 'response', 'time'],
            ['the', 'eps', 'user', 'interface', 'management', 'system'],
            ['system', 'and', 'human', 'system', 'engineering', 'testing',
             'of', 'eps'],
            ['relation', 'of', 'user', 'perceived', 'response', 'time',
             'to', 'error', 'measurement'],
            ['the', 'generation', 'of', 'random', 'binary', 'unordered',
             'trees'],
            ['the', 'intersection', 'graph', 'of', 'paths', 'in', 'trees'],
            ['graph', 'minors', 'iv', 'widths', 'of', 'trees', 'and',
             'well', 'quasi', 'ordering'],
            ['graph', 'minors', 'a', 'survey']
        ]

        # String.
        p = Preprocessor(lowercase=False)
        self.assertEqual(
            p(self.TEST_STRING),
            [
                [
                    'Human',
                    'machine',
                    'interface',
                    'for',
                    'lab',
                    'abc',
                    'computer',
                    'applications',
                ]
            ]
        )

        # List.
        p = Preprocessor()
        self.assertEqual(
            p(self.TEST_LIST),
            correct
        )

        # Corpus.
        p = Preprocessor()
        self.assertEqual(
            p(self.TEST_CORPUS).tokens,
            correct
        )
Example #7
 def apply(self):
     # TODO change this to custom stopwords
     sw = None
     if self.remove_stpwrds:
         sw = 'english'
     pp = Preprocessor(incl_punct=self.include_punctuation,
                       trans=self.transformation,
                       lowercase=self.lowercase,
                       stop_words=sw)
     self.send(Output.PREPROCESSOR, pp)
Example #8
 def _compute_indices(self):  # type: () -> Optional[list]
     if self.corpus is None:
         self.indices = None
         return
     if self.corpus and not self.corpus.has_tokens():
         preprocessor = Preprocessor(tokenizer=WordPunctTokenizer())
         preprocessor(self.corpus)
     self.indices = [ConcordanceIndex(doc, key=lambda x: x.lower())
                     for doc in self.corpus.tokens]
Example #9
    def test_tokenizer(self):
        class SpaceTokenizer(preprocess.BaseTokenizer):
            @classmethod
            def tokenize(cls, string):
                return string.split()
        p = Preprocessor(tokenizer=SpaceTokenizer())

        np.testing.assert_equal(
            p(self.corpus).tokens,
            np.array([sent.split() for sent in self.corpus.documents]))
Example #10
 def tokens(self):
     """
     Return a list of lists containing tokens. If tokens are not yet
     present, run default preprocessor and save tokens.
     """
     if self._tokens is None:
         from orangecontrib.text.preprocess import Preprocessor
         p = Preprocessor()
         self._tokens = p(self.documents)
     return self._tokens
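
The docstring above describes a lazy-caching property: tokens are computed on first access and reused afterwards. Below is a minimal, self-contained sketch of that pattern; SimpleTokenizer and TinyCorpus are illustrative stand-ins, not the real orangecontrib.text classes.

class SimpleTokenizer:
    """Illustrative stand-in for the default Preprocessor."""
    def __call__(self, documents):
        return [doc.lower().split() for doc in documents]

class TinyCorpus:
    def __init__(self, documents):
        self.documents = documents
        self._tokens = None

    @property
    def tokens(self):
        # Tokenize on first access only, then serve the cached result.
        if self._tokens is None:
            self._tokens = SimpleTokenizer()(self.documents)
        return self._tokens

corpus = TinyCorpus(["Human machine interface", "A survey of user opinion"])
print(corpus.tokens)  # first access runs the tokenizer
print(corpus.tokens)  # later accesses reuse the cached tokens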
Example #11
    def test_token_normalizer(self):
        class CapTokenNormalizer(preprocess.BaseNormalizer):
            @classmethod
            def normalize(cls, token):
                return token.capitalize()

        p = Preprocessor(normalizer=CapTokenNormalizer())

        np.testing.assert_equal(
            p(self.corpus).tokens,
            np.array([[sent.capitalize()] for sent in self.corpus.documents]))
Example #12
    def test_max_df(self):
        ff = preprocess.FrequencyFilter(max_df=.3)
        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                         filters=[ff])
        size = len(self.corpus.documents)

        corpus = p(self.corpus)
        self.assertFrequencyRange(corpus, 1, size * .3)

        ff.max_df = 2
        corpus = p(self.corpus)
        self.assertFrequencyRange(corpus, 1, 2)
Example #13
 def __init__(self):
     super().__init__()
     self.n_words = 0
     self.mean_weight = 0
     self.std_weight = 0
     self.selected_words = SelectedWords(self)
     self.webview = None
     self.bow = None  # bag-of-words, obviously
     self.topics = None
     self.corpus = None
     self.PREPROCESS = Preprocessor()
     self._create_layout()
Example #14
 def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
     # Compiled regexes are NOT deepcopy-able, and Corpus must stay deepcopy-able
     # (it also stores the used_preprocessor for BoW compute values), so we cannot
     # store them on the instance. To bypass the problem, the regex is compiled
     # before every __call__ and discarded right after.
     self.tokenizer = self.tokenizer_cls(self.__pattern)
     corpus = Preprocessor.__call__(self, corpus)
     if callback is None:
         callback = dummy_callback
     callback(0, "Tokenizing...")
     corpus = self._store_tokens_from_documents(corpus, callback)
     self.tokenizer = None
     return corpus
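
The comment above motivates a "compile late, discard early" pattern: only the pattern string lives on the object, so it stays deepcopy-able, and the compiled regex exists just for the duration of a call. A minimal sketch under that assumption follows; PatternTokenizer is an illustrative name, not the real orangecontrib.text class.

import copy
import re

class PatternTokenizer:
    def __init__(self, pattern=r"\w+"):
        self.__pattern = pattern   # plain string: safe to deepcopy/pickle
        self.tokenizer = None      # compiled regex lives only inside __call__

    def __call__(self, documents):
        self.tokenizer = re.compile(self.__pattern)
        try:
            return [self.tokenizer.findall(doc) for doc in documents]
        finally:
            self.tokenizer = None  # drop the compiled regex again

tok = PatternTokenizer()
print(tok(["Graph minors: a survey."]))
copy.deepcopy(tok)  # works, because only the pattern string is stored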
Example #15
    def test_min_df(self):
        ff = preprocess.FrequencyFilter(min_df=.5)
        p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                         filters=[ff])
        processed = p(self.corpus)
        size = len(processed.documents)
        self.assertFrequencyRange(processed, size * .5, size)

        ff.min_df = 2
        processed = p(self.corpus)
        size = len(processed.documents)
        self.assertFrequencyRange(processed, 2, size)
Example #16
    def test_string_processor(self):
        class StripStringTransformer(preprocess.BaseTransformer):
            @classmethod
            def transform(cls, string):
                return string[:-1]

        p = Preprocessor(transformers=StripStringTransformer())

        np.testing.assert_equal(
            p(self.corpus).tokens,
            np.array([[doc[:-1]] for doc in self.corpus.documents]))

        p = Preprocessor(transformers=[
            StripStringTransformer(),
            preprocess.LowercaseTransformer()
        ])

        np.testing.assert_equal(
            p(self.corpus).tokens,
            np.array([[doc[:-1].lower()] for doc in self.corpus.documents]))

        self.assertRaises(TypeError, Preprocessor, string_transformers=1)
Example #17
    def test_porter_sentence(self):
        corpus = [
            'Caresses flies dies mules denied died agreed owned humbled sized.'
        ]
        stemmed = [
            'caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre', 'own',
            'humbl', 'size'
        ]

        p = Preprocessor(lowercase=True, stop_words=None, trans=Stemmer)
        corpus = p(corpus)
        print(corpus)
        self.assertEqual(sorted(corpus), sorted(stemmed))
Example #18
    def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
        try:
            self.__model = udpipe.Model.load(self.models[self.__language])
        except StopIteration:
            raise UDPipeStopIteration

        if self.__use_tokenizer:
            corpus = Preprocessor.__call__(self, corpus)
            if callback is None:
                callback = dummy_callback
            callback(0, "Normalizing...")
            return self._store_tokens_from_documents(corpus, callback)
        else:
            return super().__call__(corpus, callback)
Example #19
 def test_preprocess_corpus_float_df(self):
     p = Preprocessor(min_df=0.2, max_df=0.5, lowercase=True)
     result = p(self.TEST_CORPUS)
     correct = [
         ['human', 'interface', 'computer'],
         ['a', 'survey', 'user', 'computer', 'system', 'response', 'time'],
         ['the', 'eps', 'user', 'interface', 'system'],
         ['system', 'and', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['the', 'trees'],
         ['the', 'graph', 'trees'],
         ['graph', 'minors', 'trees', 'and'],
         ['graph', 'minors', 'a', 'survey']
     ]
     self.assertEqual(result.tokens, correct)
Example #20
    def test_token_filter(self):
        class SpaceTokenizer(preprocess.BaseTokenizer):
            @classmethod
            def tokenize(cls, string):
                return string.split()

        class LengthFilter(preprocess.BaseTokenFilter):
            @classmethod
            def check(cls, token):
                return len(token) < 4

        p = Preprocessor(tokenizer=SpaceTokenizer(), filters=LengthFilter())
        np.testing.assert_equal(
            p(self.corpus).tokens,
            np.array([[token for token in doc.split() if len(token) < 4]
                      for doc in self.corpus.documents]))
Example #21
    def assemble_preprocessor(self):
        self.error(0, '')

        pp_settings = {
            # If not set explicitly, lowercase defaults to True in
            # Preprocessor, which is not what we want.
            'lowercase': False,
        }
        for pp in self.preprocessors:
            if pp.enabled:
                pp_settings.update(pp.get_pp_setting())
        pp_settings['callback'] = self.document_finished
        try:
            preprocessor = Preprocessor(**pp_settings)
        except Exception as e:
            self.error(0, str(e))
            return None
        return preprocessor
Example #22
 def test_preprocess_corpus_min_df(self):
     p = Preprocessor(
         lowercase=True,
         stop_words=['for', 'a', 'of', 'the', 'and', 'to', 'in'],
         min_df=2)
     corpus = p(self.corpus)
     correct = [
         ['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['trees'],
         ['graph', 'trees'],
         ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']
     ]
     self.assertEqual(len(corpus), len(correct))
     for i, j in zip(corpus, correct):
         self.assertEqual(sorted(i), sorted(j))
Example #23
    def __init__(self):
        super().__init__()

        self.lda = None
        self.corpus = None
        self.preprocessor = Preprocessor()

        # Info.
        info_box = gui.widgetBox(self.controlArea, "Info")
        self.info_label = gui.label(info_box, self, '')

        # Settings.
        topic_box = gui.widgetBox(self.controlArea, "Settings")
        hbox = gui.widgetBox(topic_box, orientation=0)
        self.topics_label = gui.label(hbox, self, 'Number of topics: ')
        self.topics_label.setMaximumSize(self.topics_label.sizeHint())
        self.topics_input = gui.spin(hbox,
                                     self,
                                     "num_topics",
                                     minv=1,
                                     maxv=2**31 - 1,
                                     callback=self.num_topics_changed)

        # Commit button
        self.commit = gui.button(self.controlArea,
                                 self,
                                 "&Apply",
                                 callback=self.apply,
                                 default=True)
        self.commit.setEnabled(False)
        gui.rubber(self.controlArea)

        # Topics description
        self.cols = ['Topic', 'Topic keywords']
        self.topic_desc = QtGui.QTreeWidget()
        self.topic_desc.setColumnCount(len(self.cols))
        self.topic_desc.setHeaderLabels(self.cols)
        #self.topic_desc.setSelectionMode(QtGui.QTreeView.ExtendedSelection)
        self.topic_desc.itemSelectionChanged.connect(
            self.selected_topic_changed)
        for i in range(len(self.cols)):
            self.topic_desc.resizeColumnToContents(i)
        self.mainArea.layout().addWidget(self.topic_desc)

        self.refresh_gui()
Example #24
    def test_wordnet_lemmatizer_sentence(self):
        corpus = [
            'Pursued brightness insightful blessed lies held timelessly minds.'
        ]
        lemmas = [
            'pursued',
            'brightness',
            'insightful',
            'blessed',
            'lie',
            'held',
            'timelessly',
            'mind',
            '.'
        ]

        p = Preprocessor(transformation=Lemmatizer,)
        result = p(corpus)[0]
        self.assertEqual(result, lemmas)
Example #25
def preprocess_only_words(corpus: Corpus) -> Corpus:
    """
    Apply the preprocessor that splits words, transforms them to lower case
    (and removes punctuation).

    Parameters
    ----------
    corpus
        Corpus on which the preprocessor will be applied.

    Returns
    -------
    Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams.
    """
    p = Preprocessor(
        transformers=[LowercaseTransformer()],
        # by default regexp keeps only words (no punctuations, no spaces)
        tokenizer=RegexpTokenizer(),
    )
    return p(corpus, inplace=False)
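
A possible usage sketch follows; it assumes the bundled "deerwester" sample corpus that ships with Orange3-Text, so treat the file name as an example rather than a guarantee.

from orangecontrib.text import Corpus

corpus = Corpus.from_file("deerwester")  # assumed sample data set
cleaned = preprocess_only_words(corpus)
print(cleaned.tokens[0])  # lower-cased word tokens of the first document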
Example #26
    def test_snowball_sentence(self):
        corpus = [
            'Caresses flies dies mules denied died agreed owned humbled sized.'
        ]
        stemmed = [
            'caress',
            'fli',
            'die',
            'mule',
            'deni',
            'die',
            'agre',
            'own',
            'humbl',
            'size',
            '.'
        ]

        p = Preprocessor(transformation=SnowballStemmer)
        result = p(corpus)[0]
        self.assertEqual(result, stemmed)
Example #27
    def test_preprocess_one_sentence_lowercase_stopwords(self):
        p = Preprocessor(lowercase=True, stop_words=None)
        corpus = p(self.sentence)
        self.assertEqual(
            sorted(corpus),
            sorted([
                'abc', 'applications', 'computer', 'for', 'human', 'interface',
                'lab', 'machine'
            ]))

        p = Preprocessor(lowercase=True, stop_words='english')
        corpus = p(self.sentence)
        self.assertEqual(
            sorted(corpus),
            sorted([
                'abc', 'applications', 'computer', 'human', 'interface', 'lab',
                'machine'
            ]))

        p = Preprocessor(lowercase=True,
                         stop_words=['abc', 'applications', 'computer', 'for'])
        corpus = p(self.sentence)
        self.assertEqual(sorted(corpus),
                         sorted(['human', 'interface', 'lab', 'machine']))

        p = Preprocessor(lowercase=False, stop_words=None)
        corpus = p(self.sentence)
        self.assertEqual(
            sorted(corpus),
            sorted([
                'Human', 'abc', 'applications', 'computer', 'for', 'interface',
                'lab', 'machine'
            ]))

        p = Preprocessor(lowercase=False, stop_words='english')
        corpus = p(self.sentence)
        self.assertEqual(
            sorted(corpus),
            sorted([
                'abc', 'applications', 'computer', 'Human', 'interface', 'lab',
                'machine'
            ]))

        p = Preprocessor(lowercase=False,
                         stop_words=['abc', 'applications', 'computer', 'for'])
        corpus = p(self.sentence)
        self.assertEqual(sorted(corpus),
                         sorted(['Human', 'interface', 'lab', 'machine']))
Example #28
 def test_faulty_init_parameters(self):
     # Stop word source.
     with self.assertRaises(ValueError):
         Preprocessor(stop_words='faulty_value')
     # Transformation.
     with self.assertRaises(ValueError):
         Preprocessor(transformation='faulty_value')
     # Min/Max df.
     with self.assertRaises(ValueError):
         Preprocessor(min_df='faulty_value')
     with self.assertRaises(ValueError):
         Preprocessor(max_df='faulty_value')
     with self.assertRaises(ValueError):
         Preprocessor(min_df=1.5)
     with self.assertRaises(ValueError):
         Preprocessor(max_df=1.5)
Example #29
    def __call__(self, corpus, use_tfidf=False):
        if corpus is None:
            raise ValueError(
                'Cannot compute Bag of Words without an input corpus.')

        has_tokens = hasattr(corpus, 'tokens') and corpus.tokens is not None
        if not has_tokens:  # Perform default pre-processing.
            preprocessor = Preprocessor()
            corpus = preprocessor(corpus)

        self.check_progress()  # Step 1

        dictionary = corpora.Dictionary(corpus.tokens, prune_at=np.inf)
        self.vocabulary = dictionary

        # Term frequencies.
        bag_of_words = [dictionary.doc2bow(i) for i in corpus.tokens]

        self.check_progress()  # Step 2

        if use_tfidf:
            tfidf_model = TfidfModel(bag_of_words)
            bag_of_words = tfidf_model[bag_of_words]

        self.check_progress()  # Step 3

        X = matutils.corpus2dense(bag_of_words,
                                  num_terms=len(dictionary.keys()),
                                  dtype=np.float64).T

        bow_corpus = corpus.copy()
        feats = [v for k, v in sorted(dictionary.items())]
        bow_corpus.extend_attributes(X, feats, var_attrs={'bow_feature': True})
        self.check_progress()  # Step 4

        return bow_corpus
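
For reference, here is a stand-alone sketch of the same gensim steps on made-up toy tokens: build a dictionary, count term frequencies, optionally re-weight with TF-IDF, and densify into a documents-by-terms matrix.

import numpy as np
from gensim import corpora, matutils
from gensim.models import TfidfModel

token_lists = [["graph", "minors", "survey"], ["graph", "trees"]]
dictionary = corpora.Dictionary(token_lists)                  # term -> id mapping
bow = [dictionary.doc2bow(tokens) for tokens in token_lists]  # term frequencies
weighted = TfidfModel(bow)[bow]                               # optional TF-IDF step
X = matutils.corpus2dense(weighted, num_terms=len(dictionary),
                          dtype=np.float64).T                 # documents x terms
print(X.shape)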
Example #30
 def set_preprocessor(self, data):
     if data is None:
         self.preprocessor = Preprocessor()
     else:
         self.preprocessor = data
     self.apply()