Exemplo n.º 1
0
    def _setup_corpus(self, text_features: List[Variable] = None) -> None:
        """
        Reset the text-mining state of the corpus and choose its text features.

        Parameters
        ----------
        text_features
            meta attributes that are used for text mining. Infer them if None.
        """
        # imported inside the method, not at module level
        from orangecontrib.text.preprocess import PreprocessorList

        # list of text features for mining; filled in by the calls below
        self.text_features = []
        self.ngram_range = (1, 1)
        self.attributes = {}

        # results of preprocessing; nothing has been computed at this point
        self._tokens = None
        self._dictionary = None
        self._pos_tags = None
        self._titles: Optional[np.ndarray] = None
        self._pp_documents = None  # preprocessed documents

        # required for compute values
        self.__used_preprocessor = PreprocessorList([])

        if text_features is not None:
            self.set_text_features(text_features)
        else:
            self._infer_text_features()

        self._set_unique_titles()
Exemplo n.º 2
0
    def used_preprocessor(self, pp):
        """
        Update the preprocessors recorded as used on this corpus.

        Parameters
        ----------
        pp
            A PreprocessorList replaces the recorded list with a new
            PreprocessorList holding the same preprocessors. A single
            Preprocessor is appended to the recorded list.

        Raises
        ------
        NotImplementedError
            If pp is neither a PreprocessorList nor a Preprocessor.
        """
        from orangecontrib.text.preprocess import PreprocessorList, Preprocessor

        if isinstance(pp, PreprocessorList):
            # list() makes a new list object, separate from pp.preprocessors
            copied = list(pp.preprocessors)
            self.__used_preprocessor = PreprocessorList(copied)
            return
        if isinstance(pp, Preprocessor):
            self.__used_preprocessor.preprocessors.append(pp)
            return
        raise NotImplementedError
Exemplo n.º 3
0
    def _base_tokens(self):
        """
        Preprocess this corpus with the base transformer and base tokenizer.

        Returns
        -------
        The tokens and the dictionary of the corpus obtained by applying
        PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER]) to this corpus.
        """
        from orangecontrib.text.preprocess import (
            BASE_TOKENIZER,
            BASE_TRANSFORMER,
            PreprocessorList,
        )

        # don't use anything that requires NLTK data to assure async download
        pipeline = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])
        preprocessed = pipeline(self)
        return preprocessed.tokens, preprocessed.dictionary
Exemplo n.º 4
0
 def set_corpus(self, data=None):
     """
     Store the input corpus, preprocess it once and commit.

     Parameters
     ----------
     data
         The input corpus, or None when there is no input. With None,
         both ``corpus`` and ``pp_corpus`` are set to None and the
         preprocessors are not run.
     """
     self.corpus = data
     # Without an input there is nothing to preprocess; the preprocessors
     # must not be called on None.
     self.pp_corpus = None
     if self.corpus is not None:
         # create preprocessed corpus upon setting data to avoid
         # preprocessing at each method run
         pp_list = [
             preprocess.LowercaseTransformer(),
             preprocess.WordPunctTokenizer()
         ]
         self.pp_corpus = PreprocessorList(pp_list)(self.corpus)
     self.commit()
Exemplo n.º 5
0
    def test_used_preprocessors(self):
        """
        Applying the preprocessors one at a time records them on the result
        and yields a corpus equal to the one produced by a PreprocessorList.
        """
        expected_lengths = [8, 10, 6, 8, 9, 7, 7, 10, 4]

        stepwise = self.corpus.copy()
        for preprocessor in self.pp_list:
            stepwise = preprocessor(stepwise)

        # the fixture corpus itself must have no recorded preprocessors
        self.assertEqual(len(self.corpus.used_preprocessor.preprocessors), 0)
        self.assertEqual(len(stepwise.used_preprocessor.preprocessors), 5)

        token_counts = [len(doc_tokens) for doc_tokens in stepwise._tokens]
        self.assertEqual(expected_lengths, token_counts)

        combined = PreprocessorList(self.pp_list)(self.corpus)
        self.assertEqual(stepwise, combined)
Exemplo n.º 6
0
    def __init__(self,
                 domain=None,
                 X=None,
                 Y=None,
                 metas=None,
                 W=None,
                 text_features=None,
                 ids=None):
        """
        Create a corpus from a domain and its data arrays.

        Any of X, Y, metas and W that is None is replaced by a new array of
        shape (n_doc, 0), where n_doc is the value returned by
        ``_check_arrays(X, Y, metas)``.

        Args:
            domain (Orange.data.Domain): the domain for this Corpus
            X (numpy.ndarray): attributes
            Y (numpy.ndarray): class variables
            metas (numpy.ndarray): meta attributes; e.g. text
            W (numpy.ndarray): instance weights
            text_features (list): meta attributes that are used for
                text mining. Infer them if None. Ignored when domain
                is None.
            ids (numpy.ndarray): Indices
        """
        super().__init__()
        n_doc = _check_arrays(X, Y, metas)

        def _or_empty(array):
            # every missing array gets its own zero-column placeholder
            return np.zeros((n_doc, 0)) if array is None else array

        with self.unlocked_reference():
            self.X = _or_empty(X)
            self.Y = _or_empty(Y)
            self.metas = _or_empty(metas)
            self.W = _or_empty(W)
        self.domain = domain

        # text-mining state; nothing has been computed at this point
        self.text_features = []  # list of text features for mining
        self._tokens = None
        self._dictionary = None
        self._ngrams_corpus = None
        self.ngram_range = (1, 1)
        self.attributes = {}
        self._pos_tags = None

        from orangecontrib.text.preprocess import PreprocessorList
        # required for compute values
        self.__used_preprocessor = PreprocessorList([])
        self._titles: Optional[np.ndarray] = None
        self._pp_documents = None  # preprocessed documents

        # text features are only set up when a domain is given
        if domain is not None:
            if text_features is None:
                self._infer_text_features()
            else:
                self.set_text_features(text_features)

        if ids is None:
            Table._init_ids(self)
        else:
            self.ids = ids
        self._set_unique_titles()
Exemplo n.º 7
0
 def set_corpus(self, data=None):
     """
     Store the input corpus together with its preprocessed version.

     ``pp_corpus`` is None when there is no input, the input corpus itself
     when ``has_tokens()`` is true for it, and otherwise the result of
     lowercasing and word-punctuation tokenizing the input.
     ``self.commit.now()`` is called at the end in every case.
     """
     self.corpus = data
     self.pp_corpus = None
     if self.corpus is not None:
         if self.corpus.has_tokens():
             # the corpus already has tokens; use it as it is
             self.pp_corpus = self.corpus
         else:
             # create preprocessed corpus upon setting data to avoid
             # preprocessing at each method run
             lowercase_and_tokenize = PreprocessorList([
                 preprocess.LowercaseTransformer(),
                 preprocess.WordPunctTokenizer(),
             ])
             self.pp_corpus = lowercase_and_tokenize(self.corpus)
     self.commit.now()
Exemplo n.º 8
0
def preprocess_only_words(corpus: Corpus) -> Corpus:
    """
    Lowercase the corpus and tokenize it with the default RegexpTokenizer.

    Parameters
    ----------
    corpus
        Corpus on which the preprocessor will be applied.

    Returns
    -------
    Preprocessed corpus. Result of pre-processing is saved in tokens/ngrams.
    """
    steps = [
        LowercaseTransformer(),
        # by default regexp keeps only words (no punctuations, no spaces)
        RegexpTokenizer(),
    ]
    words_only = PreprocessorList(steps)
    return words_only(corpus)
    def test_result(self):
        """
        Send a six-word bag-of-words corpus as data and its first row as
        selected data, then compare the widget's p-values and FDR values
        with the reference numbers.
        """
        words = ["beheld", "events", "dragged", "basin", "visit", "have"]
        expected_p = [0.02128, 1, 0.04255, 0.06383, 0.08511, 0.97872]
        expected_fdr = [0.12766, 1, 0.12766, 0.12766, 0.12766, 1]

        tokenize = PreprocessorList([BASE_TRANSFORMER, RegexpTokenizer()])
        # every third document of the "book-excerpts" corpus
        every_third = Corpus.from_file("book-excerpts")[::3]
        preprocessed = tokenize(every_third)
        bow = BowVectorizer().transform(preprocessed)

        # keep only the columns of the selected words
        word_domain = Domain([bow.domain[word] for word in words])
        bow = bow.transform(word_domain)

        self.send_signal(self.widget.Inputs.data, bow)
        self.send_signal(self.widget.Inputs.selected_data, bow[:1])
        self.wait_until_finished(timeout=100000)

        results = self.widget.results
        np.testing.assert_array_almost_equal(
            results.p_values, expected_p, decimal=5,
        )
        np.testing.assert_array_almost_equal(
            results.fdr_values, expected_fdr, decimal=5,
        )
Exemplo n.º 10
0
 def test_apply_preprocessors(self):
     """
     A PreprocessorList built from ``self.pp_list`` yields the expected
     number of tokens per document and records five used preprocessors.
     """
     preprocessed = PreprocessorList(self.pp_list)(self.corpus)
     token_counts = [len(doc_tokens) for doc_tokens in preprocessed._tokens]
     self.assertEqual([8, 10, 6, 8, 9, 7, 7, 10, 4], token_counts)

     recorded = preprocessed.used_preprocessor.preprocessors
     self.assertEqual(len(recorded), 5)
 