def test_nltk_base_vectorizer_batched(self):
    cva = CountVectorizerAnalyzer(texts, strategy=TokenizerStrategy.NLTK_BASE)
    bm = cva.extract_batch_metrics()
    # 2 question marks in third sentence
    self.assertEqual(
        2,
        bm[2, cva.count_vectorizer.get_feature_names_out().tolist().index('?')])
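# A minimal reading sketch for the per-document metrics checked above. It assumes,
# as the tuple indexing in the assertion implies, that extract_batch_metrics()
# returns a document-by-token count matrix aligned with the vectorizer's feature
# names; `texts` is the same sample corpus the tests use.
cva = CountVectorizerAnalyzer(texts, strategy=TokenizerStrategy.NLTK_BASE)
counts = cva.extract_batch_metrics()
features = cva.count_vectorizer.get_feature_names_out().tolist()
question_marks_per_doc = counts[:, features.index('?')]  # one '?' count per document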
def build_word_vocab(
        texts: List[str] = None,
        max_n=None,
        count_vectorizer_analyzer: CountVectorizerAnalyzer = None) -> List[str]:
    count_vectorizer_analyzer = count_vectorizer_analyzer or CountVectorizerAnalyzer(
        texts)
    top_words = count_vectorizer_analyzer.extract_dataset_metric().index
    if max_n is None:
        return top_words.to_list()
    return top_words[:max_n].to_list()
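# A minimal usage sketch for build_word_vocab. The sample corpus below is an
# illustrative assumption; the returned list is presumably ordered most frequent
# first, since top_words is sliced from the front when max_n is given.
sample_texts = ['How do you handle this?', 'How many words are missing?']
full_vocab = build_word_vocab(sample_texts)         # every distinct token
top_ten = build_word_vocab(sample_texts, max_n=10)  # only the 10 most frequent tokens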
def test_wordpunkt_vectorizer(self):
    df_res = CountVectorizerAnalyzer(
        texts, strategy=TokenizerStrategy.WORD_PUNKT).extract_dataset_metric()
    self.assertEqual(
        {
            '?': 4, ',': 3, 'you': 3, 'in': 2, 'words': 2, 'how': 2,
            'handle': 2, 'do': 2, 'sentence': 1, 'no': 1, 'of': 1,
            'otheruser12': 1, 'right': 1, 's': 1, 'spaces': 1, 'mighta': 1,
            'the': 1, 'this': 1, 'twitter': 1, 'u': 1, 'uppercase': 1,
            'username213': 1, 'vocabulary': 1, 'xd': 1, 'missing': 1,
            '.?': 1, 'many': 1, '3': 1, ':': 1, '<': 1, '.': 1, '?@': 1,
            '@': 1, 'also': 1, 'and': 1, 'are': 1, 'bedifferent': 1,
            'can': 1, 'compound': 1, 'distinguish': 1, 'ehm': 1, 'hu': 1,
            '-': 1, 'lowercase': 1
        }, df_res.to_dict()['count'])
def test_twitter_vectorizer(self):
    df_res = CountVectorizerAnalyzer(
        texts, strategy=TokenizerStrategy.NLTK_TWEET).extract_dataset_metric()
    # twitter handles are preserved, emojis are preserved (<3)
    self.assertEqual(
        {
            '?': 6, ',': 3, 'you': 3, 'in': 2, '.': 2, 'how': 2,
            'handle': 2, 'do': 2, 'missing': 1, 'of': 1, 'right': 1,
            's': 1, 'sentence': 1, 'spaces': 1, 'the': 1, 'this': 1,
            'twitter': 1, 'u': 1, 'uppercase': 1, 'vocabulary': 1,
            'words': 1, 'xd': 1, 'no': 1, '<3': 1, 'mighta': 1, ':': 1,
            'lowercase': 1, 'hu': 1, 'ehm': 1, 'distinguish': 1,
            'compound-words': 1, 'can': 1, 'bedifferent': 1, 'are': 1,
            'and': 1, 'also': 1, '@username213': 1, '@otheruser12': 1,
            'many': 1
        }, df_res.to_dict()['count'])
def test_nltk_base_vectorizer_de(self):
    df_res = CountVectorizerAnalyzer(texts,
                                     strategy=TokenizerStrategy.NLTK_BASE,
                                     language='german') \
        .extract_dataset_metric()
    # note u.s. is now split into 'u.s' and '.' (compare to english nltk base)
    self.assertEqual(
        {
            '?': 6, ',': 3, 'you': 3, 'do': 2, '@': 2, 'in': 2, 'how': 2,
            'handle': 2, ':': 1, 'otheruser12': 1, 'right': 1,
            'sentence': 1, 'spaces': 1, 'the': 1, 'this': 1, 'no': 1,
            'twitter': 1, 'u.s': 1, 'uppercase': 1, 'username213': 1,
            'vocabulary': 1, 'words': 1, 'xd': 1, 'of': 1, 'many': 1,
            'missing': 1, 'mighta': 1, '<': 1, '.': 1, 'hu': 1, 'ehm': 1,
            'distinguish': 1, 'compound-words': 1, 'can': 1,
            'bedifferent': 1, 'are': 1, 'and': 1, 'also': 1, '3': 1,
            'lowercase': 1
        }, df_res.to_dict()['count'])
def test_spacy_vectorizer_de(self):
    df_res = CountVectorizerAnalyzer(texts,
                                     strategy=TokenizerStrategy.SPACY,
                                     language='german') \
        .extract_dataset_metric()
    # interestingly 'compound-words' is treated differently in comparison to
    # the english spacy model
    self.assertEqual(
        {
            '?': 5, 'you': 3, ',': 3, 'do': 2, 'how': 2, 'handle': 2,
            'in': 2, 'also': 1, 'missing': 1, 'xd': 1, 'words': 1,
            'vocabulary': 1, 'uppercase': 1, 'u.s': 1, 'twitter': 1,
            'this': 1, 'the': 1, 'spaces': 1, 'sentence': 1,
            'right?@username213': 1, 'of': 1, 'no': 1, 'mighta': 1,
            'and': 1, 'many': 1, 'lowercase': 1, '.': 1, 'hu': 1, ':': 1,
            '<3': 1, 'ehm': 1, '@otheruser12': 1, 'distinguish': 1,
            'compound-words': 1, 'can': 1, 'bedifferent': 1, 'are': 1,
            ' ': 1
        }, df_res.to_dict()['count'])
def test_regex_vectorizer_with_casing(self):
    df_res = CountVectorizerAnalyzer(texts,
                                     strategy=TokenizerStrategy.REGEX,
                                     lowercase=False) \
        .extract_dataset_metric()
    self.assertEqual(
        {
            'in': 2, 'do': 2, 'words': 2, 'handle': 2, 'you': 2, 'How': 2,
            'Twitter': 1, 'miGhta': 1, 'xD': 1, 'vocabulary': 1,
            'username213': 1, 'uppercase': 1, 'this': 1, 'the': 1,
            'spaces': 1, 'sentence': 1, 'right': 1, 'otheruser12': 1,
            'of': 1, 'missing': 1, 'many': 1, 'U': 1, 'lowercase': 1,
            'Ehm': 1, 'hu': 1, 'No': 1, 'S': 1, 'distinguish': 1,
            'compound': 1, 'can': 1, 'bedifferent': 1, 'are': 1, 'and': 1,
            'also': 1, 'You': 1, '3': 1
        }, df_res.to_dict()['count'])
def test_python_vectorizer(self):
    df_res = CountVectorizerAnalyzer(
        texts, strategy=TokenizerStrategy.PYTHON).extract_dataset_metric()
    self.assertEqual(
        {
            'you': 3, 'in': 2, 'how': 2, 'handle': 2, 'the': 1,
            'right?@username213:': 1, 'sentence?': 1, 'spaces?no?': 1,
            'this': 1, 'missing': 1, 'twitter': 1, 'u.s.?': 1,
            'uppercase': 1, 'vocabulary': 1, 'words': 1, 'xd': 1,
            'of': 1, '<3': 1, 'mighta': 1, '@otheruser12': 1,
            'lowercase,': 1, 'ehm,do': 1, 'do': 1, 'distinguish': 1,
            'compound-words': 1, 'can': 1, 'bedifferent,hu?': 1, 'are': 1,
            'and': 1, 'also': 1, 'many': 1
        }, df_res.to_dict()['count'])
class VocabularyWordSimplifier(BaseSimplifier):

    def __init__(self,
                 dataset: List[str] = [],
                 min_term_frequency: float = 1,
                 max_document_frequency: float = 0,
                 lowercase: bool = False,
                 tokenizer_strategy: TokenizerStrategy = TokenizerStrategy.NLTK_BASE,
                 tokenizer_language: str = 'english'):
        """
        :param dataset: corpus used to calculate the vocabulary
        :param min_term_frequency: minimum number of occurrences of a word across
            the whole corpus; a value in the open interval ]0,1[ is interpreted as
            a fraction of the total word count, otherwise as an absolute count
        :param max_document_frequency: maximum number of documents a word is
            allowed to appear in on average (multiple occurrences within a single
            document also count); a value in the open interval ]0,1[ is
            interpreted as a fraction of the total document count
        :param lowercase: convert all texts to lowercase
        :param tokenizer_strategy: strategy used to obtain individual words
        :param tokenizer_language: language of the texts, used during tokenization
        """
        self._dataset = dataset
        self._min_tf = min_term_frequency
        self._max_df = max_document_frequency
        # effective thresholds and vocabulary, filled in by _init_statistics()
        self.min_tf = 1
        self.max_df = None
        self.vocabulary = set()
        self._lowercase = lowercase
        self._tokenizer_language = tokenizer_language
        self._tokenizer_strategy = tokenizer_strategy
        self._cva = CountVectorizerAnalyzer(dataset,
                                            lowercase=lowercase,
                                            language=tokenizer_language,
                                            strategy=tokenizer_strategy)
        super().__init__()

    def _init_statistics(self):
        self.min_tf = 1.0
        df_vocab = self._cva.extract_dataset_metric()
        # turn the configured thresholds into absolute counts
        if self._min_tf < 1:
            self.min_tf = float(self._min_tf * df_vocab['count'].sum())
        elif self._min_tf > 1:
            self.min_tf = self._min_tf
        if not self._max_df:
            self.max_df = None
        elif self._max_df < 1:
            self.max_df = float(self._max_df * len(self._dataset))
        else:
            self.max_df = self._max_df
        # keep only words whose corpus count lies within the configured bounds
        df_vocab['count'] = df_vocab['count'].astype(float)
        selection_clause = df_vocab['count'] >= self.min_tf
        if self.max_df:
            selection_clause &= df_vocab['count'] <= self.max_df
        self.vocabulary = set(df_vocab[selection_clause].index)

    def can_init_statistics(self) -> bool:
        return bool(self._dataset)

    def simplify_text(self, text: str) -> str:
        # drop every token that is not part of the learned vocabulary
        if self._lowercase:
            text = text.lower()
        tokens = self._cva.tokenizer.tokenize_text(text)
        text = ' '.join(
            [token for token in tokens if token in self.vocabulary])
        return text

    def load_parameters(self, dataset: List[str], **kwargs):
        self._dataset = dataset
        self._cva = CountVectorizerAnalyzer(dataset,
                                            lowercase=self._lowercase,
                                            language=self._tokenizer_language,
                                            strategy=self._tokenizer_strategy)
        super().load_parameters(**kwargs)
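# A minimal end-to-end sketch for VocabularyWordSimplifier. The corpus and the
# threshold below are illustrative assumptions, and since the BaseSimplifier
# workflow is not shown here, the sketch calls _init_statistics() directly after
# checking can_init_statistics().
corpus = ['the cat sat on the mat', 'the dog sat on the mat', 'a bird flew by']
simplifier = VocabularyWordSimplifier(dataset=corpus,
                                      min_term_frequency=2,  # keep tokens seen at least twice
                                      lowercase=True)
if simplifier.can_init_statistics():
    simplifier._init_statistics()
print(simplifier.simplify_text('The cat and the bird sat quietly'))
# only tokens from the learned vocabulary remain, here roughly: 'the the sat'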