def process_text(self, text):
    stopwords = set([i.lower() for i in self.stopwords])

    regexp = re.compile(r'(' + '|'.join(TWEET_REGEXES) + ')',
                        re.VERBOSE | re.IGNORECASE)

    words = re.findall(regexp, text)
    # remove stopwords
    words = [word for word in words if word.lower() not in stopwords]
    # remove leading and useless punctuation tokens
    words = [word for word in words
             if word not in FRONT_PUNCTUATION + USELESS_PUNCTUATION]
    # remove 's
    words = [word[:-2] if word.lower().endswith("'s") else word
             for word in words]
    # remove numbers
    words = [word for word in words if not word.isdigit()]

    if self.collocations:
        word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)

    return word_counts
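# A small illustration, not from the original source: TWEET_REGEXES is defined
# elsewhere, so the sample sub-patterns below are assumptions standing in for
# it. They show how joining alternatives with '|' into a single capture group
# behaves under re.VERBOSE | re.IGNORECASE.
import re

SAMPLE_REGEXES = [r'\#\w+', r'@\w+', r"\w[\w']+"]  # hashtag, mention, word
pattern = re.compile(r'(' + '|'.join(SAMPLE_REGEXES) + ')',
                     re.VERBOSE | re.IGNORECASE)
print(re.findall(pattern, "Loving #Python with @wordcloud today"))
# -> ['Loving', '#Python', 'with', '@wordcloud', 'today']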
def process_text(self, text: str) -> Dict[str, int]:
    """
    Splits a long text into words.
    If the `persian_normalize` attribute has been set to True, normalizes
    `text` with the Hazm Normalizer.
    If the `include_numbers` attribute has been set to False, removes all
    Persian, English and Arabic numbers from `text`.

    :param text: The text we want to process
    :return: a dictionary. Keys are words and values are the frequencies.
    """
    # Python 2 needs re.UNICODE to tokenize unicode text; a no-op on Python 3.
    flags = (re.UNICODE if version < '3'
             and type(text) is unicode  # noqa: F821
             else 0)

    if self.persian_normalize:
        normalizer = Normalizer()
        text = normalizer.normalize(text)

    if not self.include_numbers:
        # Strip ASCII (0-9), Extended Arabic-Indic (Persian, U+06F0-U+06F9)
        # and Arabic-Indic (U+0660-U+0669) digits.
        text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

    if self.regexp:
        words = re.findall(self.regexp, text, flags)
    else:
        words = word_tokenize(text)

    if self.collocations:
        word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)

    return word_counts
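# A quick check, not from the original source, of the digit-stripping pattern
# above: ASCII, Persian (Extended Arabic-Indic) and Arabic-Indic digits are
# all removed, leaving only the separating spaces.
import re

print(re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", "2024 ۱۴۰۳ ٦٦"))
# -> "  "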
def process_text(self, text):
    """Splits a long text into words, eliminates the stopwords.

    Parameters
    ----------
    text : string
        The text to be processed.

    Returns
    -------
    words : dict (string, int)
        Word tokens with associated frequency.

    .. versionchanged:: 1.2.2
        Changed return type from list of tuples to dict.

    Notes
    -----
    There are better ways to do word tokenization, but I don't want to
    include all those things.
    """
    stopwords = set([i.lower() for i in self.stopwords])

    flags = (re.UNICODE if sys.version < '3' and type(text) is unicode
             else 0)
    regexp = self.regexp if self.regexp is not None else r"\w[\w']+"

    words = re.findall(regexp, text, flags)
    # remove stopwords
    words = [word for word in words if word.lower() not in stopwords]
    # remove 's
    words = [word[:-2] if word.lower().endswith("'s") else word
             for word in words]
    # remove numbers
    words = [word for word in words if not word.isdigit()]
    # remove arabic characters
    if self.only_persian:
        words = [self.remove_ar(word) for word in words]

    if self.collocations:
        word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)

    return word_counts
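# A minimal sketch, not from the original source, of what the default pattern
# r"\w[\w']+" above keeps: tokens of two or more characters, with apostrophes
# allowed after the first character, so one-letter words like "a" are dropped
# before the stopword filter even runs.
import re

print(re.findall(r"\w[\w']+", "It's a sunny day, isn't it?"))
# -> ["It's", 'sunny', 'day', "isn't", 'it']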
def process_text(self, text: str) -> Dict[str, int]:
    """
    Splits a long text into words.
    If the `persian_normalize` attribute has been set to True, normalizes
    `text` with the Hazm Normalizer.
    If the `include_numbers` attribute has been set to False, removes all
    Persian, English and Arabic numbers from `text`.
    Attention: this method will not remove stopwords from the input.

    :param text: The text we want to process
    :return: a dictionary. Keys are words and values are the frequencies.
    """
    # Python 2 needs re.UNICODE to tokenize unicode text; a no-op on Python 3.
    flags = (re.UNICODE if version < '3'
             and type(text) is unicode  # noqa: F821
             else 0)

    if self.remove_unhandled_utf_characters:
        text = WordCloudFa.unhandled_characters_regex.sub(r'', text)

    if self.persian_normalize:
        normalizer = Normalizer()
        text = normalizer.normalize(text)

    if not self.include_numbers:
        text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

    if self.regexp:
        words = re.findall(self.regexp, text, flags)
    else:
        words = word_tokenize(text)

    if self.collocations:
        # We remove stopwords in the WordCloudFa, so there is no need for
        # passing them in this function.
        word_counts = unigrams_and_bigrams(words, [], self.normalize_plurals,
                                           self.collocation_threshold)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)

    return word_counts
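# A hedged usage sketch, not from the original source: the keyword arguments
# below mirror the attributes read by process_text above and are assumed to
# be accepted by the WordCloudFa initializer.
wc = WordCloudFa(persian_normalize=True, include_numbers=False,
                 collocations=False)
counts = wc.process_text("سلام دنیا سلام")
print(counts)
# Expected shape: token -> frequency, e.g. {'سلام': 2, 'دنیا': 1}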