def tokenize(self, text):
    '''Casual speech tokenizer wrapper function, closely based on nltk's version.
    Returns a list of words.

    :param text: tweet text
    :type text: str
    '''
    text = _replace_html_entities(text)
    if not self.preserve_handles:
        text = re.sub(TWITTER_USER_RE, ' ', text)
    if not self.preserve_hashes:
        text = re.sub(HASH_RE, '', text)
    if not self.preserve_url:
        text = re.sub(URL_RE, ' ', text)
    if not self.preserve_len:
        text = reduce_lengthening(text)
    if self.regularize:
        text = self.R.regularize(text)
    if not self.preserve_emoji:
        text = self.strip_emoji(text)
    words = self.WORD_RE.findall(text)
    if not self.preserve_case:
        # Lowercase everything except emoticons, so ':D' is not turned into ':d'
        words = [w if EMOTICON_RE.search(w) else w.lower() for w in words]
    return words
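# A minimal, self-contained sketch of the stripping steps above. The patterns
# here are illustrative stand-ins, NOT the module's actual TWITTER_USER_RE /
# HASH_RE / URL_RE definitions, which are not shown in this snippet.
import re

_TWITTER_USER = re.compile(r'@\w{1,15}')
_HASH = re.compile(r'#(?=\w)')
_URL = re.compile(r'https?://\S+')

def strip_tweet_markup(text):
    text = _TWITTER_USER.sub(' ', text)  # drop @handles
    text = _HASH.sub('', text)           # keep the tag word, drop the '#'
    text = _URL.sub(' ', text)           # drop links
    return text

print(strip_tweet_markup('@user LOVE this!!! https://t.co/x #nlp'))
# roughly: ' LOVE this!!!   nlp'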
def tokenize(self, text):
    '''Casual speech tokenizer wrapper function for Reddit, closely based on
    nltk's version. Returns a list of words.

    :param text: reddit text
    :type text: str
    '''
    text = _replace_html_entities(text)
    if not self.preserve_handles:
        text = re.sub(REDDIT_USER_RE, ' ', text)
    if not self.preserve_hashes:
        text = re.sub(HASH_RE, '', text)
    if not self.preserve_url:
        text = re.sub(URL_RE, ' ', text)
    if not self.preserve_ellipsis:
        text = re.sub(ELLIPSIS_RE, ' ', text)
    if not self.preserve_numbers:
        text = re.sub(NUMBERS_RE, ' ', text)
    if not self.preserve_aposS:
        text = re.sub(r"""'[sS]\b""", '', text)
    if not self.preserve_len:
        text = reduce_lengthening(text)
    if self.regularize:
        text = self.R.regularize(text)
    if not self.preserve_emoji:
        text = self.strip_emoji(text)
    words = self.WORD_RE.findall(text)
    if not self.preserve_case:
        # Lowercase everything except emoticons, so ':D' is not turned into ':d'
        words = [w if EMOTICON_RE.search(w) else w.lower() for w in words]
    return words
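# Illustrative sketch of the Reddit-specific removals above (ellipses, numbers,
# possessive 's). _ELLIPSIS and _NUMBERS are assumed stand-ins for the module's
# ELLIPSIS_RE / NUMBERS_RE, which are defined elsewhere.
import re

_ELLIPSIS = re.compile(r'\.{2,}|…')
_NUMBERS = re.compile(r'\b\d+(?:[.,]\d+)*\b')

def strip_reddit_noise(text):
    text = _ELLIPSIS.sub(' ', text)
    text = _NUMBERS.sub(' ', text)
    text = re.sub(r"'[sS]\b", '', text)  # possessive 's: "OP's" -> "OP"
    return text

print(strip_reddit_noise("OP's post... got 1,234 upvotes"))
# roughly: "OP post  got   upvotes"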
def preprocess_tweet(text):
    '''Clean raw tweet text: collapse word lengthening, strip entities
    (urls, users, lists), and normalize emoticons and negations.'''
    text = casual.reduce_lengthening(text)
    text = cleanString(setupRegexes('twitterProAna'), text)
    # Keep only the spans that the preprocessor did not flag as entities
    text = ' '.join([span for notentity, span in
                     tweetPreprocessor(text, ("urls", "users", "lists"))
                     if notentity])
    text = text.replace('\t', '')
    text = text.replace('< ', '<').replace(' >', '>')
    text = text.replace('):', '<sadface>').replace('(:', '<smile>')
    text = text.replace(" 't", "t").replace('#', '')
    return text
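# Hedged illustration of the literal replacement chain at the end of
# preprocess_tweet; the cleanString / setupRegexes / tweetPreprocessor entity
# handling is project-specific and omitted here.
text = "(: cool party ): couldn 't go #tbt"
text = text.replace('):', '<sadface>').replace('(:', '<smile>')
text = text.replace(" 't", "t").replace('#', '')
print(text)
# -> "<smile> cool party <sadface> couldnt go tbt"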
def tokenize(self, text): """ :param text: str :rtype: list(str) :return: a tokenized list of strings; Normalizes URLs, usernames and word lengthening depending of the attributes of the instance. """ # Fix HTML character entities: text = _replace_html_entities(text) # Remove or replace username handles if self.strip_handles: text = remove_handles(text) elif self.normalize_usernames: text = normalize_mentions(text) if self.normalize_urls: # Shorten problematic sequences of characters text = normalize_urls(text) # Normalize word lengthening if self.reduce_len: text = HANG_RE.sub(r'\1\1\1', text) text = reduce_lengthening(text) # Tokenize: safe_text = HANG_RE.sub(r'\1\1\1', text) words = WORD_RE.findall(safe_text) # Possibly alter the case, but avoid changing emoticons like :D into :d: # lower words but keep words that are all upper cases if not self.preserve_case: words = [_lowerize(w, self.keep_allupper) for w in words] words = [_stock_code(w) for w in words] return words
def tokenize(self, text): """ :param text: str :rtype: list(str) :return: a tokenized list of strings; concatenating this list returns\ the original string if `preserve_case=False` """ # Fix HTML character entities: text = _replace_html_entities(text) # Remove username handles if self.strip_handles: text = remove_handles(text) # Normalize word lengthening if self.reduce_len: text = reduce_lengthening(text) # Shorten problematic sequences of characters safe_text = HANG_RE.sub(r"\1\1\1", text) # Tokenize: r"|<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>" custom_Re = regex.compile( r"""(%s)""" % "|".join( ( r":[^:\s]+:", r"<:[^:\s]+:[0-9]+>", r"<a:[^:\s]+:[0-9]+>", r"<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>", ) + REGEXPS ), regex.VERBOSE | regex.I | regex.UNICODE, ) words = custom_Re.findall(safe_text) # Possibly alter the case, but avoid changing emoticons like :D into :d: if not self.preserve_case: words = list( map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words) ) return words