def tokenize(self, text):
    """Tokenize casual Twitter speech into a list of word strings.

    Closely based on nltk's casual tokenizer; each ``preserve_*`` flag
    on the instance enables or disables one normalization pass.

    ::param text:: tweet text
    ::type text:: str
    """
    cleaned = _replace_html_entities(text)
    if not self.preserve_handles:
        cleaned = re.sub(TWITTER_USER_RE, ' ', cleaned)
    if not self.preserve_hashes:
        cleaned = re.sub(HASH_RE, '', cleaned)
    if not self.preserve_url:
        cleaned = re.sub(URL_RE, ' ', cleaned)
    if not self.preserve_len:
        cleaned = reduce_lengthening(cleaned)
    if self.regularize:
        cleaned = self.R.regularize(cleaned)
    if not self.preserve_emoji:
        cleaned = self.strip_emoji(cleaned)
    tokens = self.WORD_RE.findall(cleaned)
    if self.preserve_case:
        return tokens
    # Lowercase everything except emoticons (":D" must not become ":d").
    return [tok if EMOTICON_RE.search(tok) else tok.lower() for tok in tokens]
def tokenize(self, text):
    """Tokenize *text* into a list of word strings.

    Pipeline: decode HTML entities, apply the instance's quote and
    punctuation regexes, pad with spaces so boundary-sensitive patterns
    match at the edges, split contractions, then insert a space before
    any emoticon that abuts the previous character so the final
    ``split()`` keeps it as its own token.

    :param text: input string
    :rtype: list(str)
    """
    text = _replace_html_entities(text)
    for regexp, substitution in self.STARTING_QUOTES:
        text = regexp.sub(substitution, text)
    for regexp, substitution in self.PUNCTUATION:
        text = regexp.sub(substitution, text)
    # Pad so edge-anchored patterns below can match at the boundaries.
    text = " " + text + " "
    # split contractions
    for regexp, substitution in self.ENDING_QUOTES:
        text = regexp.sub(substitution, text)
    for regexp in self.CONTRACTIONS:
        text = regexp.sub(r' \1 \2 ', text)
    # handle emojis: walk matches right-to-left so earlier match spans
    # remain valid after a space is inserted.
    for emoticon in list(EMOTICON_RE.finditer(text))[::-1]:
        pos = emoticon.span()[0]
        # BUG FIX: guard pos > 0 — the original evaluated text[pos - 1]
        # unconditionally, which reads text[-1] (the LAST character) when
        # an emoticon matches at position 0.
        if pos > 0 and text[pos - 1] != ' ':
            text = text[:pos] + ' ' + text[pos:]
    return text.split()
def process_headlines(row):
    """Normalize the 'headline' field of *row* into a clean token string.

    Decodes HTML entities, collapses whitespace, re-joins NLTK word
    tokens with single spaces, and masks every digit with '%'.

    :param row: mapping with a 'headline' str entry
    :return: normalized headline string
    """
    headline = _replace_html_entities(row['headline'])
    # FIX: raw string for the pattern — '\s' inside a plain string literal
    # is an invalid escape sequence (DeprecationWarning today, an error in
    # future Python). strip() replaces the lstrip().rstrip() pair.
    headline = re.sub(r'\s+', ' ', headline).strip()
    headline = ' '.join(nltk.word_tokenize(headline))
    return re.sub(r'\d', '%', headline)
def tokenize(self, text):
    """Tokenize *text* into a list of strings.

    :param text: str
    :rtype: list(str)
    :return: a tokenized list of strings; concatenating this list returns\
        the original string if `preserve_case=False`
    """
    # Fix HTML character entities first.
    text = _replace_html_entities(text)
    if self.strip_handles:
        # Drop @username handles entirely.
        text = remove_handles(text)
    if self.reduce_len:
        # Normalize word lengthening ("waaaaay" -> "waaay").
        text = reduce_lengthening(text)
    # The HANG_RE pass that shortens problematic character runs is
    # deliberately disabled here: the text is tokenized as-is.
    # (Re-enable with HANG_RE.sub(r'\1\1\1', text) if needed.)
    tokens = WORD_RE.findall(text)
    if self.preserve_case:
        return tokens
    # Lowercase tokens, but leave emoticons like :D untouched.
    return [t if EMOTICON_RE.search(t) else t.lower() for t in tokens]
def tokenize(self, text):
    """Tokenize casual Reddit speech into a list of word strings.

    Closely based on nltk's casual tokenizer; each ``preserve_*`` flag
    on the instance enables or disables one normalization pass.

    ::param text:: reddit text
    ::type text:: str
    """
    cleaned = _replace_html_entities(text)
    if not self.preserve_handles:
        cleaned = re.sub(REDDIT_USER_RE, ' ', cleaned)
    if not self.preserve_hashes:
        cleaned = re.sub(HASH_RE, '', cleaned)
    if not self.preserve_url:
        cleaned = re.sub(URL_RE, ' ', cleaned)
    if not self.preserve_ellipsis:
        cleaned = re.sub(ELLIPSIS_RE, ' ', cleaned)
    if not self.preserve_numbers:
        cleaned = re.sub(NUMBERS_RE, ' ', cleaned)
    if not self.preserve_aposS:
        # Drop possessive 's / 'S endings.
        cleaned = re.sub(r"""'[sS]\b""", '', cleaned)
    if not self.preserve_len:
        cleaned = reduce_lengthening(cleaned)
    if self.regularize:
        cleaned = self.R.regularize(cleaned)
    if not self.preserve_emoji:
        cleaned = self.strip_emoji(cleaned)
    tokens = self.WORD_RE.findall(cleaned)
    if self.preserve_case:
        return tokens
    # Lowercase everything except emoticons (":D" must not become ":d").
    return [tok if EMOTICON_RE.search(tok) else tok.lower() for tok in tokens]
def tokenize_leads(row):
    """Normalize every lead string in row['all_leads'].

    Each lead gets HTML entities decoded, whitespace collapsed, NLTK
    word tokens re-joined with single spaces, and digits masked as '%'.

    :param row: mapping with an iterable of lead strings under 'all_leads'
    :return: list of normalized lead strings
    """
    def _clean(lead):
        # One lead: decode entities, squeeze whitespace, tokenize, mask digits.
        lead = _replace_html_entities(lead)
        # FIX: raw string for the pattern — '\s' inside a plain string
        # literal is an invalid escape sequence. strip() == lstrip().rstrip().
        lead = re.sub(r'\s+', ' ', lead).strip()
        lead = ' '.join(word_tokenize(lead))
        return re.sub(r'\d', '%', lead)

    return [_clean(lead) for lead in row['all_leads']]
def replace_html_entities(txt):
    """Replace the html entities in text with corresponding unicode entities.

    Uses UTF-8 encoding. Thin public wrapper delegating to nltk's
    ``casual._replace_html_entities``.

    Args:
        txt (str): input string

    Returns:
        str: *txt* with HTML character entities decoded.
    """
    return casual._replace_html_entities(txt)
def text_filter(self, text):
    """Clean *text*: replace '|||' separators with spaces, decode HTML
    entities, and (optionally) collapse repeated punctuation/characters.

    :param text: input string
    :return: filtered string
    """
    text = text.replace('|||', ' ')
    text = _replace_html_entities(text)
    if text and self.merge_repeated_punc:
        # FIX: pass the handler directly instead of wrapping it in a
        # redundant ``lambda w: handle_repeated_puncts(w)`` — re.sub
        # already calls the replacement with the match object.
        text = self.compiled_regexes["REPEAT_PUNCTS"].sub(
            handle_repeated_puncts, text)
        # NOTE(review): REPEAT_CHARS also routes through
        # handle_repeated_puncts — confirm a char-specific handler was
        # not intended here.
        text = self.compiled_regexes["REPEAT_CHARS"].sub(
            handle_repeated_puncts, text)
    return text
def word_tknz_artsentence(row):
    """Clean and word-tokenize the sentences in row['article_sentences'].

    Sentences are separated by blank lines. Each one gets HTML entities
    decoded, non-BMP/dotted characters and leading URLs removed,
    whitespace collapsed, NLTK word tokens re-joined, and digits masked
    with '%'. Returns all processed sentences joined by single spaces.

    :param row: mapping with an 'article_sentences' str entry
    :return: one space-joined, digit-masked string
    """
    sentences = row['article_sentences'].split('\n\n')
    proc_sent = []
    for sent in sentences:
        sent = _replace_html_entities(sent)
        sent = re.sub(non_bmp, ' ', sent)
        sent = re.sub(dotted, ' ', sent)
        sent = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', sent, flags=re.MULTILINE)
        # FIX: raw string — '\s' inside a plain string literal is an
        # invalid escape sequence. strip() == lstrip().rstrip().
        sent = re.sub(r'\s+', ' ', sent).strip()
        proc_sent.append(' '.join(nltk.word_tokenize(sent)))
    # NOTE(review): the sentences are tokenized a second time below, as in
    # the original code; word_tokenize is not guaranteed idempotent, so the
    # second pass is kept to preserve behavior exactly.
    tknz_sent = [' '.join(nltk.word_tokenize(s)) for s in proc_sent]
    dgtz_sent = [re.sub(r'\d', '%', s) for s in tknz_sent]
    return ' '.join(dgtz_sent)
def tokenize(self, text):
    """Tokenize *text* into a list of strings.

    Normalizes URLs, usernames and word lengthening depending on the
    attributes of the instance.

    :param text: str
    :rtype: list(str)
    """
    # Fix HTML character entities.
    text = _replace_html_entities(text)
    # Usernames are either stripped or normalized (strip takes priority).
    if self.strip_handles:
        text = remove_handles(text)
    elif self.normalize_usernames:
        text = normalize_mentions(text)
    if self.normalize_urls:
        # Shorten problematic URL sequences.
        text = normalize_urls(text)
    if self.reduce_len:
        # Cap hanging character runs, then normalize word lengthening.
        text = HANG_RE.sub(r'\1\1\1', text)
        text = reduce_lengthening(text)
    # Always cap problematic character runs right before tokenizing.
    words = WORD_RE.findall(HANG_RE.sub(r'\1\1\1', text))
    if not self.preserve_case:
        # Lowercase, but keep emoticons / all-upper words per _lowerize.
        words = [_lowerize(w, self.keep_allupper) for w in words]
    # NOTE(review): _stock_code is applied unconditionally here, matching
    # the most plausible reading of the original — confirm it was not
    # meant to run only when preserve_case is False.
    return [_stock_code(w) for w in words]
def clean(self, text):
    """Remove URLs, e-mails, usernames, hashtags and phone numbers from
    *text*; with ``type_include`` unset, simply tweet-tokenize instead.

    :param text: str
    :return: list of tokens when type_include is falsy, else the cleaned str
    """
    if not self.type_include:
        return NLTK.TweetTokenizer().tokenize(text)
    # Fix HTML character entities.
    text = NLTK._replace_html_entities(text)
    # Cap hanging character runs before tokenizing.
    tokens = WORD_RE.findall(NLTK.HANG_RE.sub(r'\1\1\1', text))
    clean_text = text
    # Any token matching one of these patterns is erased from the text.
    # Note str.replace removes ALL occurrences of the matched token.
    noise_patterns = (URLS_RE, EMAIL_RE, USERNAME_RE, HASHTAG_RE, PHONUM_RE)
    for tok in tokens:
        if any(pattern.match(tok) for pattern in noise_patterns):
            clean_text = clean_text.replace(tok, '')
    return clean_text
def tokenize(self, text):
    """
    :param text: str
    :rtype: list(str)
    :return: a tokenized list of strings; concatenating this list returns\
        the original string if `preserve_case=False`
    """
    # Fix HTML character entities:
    text = _replace_html_entities(text)
    # Remove username handles
    if self.strip_handles:
        text = remove_handles(text)
    # Normalize word lengthening
    if self.reduce_len:
        text = reduce_lengthening(text)
    # Shorten problematic sequences of characters
    safe_text = HANG_RE.sub(r"\1\1\1", text)
    # FIX: removed a stray expression-statement string literal
    # (r"|<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>") that was dead code — it was
    # evaluated and discarded before the compile below.
    # The extra alternatives ahead of REGEXPS match Discord-style emotes
    # and mentions (:name:, <:name:id>, <a:name:id>, <...>).
    custom_re = regex.compile(
        r"""(%s)""" % "|".join(
            (
                r":[^:\s]+:",
                r"<:[^:\s]+:[0-9]+>",
                r"<a:[^:\s]+:[0-9]+>",
                r"<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>",
            )
            + REGEXPS
        ),
        regex.VERBOSE | regex.I | regex.UNICODE,
    )
    words = custom_re.findall(safe_text)
    # Possibly alter the case, but avoid changing emoticons like :D into :d:
    if not self.preserve_case:
        words = [w if EMOTICON_RE.search(w) else w.lower() for w in words]
    return words
def process_tweets(row):
    """Clean row['tweet'] into a normalized, tokenized string.

    Drops URL-bearing tokens, 'RT' markers and leading @-mentions,
    decodes HTML entities, caps character repetition at three, removes
    @handles, strips non-BMP/dotted characters and leading URLs,
    collapses whitespace, tokenizes with NLTK, and masks digits as '%'.

    :param row: mapping with a 'tweet' str entry
    :return: normalized tweet string
    """
    tweet_tkns = row['tweet'].split()
    twt_clean = [
        tkn for idx, tkn in enumerate(tweet_tkns)
        if 'https' not in tkn and 'RT' != tkn
        and not (idx == 1 and '@' in tkn) and not (idx == 0 and '@' in tkn)
    ]
    # FIX: guard against an empty list — the original indexed
    # twt_clean[-1] unconditionally and raised IndexError when every
    # token was filtered out above.
    if twt_clean and '@' in twt_clean[-1]:
        twt_clean = twt_clean[:-1]
    tweet = ' '.join(twt_clean)
    tweet = _replace_html_entities(tweet)
    # Cap any character repeated 3+ times at exactly three.
    # (Renamed from 'reduce_lengthening', which shadowed the module-level
    # helper of the same name.)
    squeeze_repeats = re.compile(r"(.)\1{2,}")
    tweet = squeeze_repeats.sub(r"\1\1\1", tweet)
    # Remove @handles of 1-20 word characters (same pattern as nltk's
    # remove_handles; renamed from the typo 'remode_handles').
    handle_re = re.compile(
        r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|"
        r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)")
    tweet = handle_re.sub(' ', tweet)
    tweet = re.sub(non_bmp, ' ', tweet)
    tweet = re.sub(dotted, ' ', tweet)
    tweet = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', tweet, flags=re.MULTILINE)
    # FIX: raw string — '\s' inside a plain string literal is an invalid
    # escape sequence. strip() == lstrip().rstrip().
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    tweet = ' '.join(nltk.word_tokenize(tweet))
    return re.sub(r'\d', '%', tweet)
def tokenize(self, text):
    """Tokenize *text* and tag each token with a coarse category code.

    With ``type_include`` unset, returns a plain tweet-tokenized list.
    Otherwise returns (token, tag) pairs where the tag is one of:
    'E' emoticon/emoji, 'U' URL/e-mail, 'USR' username, 'H' hashtag,
    'PN' phone number, 'S' stopword, 'PUNC' punctuation, 'N' other.
    """
    if not self.type_include:
        return NLTK.TweetTokenizer().tokenize(text)
    # Fix HTML character entities.
    text = NLTK._replace_html_entities(text)
    # Cap hanging character runs before tokenizing.
    safe_text = NLTK.HANG_RE.sub(r'\1\1\1', text)

    def _tag(tok):
        # First matching category wins; order mirrors the original chain.
        if EMOTICON_RE.match(tok) or EMOJI_RE.match(tok):
            return 'E'
        if URLS_RE.match(tok) or EMAIL_RE.match(tok):
            return 'U'
        if USERNAME_RE.match(tok):
            return 'USR'
        if HASHTAG_RE.match(tok):
            return 'H'
        if PHONUM_RE.match(tok):
            return 'PN'
        if tok.lower() in STOP:
            return 'S'
        if tok in PUNCTUATION:
            return 'PUNC'
        return 'N'

    return [(tok, _tag(tok)) for tok in WORD_RE.findall(safe_text)]
def unicodeReplacement(tweet):
    # One-statement delegate: decode HTML character entities (e.g. &amp;)
    # in the tweet text via nltk's casual-tokenizer helper.
    return _replace_html_entities(tweet)
def preprocess(tweet_text):
    """Preprocess a raw tweet string.

    Decodes HTML character entities, strips @username handles, and
    replaces every URL with a single space.

    :param tweet_text: raw tweet string
    :return: cleaned string
    """
    decoded = _replace_html_entities(tweet_text)
    without_handles = remove_handles(decoded)
    return URL_RE.sub(' ', without_handles)