def test_remove_accents():
    """Check that ``remove_accents`` strips diacritics in both ``fast`` modes."""
    # Maps accented input text to the expected accent-free output.
    expected_by_input = {
        "El niño se asustó del pingüino -- qué miedo!": "El nino se asusto del pinguino -- que miedo!",
        "Le garçon est très excité pour la forêt.": "Le garcon est tres excite pour la foret.",
    }
    for accented, expected in expected_by_input.items():
        # Both code paths must agree on the result.
        for fast in (False, True):
            assert preprocessing.remove_accents(accented, fast=fast) == expected
def preprocess_sentence(sent, lower=True):
    """Pre-process a sentence (via the ``preprocessing`` and ``contractions`` modules).

    Steps run in this order: Unicode normalization, accent removal,
    newline / carriage-return replacement, contraction expansion, replacement
    of emojis, hashtags, user handles, currency symbols, emails and URLs
    (each with the helper's default replacement), punctuation removal and
    whitespace normalization.

    Args:
        sent (str): text.
        lower (bool): whether to return a lowercase string.

    Returns:
        str
    """
    # Unicode normalization, then accent stripping.
    cleaned = preprocessing.remove_accents(preprocessing.normalize_unicode(sent))
    # Line breaks become plain spaces before contractions are unpacked.
    cleaned = contractions.fix(re.sub("\n|\r", " ", cleaned))

    # The replace_* helpers run before punctuation removal so the symbols
    # they look for are still present; whitespace is tidied last.
    pipeline = (
        preprocessing.replace_emojis,
        preprocessing.replace_hashtags,
        preprocessing.replace_user_handles,
        preprocessing.replace_currency_symbols,
        preprocessing.replace_emails,
        preprocessing.replace_urls,
        preprocessing.remove_punctuation,
        preprocessing.normalize_whitespace,
    )
    for step in pipeline:
        cleaned = step(cleaned)

    return cleaned.lower() if lower else cleaned
def text_cleanup(text):
    """Clean up ``text`` and return it lowercased with normalized whitespace.

    Emails, URLs, hashtags, phone numbers and numbers are replaced with the
    empty string; quotation marks and hyphenated words are normalized;
    accents and punctuation are removed; newlines and tabs become spaces.

    The two ``normalize_*`` helpers run *before* ``remove_punctuation``.
    They look for hyphens and quotation marks, so running them after
    punctuation removal (as was done previously) left them with nothing to
    act on.
    """
    # Remove structured entities while their identifying punctuation
    # ("@", "://", "#", "-") is still in the text. Phone numbers go before
    # plain numbers so the digits are still intact when they are matched.
    text = preprocessing.replace_emails(text, replace_with='')
    text = preprocessing.replace_urls(text, replace_with='')
    text = preprocessing.replace_hashtags(text, replace_with='')
    text = preprocessing.replace_phone_numbers(text, replace_with='')
    text = preprocessing.replace_numbers(text, replace_with='')

    # Punctuation-dependent normalization must precede punctuation removal.
    text = preprocessing.normalize_quotation_marks(text)
    text = preprocessing.normalize_hyphenated_words(text)

    text = preprocessing.remove_accents(text)
    text = preprocessing.remove_punctuation(text)

    text = text.replace('\n', ' ').replace('\t', ' ')
    text = text.lower()
    text = preprocessing.normalize_whitespace(text)
    return text
def textacy_preprocess(sentence):
    """Preprocess text.

    Applies the same set of ``preprocessing`` helpers as before, but in
    dependency order instead of alphabetical order:

    1. Normalize Unicode, quotation marks and hyphenated words.
    2. Replace URLs, emails, phone numbers, numbers, hashtags, user handles,
       currency symbols and emojis (default replacements).
    3. Remove accents and punctuation.
    4. Normalize whitespace.

    The replacements in step 2 rely on punctuation such as "@", "#", "://"
    and "." to recognise their targets, so they must run before
    ``remove_punctuation``. Phone numbers are replaced before generic numbers
    so their digits are still present when the phone pattern is applied.
    Whitespace is normalized last because the earlier steps can leave gaps.
    """
    # 1) normalization that depends on the original characters
    sentence = preprocessing.normalize_unicode(sentence)
    sentence = preprocessing.normalize_quotation_marks(sentence)
    sentence = preprocessing.normalize_hyphenated_words(sentence)

    # 2) entity replacement, most specific patterns first
    sentence = preprocessing.replace_urls(sentence)
    sentence = preprocessing.replace_emails(sentence)
    sentence = preprocessing.replace_phone_numbers(sentence)
    sentence = preprocessing.replace_numbers(sentence)
    sentence = preprocessing.replace_hashtags(sentence)
    sentence = preprocessing.replace_user_handles(sentence)
    sentence = preprocessing.replace_currency_symbols(sentence)
    sentence = preprocessing.replace_emojis(sentence)

    # 3) character-level stripping
    sentence = preprocessing.remove_accents(sentence)
    sentence = preprocessing.remove_punctuation(sentence)

    # 4) final whitespace tidy-up
    sentence = preprocessing.normalize_whitespace(sentence)
    return sentence
def load(path):
    """Load the email at ``path`` and return the lemmas of its non-stop-word tokens.

    The text from ``extract_email_text`` has punctuation removed, whitespace
    and Unicode normalized and accents stripped, and is then tokenized with
    ``to_tokenized_text``.

    Returns:
        list: ``lemma_`` of every token whose lowercased text is not in the
        stop-word collection. An empty list is returned when no text could be
        extracted or when the message has two tokens or fewer.
    """
    email_text = extract_email_text(path)
    if not email_text:
        return []

    # Strip punctuation, then collapse the whitespace it leaves behind.
    email_text = preprocessing.normalize_whitespace(
        preprocessing.remove_punctuation(email_text))
    # Remove accents and normalize Unicode.
    email_text = preprocessing.normalize_unicode(
        preprocessing.remove_accents(email_text))

    # Tokenize the message.
    tokens = to_tokenized_text(email_text)
    if len(tokens) <= 2:
        return []

    # NOTE(review): spaCy exposes stop words as ``Defaults.stop_words``;
    # confirm that ``stopwords`` is really defined on this ``nlp`` object.
    stopwords = nlp.Defaults.stopwords

    # Compare the token's text, not the token object itself: an object that
    # carries ``lemma_`` is not a string and would never match an entry in a
    # collection of stop-word strings, so nothing was being filtered.
    # Assumes spaCy-style tokens exposing ``.text`` -- TODO confirm.
    return [w.lemma_ for w in tokens if w.text.lower() not in stopwords]
def preprocess_text(text, char_count_filter=True, stopwords=None, min_len=2, max_len=15):
    """
    Pre-processing steps prior to spaCy nlp pipeline.

    The text is lowercased, normalized (quotation marks, hyphenated words,
    whitespace), has currency symbols, emails, emojis, hashtags, numbers,
    phone numbers, URLs and user handles replaced, and is stripped of
    accents, punctuation and any remaining non-alphanumeric characters.
    Tokens are then filtered by character length and against ``stopwords``.

    Parameters
    ----------
    text : str
    char_count_filter : bool
        must be truthy; see Raises
    stopwords : iterable, None
        words to drop after tokenization; must not be None, see Raises
    min_len : int
        minimum token length passed to ``gensim.utils.simple_preprocess``
    max_len : int
        maximum token length passed to ``gensim.utils.simple_preprocess``

    Returns
    -------
    text : str
        pre-processed text, tokens joined by single spaces

    Raises
    ------
    NotImplementedError
        If ``char_count_filter`` is falsy or ``stopwords`` is None. Only the
        combined length + stop-word filter is implemented, so calling with
        the default arguments raises.
    """
    # 1) convert to lower case for robust stop-word recognition
    text = text.lower()

    # 2) normalise
    text = preprocessing.normalize_quotation_marks(text)
    text = preprocessing.normalize_hyphenated_words(text)
    text = preprocessing.normalize_whitespace(text)

    # 3) replace
    text = preprocessing.replace_currency_symbols(text)
    text = preprocessing.replace_emails(text)
    text = preprocessing.replace_emojis(text)
    text = preprocessing.replace_hashtags(text)
    text = preprocessing.replace_numbers(text)
    text = preprocessing.replace_phone_numbers(text)
    text = preprocessing.replace_urls(text)
    text = preprocessing.replace_user_handles(text)

    # 4) remove
    text = preprocessing.remove_accents(text)
    text = preprocessing.remove_punctuation(text)
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # keep text and numbers

    # 5) filter tokens by length and stop words.
    # Logical ``and`` rather than bitwise ``&``: a truthy non-bool flag such
    # as 2 would otherwise evaluate to 0 and wrongly take the error path.
    if not (char_count_filter and stopwords is not None):
        raise NotImplementedError("Not implemented.")

    # Materialize once: gives constant-time membership tests and keeps a
    # one-shot iterable (e.g. a generator) from being consumed by the first
    # lookup.
    stopword_set = set(stopwords)

    # filter based on token length
    tokens = gensim.utils.simple_preprocess(doc=text, min_len=min_len, max_len=max_len)
    # filter case-specific words and rebuild a single string
    return " ".join(token for token in tokens if token not in stopword_set)
def clean_tweet(self, text):
    """Clean a tweet and return the remaining text.

    Steps, in order: fix broken Unicode, extract text from HTML, unpack
    contractions, replace URLs / emails / phone numbers / numbers / currency
    symbols, remove accents, substitute tokens found in ``self.SMILEY``,
    convert emojis to their text names, split words at capital letters,
    replace underscores with spaces, remove punctuation and digits, drop
    words of one or two characters, and normalize whitespace.

    Args:
        text (str): raw tweet text.

    Returns:
        str: the cleaned text.
    """
    # Repair mojibake / broken Unicode.
    text = ftfy.fix_text(text)
    # Keep only the text content of any HTML.
    text = BeautifulSoup(text, features='lxml').getText()
    # Unpack contractions.
    # NOTE(review): this is the only call through ``preprocess``; every other
    # helper comes from ``preprocessing`` -- confirm both are imported.
    text = preprocess.unpack_contractions(text)

    # Replace URLs, emails, phone numbers, numbers and currency symbols.
    text = preprocessing.replace_urls(text)
    text = preprocessing.replace_emails(text)
    text = preprocessing.replace_phone_numbers(text)
    text = preprocessing.replace_numbers(text)
    text = preprocessing.replace_currency_symbols(text)
    # Remove accents.
    text = preprocessing.remove_accents(text)

    # Convert smileys and emojis to text.
    smileys = self.SMILEY
    text = " ".join(
        smileys[word] if word in smileys else word for word in text.split()
    )
    text = emoji.demojize(text)
    text = text.replace(":", " ")
    text = ' '.join(text.split())

    # Split attached words at capital letters ("ForTheWin" -> "For The Win").
    # The second alternative keeps any text that precedes the first capital
    # letter; matching only '[A-Z][^A-Z]*' silently discarded it, which
    # turned an all-lowercase tweet into an empty string.
    text = ' '.join(re.findall(r'[A-Z][^A-Z]*|[^A-Z]+', text))
    # Split underscore-joined words.
    text = text.replace('_', ' ')

    # Remove punctuation.
    text = preprocessing.remove_punctuation(text)
    # Remove any remaining digits.
    text = re.sub(r'\d', '', text)
    # Remove words shorter than 3 characters.
    text = re.sub(r'\b\w{1,2}\b', '', text)
    # Normalize whitespace.
    text = preprocessing.normalize_whitespace(text)
    return text