示例#1
0
def test_remove_accents():
    in_outs = [
        ("El niño se asustó del pingüino -- qué miedo!",
         "El nino se asusto del pinguino -- que miedo!"),
        ("Le garçon est très excité pour la forêt.",
         "Le garcon est tres excite pour la foret."),
    ]
    for in_, out_ in in_outs:
        assert preprocessing.remove_accents(in_, fast=False) == out_
        assert preprocessing.remove_accents(in_, fast=True) == out_
示例#2
0
def preprocess_sentence(sent, lower=True):
    """Pre-process a sentence ( via ``textacy.preprocess` module ).

    Args:
        sent (str): text.
        lower (bool): whether to return a lowercase string.

    Returns:
        str
    """
    # normalize unicode
    sent = preprocessing.normalize_unicode(sent)

    # deaccent
    sent = preprocessing.remove_accents(sent)

    # replace newline chars
    sent = re.sub("\n|\r", " ", sent)

    # unpack contractions
    sent = contractions.fix(sent)

    # replace emoji symbols
    sent = preprocessing.replace_emojis(sent)

    # replace hashtags
    sent = preprocessing.replace_hashtags(sent)

    # replace user handles
    sent = preprocessing.replace_user_handles(sent)

    # replace currency symbols
    sent = preprocessing.replace_currency_symbols(sent)

    # replace emails
    sent = preprocessing.replace_emails(sent)

    # replace URLs
    sent = preprocessing.replace_urls(sent)

    # remove punctuation
    sent = preprocessing.remove_punctuation(sent)

    # normalize whitespace
    sent = preprocessing.normalize_whitespace(sent)

    if lower:
        sent = sent.lower()
    return sent
示例#3
0
def text_cleanup(text):
    "cleanup our text"

    text = preprocessing.replace_emails(text, replace_with='')
    text = preprocessing.replace_urls(text, replace_with='')
    text = preprocessing.replace_hashtags(text, replace_with='')
    text = preprocessing.replace_phone_numbers(text, replace_with='')
    text = preprocessing.replace_numbers(text, replace_with='')

    text = preprocessing.remove_accents(text)
    text = preprocessing.remove_punctuation(text)

    text = preprocessing.normalize_quotation_marks(text)
    text = preprocessing.normalize_hyphenated_words(text)
    text = text.replace('\n', ' ').replace('\t', ' ')
    text = text.lower()

    text = preprocessing.normalize_whitespace(text)
    return text
示例#4
0
def textacy_preprocess(sentence):
    """Preprocess text."""
    sentence = preprocessing.normalize_hyphenated_words(sentence)
    sentence = preprocessing.normalize_quotation_marks(sentence)
    #sentence = preprocessing.normalize_repeating_chars(sentence)
    sentence = preprocessing.normalize_unicode(sentence)
    sentence = preprocessing.normalize_whitespace(sentence)
    sentence = preprocessing.remove_accents(sentence)
    sentence = preprocessing.remove_punctuation(sentence)
    sentence = preprocessing.replace_currency_symbols(sentence)
    sentence = preprocessing.replace_emails(sentence)
    sentence = preprocessing.replace_emojis(sentence)
    sentence = preprocessing.replace_hashtags(sentence)
    sentence = preprocessing.replace_numbers(sentence)
    sentence = preprocessing.replace_phone_numbers(sentence)
    sentence = preprocessing.replace_urls(sentence)
    sentence = preprocessing.replace_user_handles(sentence)

    return sentence
def load(path):
    email_text = extract_email_text(path)
    if not email_text:
        return []

    # use textacy to do the processing, remove the whitesapace, punctuation
    email_text = preprocessing.normalize_whitespace(
        preprocessing.remove_punctuation(email_text))
    # remove accents and noralize unicode
    email_text = preprocessing.normalize_unicode(
        preprocessing.remove_accents(email_text))

    # Tokenize the message
    tokens = to_tokenized_text(email_text)

    # Remove stopwords and stem tokens
    if len(tokens) > 2:
        # extract stemming word
        return [w.lemma_ for w in tokens if w not in nlp.Defaults.stopwords]
    return []
示例#6
0
def preprocess_text(text,
                    char_count_filter=True,
                    stopwords=None,
                    min_len=2,
                    max_len=15):
    """
    Pre-processing steps prior to spaCy nlp pipeline. Optional filtering of
    tokens based on character length.

    Parameters
    ----------
    text : str
    char_count_filter : bool
    stopwords : iterable, None
    min_len : int
    max_len : int

    Returns
    -------
    text : str
        pre-processed text
    """
    # 1) convert to lower case for robust stop-word recognition
    text = text.lower()

    # 2) normalise
    text = preprocessing.normalize_quotation_marks(text)
    # text = preprocessing.normalize_repeating_chars(text)
    text = preprocessing.normalize_hyphenated_words(text)
    text = preprocessing.normalize_whitespace(text)

    # 3) replace
    text = preprocessing.replace_currency_symbols(text)
    text = preprocessing.replace_emails(text)
    text = preprocessing.replace_emojis(text)
    text = preprocessing.replace_hashtags(text)
    text = preprocessing.replace_numbers(text)
    text = preprocessing.replace_phone_numbers(text)
    text = preprocessing.replace_urls(text)
    text = preprocessing.replace_user_handles(text)

    # 4) remove
    text = preprocessing.remove_accents(text)
    text = preprocessing.remove_punctuation(text)
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # keep text and numbers

    # 5) optionally remove tokens based on length
    if char_count_filter & (stopwords is not None):
        # filter based on token length
        tokens = gensim.utils.simple_preprocess(doc=text,
                                                min_len=min_len,
                                                max_len=max_len)
        # filter case-specific words
        tokens = [token for token in tokens if token not in stopwords]

        # convert processed list of tokens back to one string
        text = " ".join(tokens)
    else:
        raise NotImplementedError("Not implemented.")

    return text
    def clean_tweet(self, text):
        # FIXED UNICODE
        # text = preprocess.fix_bad_unicode(text)
        text = ftfy.fix_text(text)

        # GET TEXT ONLY FROM HTML
        text = BeautifulSoup(text, features='lxml').getText()

        # UN-PACK CONTRACTIONS
        text = preprocess.unpack_contractions(text)

        # REMOVE URL
        # text = preprocess.replace_urls(text)
        text = preprocessing.replace_urls(text)

        # REMOVE EMAILS
        # text = preprocess.replace_emails(text)
        text = preprocessing.replace_emails(text)

        # REMOVE PHONE NUMBERS
        # text = preprocess.replace_phone_numbers(text)
        text = preprocessing.replace_phone_numbers(text)

        # REMOVE NUMBERS
        # text = preprocess.replace_numbers(text)
        text = preprocessing.replace_numbers(text)

        # REMOVE CURRENCY
        # text = preprocess.replace_currency_symbols(text)
        text = preprocessing.replace_currency_symbols(text)

        # REMOVE ACCENTS
        # text = preprocess.remove_accents(text)
        text = preprocessing.remove_accents(text)

        # CONVERT EMOJIS TO TEXT
        words = text.split()
        reformed = [
            self.SMILEY[word] if word in self.SMILEY else word
            for word in words
        ]
        text = " ".join(reformed)
        text = emoji.demojize(text)
        text = text.replace(":", " ")
        text = ' '.join(text.split())

        # SPLIT ATTACHED WORDS
        text = ' '.join(re.findall('[A-Z][^A-Z]*', text))

        # SPLIT UNDERSCORE WORDS
        text = text.replace('_', ' ')

        # REMOVE PUNCTUATION
        # text = preprocess.remove_punct(text)
        text = preprocessing.remove_punctuation(text)

        # Remove numbers
        text = re.sub(r'\d', '', text)

        # REMOVE WORDS LESS THAN 3 CHARACTERS
        text = re.sub(r'\b\w{1,2}\b', '', text)

        # NORMALIZE WHITESPACE
        # text = preprocess.normalize_whitespace(text)
        text = preprocessing.normalize_whitespace(text)

        return text