예제 #1
0
def clean_component(review, contract_model, stop_words, tokenizer, puncts):
    """Text Cleaner: Expand Contractions, Tokenize, Remove Stopwords, Punctuation, Lemmatize, Spell Correct, Lowercase"""

    # Expand contractions first so tokenization sees full words.
    expanded = list(contract_model.expand_texts([review], precise=True))
    tokens = tokenizer(expanded[0])

    # Keep lemmas of tokens that are neither stopwords nor punctuation/symbols.
    lemmas = []
    for token in tokens:
        if token.text in stop_words or token.text in puncts:
            continue
        if token.pos_ in ("PUNCT", "SYM"):
            continue
        lemmas.append(token.lemma_)

    # Hyphenated lemmas keep their (possibly parenthesized) core; everything
    # else is punctuation-stripped and spell-corrected.
    normalized = []
    for lemma in lemmas:
        if '-' in lemma:
            normalized.append(
                re.search(r'\(?([0-9A-Za-z-]+)\)?', lemma).group(1))
        else:
            normalized.append(spell(remove_punct(lemma)))

    # Lowercase, split multi-word results, and drop any stopwords reintroduced
    # by spell correction.
    cleaned = []
    for word in normalized:
        for piece in word.split():
            lowered = piece.lower()
            if lowered not in stop_words:
                cleaned.append(lowered)

    return cleaned
예제 #2
0
    def clean_tweet(self, text):
        """Normalize a raw tweet into clean, space-separated words.

        Pipeline: fix bad unicode, strip HTML, expand contractions, blank out
        URLs / emails / phone numbers / numbers / currency symbols, strip
        accents, map emoticons (self.SMILEY) and emojis to words, split
        camelCase and underscore-joined words, remove punctuation and digits,
        drop words shorter than 3 characters, and normalize whitespace.

        Parameters
        ----------
        text : str
            Raw tweet text (may contain HTML, emojis, URLs, ...).

        Returns
        -------
        str
            The cleaned text.
        """
        # FIXED UNICODE
        text = preprocess.fix_bad_unicode(text)

        # GET TEXT ONLY FROM HTML
        text = BeautifulSoup(text, features='lxml').getText()
        # UN-PACK CONTRACTIONS
        text = preprocess.unpack_contractions(text)

        # REMOVE URL
        text = preprocess.replace_urls(text)

        # REMOVE EMAILS
        text = preprocess.replace_emails(text)

        # REMOVE PHONE NUMBERS
        text = preprocess.replace_phone_numbers(text)

        # REMOVE NUMBERS
        text = preprocess.replace_numbers(text)

        # REMOVE CURRENCY
        text = preprocess.replace_currency_symbols(text)

        # REMOVE ACCENTS
        text = preprocess.remove_accents(text)

        # CONVERT EMOJIS TO TEXT
        # Emoticons first (dictionary lookup per word), then unicode emojis.
        words = text.split()
        reformed = [
            self.SMILEY[word] if word in self.SMILEY else word
            for word in words
        ]
        text = " ".join(reformed)
        text = emoji.demojize(text)
        text = text.replace(":", " ")
        text = ' '.join(text.split())

        # SPLIT ATTACHED WORDS (camelCase).
        # BUG FIX: the previous ' '.join(re.findall('[A-Z][^A-Z]*', text))
        # silently discarded all text before the first capital letter (an
        # all-lowercase tweet became the empty string) and exploded acronyms
        # into single letters.  Inserting a space at each lower/digit->upper
        # boundary splits attached words while preserving every character.
        text = re.sub(r'([a-z0-9])([A-Z])', r'\1 \2', text)

        # SPLIT UNDERSCORE WORDS
        text = text.replace('_', ' ')

        # REMOVE PUNCTUATION
        text = preprocess.remove_punct(text)

        # Remove numbers
        text = re.sub(r'\d', '', text)

        # REMOVE WORDS LESS THAN 3 CHARACTERS
        text = re.sub(r'\b\w{1,2}\b', '', text)

        # NORMALIZE WHITESPACE
        text = preprocess.normalize_whitespace(text)

        return text
def load_stopwords():
    """Load stopwords from every ``stopwords/*.txt`` file plus the built-in
    STOPWORDS collection.

    Returns
    -------
    list[str]
        Cleaned (unicode-normalized, punctuation-stripped, non-empty)
        stopwords from the text files, followed by the built-in STOPWORDS.
    """
    stopwords = []
    for filename in glob.glob('stopwords/*.txt'):
        # BUG FIX: the file was opened in text mode but each line was then
        # .decode('utf8')-ed, which raises AttributeError on Python 3 where
        # text-mode lines are already str.  Open with an explicit encoding
        # and drop the decode.
        with open(filename, encoding='utf-8') as fileobj:
            for line in fileobj:
                line = preprocess_unicode(line.strip())
                line = preprocess.remove_punct(line)
                if line:
                    stopwords.append(line)

    # STOPWORDS entries may be bytes (legacy) or str; normalize to str
    # instead of unconditionally calling .decode (which str lacks).
    extra = [
        word.decode('utf8') if isinstance(word, bytes) else word
        for word in STOPWORDS
    ]
    return stopwords + extra
예제 #4
0
 def clean_text(self, raw_text):
     """Clean raw text: strip HTML tags, lowercase, blank out URLs, emails,
     phone numbers, plain numbers and currency symbols, transliterate
     non-ASCII characters, and finally strip punctuation.

     BUG FIX: punctuation removal previously ran *before* the URL / email /
     phone-number replacements.  Those detectors rely on punctuation
     ("://", "@", "."), so stripping it first turned the replacements into
     no-ops.  Entity replacement now runs while punctuation is intact, and
     remove_punct runs last.

     Parameters
     ----------
     raw_text : str
         Raw (possibly HTML) input text.

     Returns
     -------
     str
         The cleaned text.
     """
     raw_text = self.strip_tags(raw_text)
     raw_text = raw_text.lower()
     # Replace entities while their punctuation is still present.
     raw_text = preprocess.replace_urls(raw_text, replace_with='')
     raw_text = preprocess.replace_emails(raw_text, replace_with='')
     raw_text = preprocess.replace_phone_numbers(raw_text, replace_with='')
     raw_text = preprocess.replace_numbers(raw_text, replace_with='')
     raw_text = preprocess.replace_currency_symbols(raw_text,
                                                    replace_with='')
     raw_text = preprocess.transliterate_unicode(raw_text)
     raw_text = preprocess.remove_punct(raw_text)
     return raw_text
예제 #5
0
def test_remove_punct_marks():
    """With an explicit `marks` argument, only those characters are replaced."""
    raw = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
    expected = "I can t. No, I won t! It s a matter of  principle ; of   what s the word?   conscience."
    result = preprocess.remove_punct(raw, marks="-'\"")
    assert result == expected
예제 #6
0
def test_remove_punct():
    """Default remove_punct replaces every punctuation mark with a space."""
    raw = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
    expected = "I can t  No  I won t  It s a matter of  principle   of    what s the word     conscience "
    result = preprocess.remove_punct(raw)
    assert result == expected
예제 #7
0
 def test_remove_punct(self):
     """remove_punct deletes punctuation outright (no replacement spaces)."""
     raw = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
     expected = "I cant No I wont Its a matter of principle of  whats the word  conscience"
     result = preprocess.remove_punct(raw)
     self.assertEqual(result, expected)