# Example #1
# Run the pipeline on a sample sentence, then split its tokens by stop-word status.
sentence = nlp("We will go to movie after the dinner")
print(sentence)

# Tokens that are NOT stop words.
notStopWords = [token.text for token in sentence if not token.is_stop]
print(notStopWords)

# Tokens that ARE stop words.
stopWords = [token.text for token in sentence if token.is_stop]
print(stopWords)

#Add & Remove a new Stop Word
import nltk
# NLTK's English stop-word list (a plain list, so append/remove work).
# NOTE(review): requires the 'stopwords' corpus to be downloaded — nltk.download('stopwords').
STOP_WORDS = nltk.corpus.stopwords.words('english')
STOP_WORDS.append('Test')

# Show the list grew by one.
print(len(STOP_WORDS))
print(STOP_WORDS)

import nltk

# Undo the addition made above.
STOP_WORDS.remove('Test')

# Show the list is back to its original size.
print(len(STOP_WORDS))
print(STOP_WORDS)

import spacy
# spaCy's built-in English stop words — a set, unlike NLTK's list;
# note this rebinds the STOP_WORDS name used by the NLTK section above.
from spacy.lang.en.stop_words import STOP_WORDS

# Sets use .add (not .append) to insert a custom stop word.
STOP_WORDS.add("Test")
from bs4 import BeautifulSoup
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
from nltk.stem import WordNetLemmatizer
from textacy.preprocess import preprocess_text, replace_numbers, replace_phone_numbers, replace_urls
from gensim.utils import to_utf8, tokenize
from gensim.models.phrases import Phrases, Phraser

# Convert the spaCy stop-word set to a list and add web-related tokens.
STOP_WORDS = list(STOP_WORDS)
STOP_WORDS.extend(['http', 'www'])

def strip_html(text):
    """Strip any HTML markup from *text*, returning only the visible text."""
    return BeautifulSoup(text, "html.parser").get_text()

def clean_text(text):
    """Normalize raw text for downstream tokenization.

    Replaces newlines and common domain suffixes with spaces, strips HTML,
    expands contractions / lowercases / removes punctuation via textacy,
    then deletes URLs and numbers.
    """
    # BUGFIX: original had an unbalanced ')' here and used '/n' instead of '\n'.
    text = text.replace('\n', ' ').replace('.com', ' ').replace('.org', ' ').replace('.net', ' ')
    text = strip_html(text)
    # Remove contractions, accents, punctuation and currency symbols; lowercase.
    # BUGFIX: original line ended with a stray ", replace_with=' ')" fragment (SyntaxError).
    text = preprocess_text(text, fix_unicode=True, no_accents=True,
                           no_contractions=True, lowercase=True,
                           no_punct=True, no_currency_symbols=True)
    text = replace_urls(text, replace_with='')
    text = replace_numbers(text, replace_with='')
    return text

def tokenize_text(text):
    """Clean *text* and return its tokens as a list."""
    return list(tokenize(clean_text(text)))