Example #1
import re

# Relies on a module-level spaCy tokenizer (set up before calling; see the
# usage sketch after this example)
def preprocess_text(text, remove_stopwords=True):
    global tokenizer
    text = text.lower().strip()
    tokens = tokenizer(text)
    text = [x.text for x in tokens]
    
    # Optionally, remove stop words using a small custom list
    if remove_stopwords:
        # stops = set(stopwords.words("english"))  # NLTK alternative
        own_stopword = ['the', 'on', 'a', 'an', 'it', 'be', 'has', 'some', 'my', 'me', 'i']
        stops = set(own_stopword)
        text = [w for w in text if w not in stops]
    
    text = " ".join(text)
    
    if text[-1] in ['.','?']:
        text = text[:-1] + ' ' + text[-1]
        
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,:!.\/'+=@-]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r" n ", " and ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    # text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    # text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    # text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    # text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = re.sub(' +', ' ', text)
    
    return text
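A minimal usage sketch for the function above, assuming the module-level tokenizer it relies on is a blank English spaCy tokenizer (that setup and the sample sentence are assumptions, not part of the original snippet):

import spacy

tokenizer = spacy.blank("en").tokenizer  # assumed setup for the global used by preprocess_text

print(preprocess_text("My dog has 2k toys."))
# roughly: 'dog 2000 toys .'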
Example #2
def _spacy_tokenize(x, spacy):
    # 'spacy' is a loaded spaCy Language object passed in by the caller
    return [tok.text for tok in spacy.tokenizer(x)]
Example #3
def tokenize_text(text):
    # Assumes 'spacy' is a module-level variable holding a loaded Language object
    return [token.text for token in spacy.tokenizer(text)]
Example #4
def spacy_tokenize(x):
    # Drop punctuation and whitespace tokens; assumes a module-level
    # 'spacy' variable bound to a loaded Language object
    return [
        tok.text
        for tok in spacy.tokenizer(x)
        if not (tok.is_punct or tok.is_space)
    ]
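A short usage sketch, assuming the module-level name `spacy` is bound to a blank English pipeline (that setup and the sample input are assumptions, not part of the example):

import spacy as spacy_lib

spacy = spacy_lib.blank("en")  # rebinds the name that spacy_tokenize reads

print(spacy_tokenize("Hello, world!"))
# roughly: ['Hello', 'world'] — the comma and '!' are filtered out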
Example #5
def spacyTokenize(spacy, lines):
    # Tokenize each sentence and re-join the tokens with single spaces
    lines_new = []
    for sent in lines:
        sent_new = ' '.join([tok.text for tok in spacy.tokenizer(sent)])
        lines_new.append(sent_new)
    return lines_new
Example #6
File: utils.py Project: melissayu01/CS287r
def tokenize(text, spacy):
    # 'spacy' is a loaded spaCy Language object supplied by the caller
    return [tok.text for tok in spacy.tokenizer(text)]
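A minimal call sketch, assuming a blank English pipeline is passed in (the model choice and sample sentence are assumptions):

import spacy

nlp = spacy.blank("en")
print(tokenize("Attention is all you need.", nlp))
# roughly: ['Attention', 'is', 'all', 'you', 'need', '.']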