Example #1
import re

def preprocess_data(line, token_pattern=token_pattern, encode_digit=False):
    # re.LOCALE is not valid for str patterns in Python 3; re.UNICODE suffices
    token_pattern = re.compile(token_pattern, flags=re.UNICODE)
    # tokenize
    tokens = [x.lower() for x in token_pattern.findall(line)]
    # stem
    tokens_stemmed = stem_tokens(tokens, english_stemmer)
    return tokens_stemmed
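Examples #1, #2, and #4 rely on module-level names (token_pattern, stem_tokens, english_stemmer, stopwords) defined elsewhere in their source projects. As a point of reference, here is a minimal sketch of plausible definitions, assuming NLTK; the exact regex and the choice of stemmer are assumptions, not taken from the original modules:

import nltk

# Illustrative assumptions; the source projects may define these differently.
token_pattern = r"(?u)\b\w\w+\b"  # match words of two or more characters
english_stemmer = nltk.stem.SnowballStemmer("english")
stopwords = set(nltk.corpus.stopwords.words("english"))  # needs nltk.download("stopwords")

def stem_tokens(tokens, stemmer):
    # apply the stemmer to every token in the list
    return [stemmer.stem(token) for token in tokens]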
Example #2
import nlp_utils  # project-local module providing the stemmer and stop words

def preprocess_data(line):
    # tokenize with the module-level compiled token_pattern
    tokens = token_pattern.findall(line)
    # stem
    tokens_stemmed = nlp_utils.stem_tokens(tokens, nlp_utils.english_stemmer)
    # remove stop words
    tokens_stemmed = [x for x in tokens_stemmed if x not in nlp_utils.stopwords]
    return tokens_stemmed
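Assuming the NLTK-based helpers sketched above, a hypothetical call illustrates the full pipeline; the exact output depends on the real token_pattern and stemmer:

# Illustrative only: tokenize, stem, then drop stop words.
print(preprocess_data("the dogs are running quickly"))
# e.g. ['dog', 'run', 'quick']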
Example #3
def imojify_input(line, src_lang="en"):
    line = line.lower()
    # split the input into sentences
    sents = nlp_utils.tokenize(line)
    imojified = []
    # stem each sentence's tokens, then hand them to imojify_sentence
    for s in sents:
        imojified.append(imojify_sentence(nlp_utils.stem_tokens(s, src_lang),
                                          src_lang))
    return imojified
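imojify_sentence and nlp_utils are project-local and not shown on this page. A self-contained sketch of the same per-sentence pattern, with hypothetical stand-ins for both:

# All names below are hypothetical stand-ins for the project-local helpers.
def tokenize_sentences(text):
    # naive splitter; the real nlp_utils.tokenize is likely more robust
    return [s.strip() for s in text.split(".") if s.strip()]

def emojify_sentence(sentence, src_lang):
    # stand-in for imojify_sentence: pass the text through unchanged
    return sentence

def imojify_input_sketch(line, src_lang="en"):
    # lower-case, split into sentences, process each one
    return [emojify_sentence(s, src_lang) for s in tokenize_sentences(line.lower())]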
Example #4
def preprocess_data(line,
                    token_pattern=token_pattern,
                    exclude_stopword=config.cooccurrence_word_exclude_stopword,
                    encode_digit=False):
    # re.LOCALE is not valid for str patterns in Python 3; re.UNICODE suffices
    token_pattern = re.compile(token_pattern, flags=re.UNICODE)
    # tokenize
    tokens = [x.lower() for x in token_pattern.findall(line)]
    # stem
    tokens_stemmed = stem_tokens(tokens, english_stemmer)
    # optionally drop stop words
    if exclude_stopword:
        tokens_stemmed = [x for x in tokens_stemmed if x not in stopwords]
    return tokens_stemmed
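A hypothetical call showing the effect of the exclude_stopword switch, again assuming the helpers sketched under Example #1 (output is illustrative):

print(preprocess_data("the dogs are running", exclude_stopword=True))
# e.g. ['dog', 'run']
print(preprocess_data("the dogs are running", exclude_stopword=False))
# e.g. ['the', 'dog', 'are', 'run']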