# Example #1
# 0
import time
import re
from os import listdir, getcwd, path

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from data_loader import load_filenames, filter_feats
from db_manager import DBManager
from config import DIR_FILES, DIR_CLEANED_FILES
from file_utils import create_dir


# Module-level setup: the statements below execute once, when this module is
# first imported.
# NOTE(review): create() presumably initialises the database/schema managed by
# DBManager -- confirm against db_manager before relying on this.
manager = DBManager()
manager.create()
# Single shared Snowball stemmer instance, configured for English.
stemmer = SnowballStemmer("english")
# NOTE(review): presumably toggles progress/debug output -- confirm against
# the places where VERBOSE is read.
VERBOSE = True


def clean_word(w):
    """
    Delete all the redundant information encoded in a word.
    :param w: word to be cleaned
    :return: cleaned word
    """
    if w in [".", "?", "!"]: # will be needed for ngrams to find out where a sentence finishes.
        return w
    w = w.lower()
    w = re.sub('[^A-Za-z]+', '', w)
    if w in stopwords.words('english'):