import time import re from os import listdir, getcwd, path from nltk.corpus import stopwords from nltk.stem.snowball import SnowballStemmer from data_loader import load_filenames, filter_feats from db_manager import DBManager from config import DIR_FILES, DIR_CLEANED_FILES from file_utils import create_dir manager = DBManager() manager.create() stemmer = SnowballStemmer("english") VERBOSE = True def clean_word(w): """ Delete all the redundant information encoded in a word. :param w: word to be cleaned :return: cleaned word """ if w in [".", "?", "!"]: # will be needed for ngrams to find out where a sentence finishes. return w w = w.lower() w = re.sub('[^A-Za-z]+', '', w) if w in stopwords.words('english'):