def _cleanUp(self, line):
    """Normalize *line* and strip stopword tokens, digits and punctuation.

    Steps, in order:
      1. Pass the text through ``factory.EngLowercase().lowercase``
         (presumably case-folding; the helper is defined outside this
         block -- confirm).
      2. Split on whitespace and drop every token that is an English
         stopword or is exactly one ``string.punctuation`` character,
         then rejoin the survivors with single spaces.
      3. If ``self.remove_digit`` is truthy, delete every digit character.
      4. If ``self.remove_punctuation`` is truthy, run the text through
         ``str.translate`` with ``self.table`` (presumably a table that
         deletes punctuation -- verify where ``self.table`` is built).

    Args:
        line: The raw text to clean.

    Returns:
        The cleaned text.
    """
    lowered = factory.EngLowercase().lowercase(line)

    # Tokens are compared whole, so punctuation glued to a word
    # (e.g. "word,") is not removed by this filter.
    excluded = set(stopwords.words('english')).union(string.punctuation)
    kept = [token for token in lowered.split() if token not in excluded]
    result = " ".join(kept)

    if self.remove_digit:
        # Character-level removal: an all-digit token disappears but the
        # spaces that surrounded it remain.
        result = ''.join(ch for ch in result if not ch.isdigit())

    if self.remove_punctuation:
        result = result.translate(self.table)

    return result
def clean_up(line):
    """Normalize *line* and drop stopword and lone-punctuation tokens.

    The text is passed through ``factory.EngLowercase().lowercase``
    (presumably case-folding; the helper is defined outside this block --
    confirm), split on whitespace, filtered, and rejoined with single
    spaces.

    Args:
        line: The raw text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    normalizer = factory.EngLowercase()
    text = normalizer.lowercase(line)

    # Membership is tested per whole token, so only tokens that are exactly
    # a stopword or a single punctuation character are discarded.
    blocked = set(stopwords.words('english'))
    blocked.update(string.punctuation)

    return " ".join(word for word in text.split() if word not in blocked)