def clean_text(self, minWordLen):
    """Normalise every entry of ``self.inputContent`` in place.

    Each entry is treated as a tab-separated record. The columns before
    ``self.columnStart`` are kept untouched; the remaining columns are
    joined with single spaces and passed, in order, through
    ``Filters.filter_url``, ``Filters.filter_accents`` (after a UTF-8
    decode that ignores errors), ``Filters.filter_punct`` and
    ``Filters.filter_charRepetition``, then split on whitespace.

    A token is kept only if it contains no ``'@'``, is not made up solely
    of digits, and is strictly longer than ``minWordLen``. Kept tokens
    whose first character is upper-case are written out unchanged; every
    other kept token is replaced by
    ``dictionary.getWord(token, False, False)``. ``self.histogram`` is
    incremented for each kept token, keyed by the token as it appeared
    before replacement.

    The entry is finally overwritten with the untouched columns, a tab,
    and the space-separated output tokens, with surrounding whitespace
    stripped.

    :param minWordLen: tokens of this length or shorter are discarded.
    :returns: None; ``self.inputContent`` and ``self.histogram`` are
        updated in place.
    """
    dictionary = Dictionary.Dictionary2()

    for index, record in enumerate(self.inputContent):
        columns = record.rstrip('\n').split('\t')
        leading_columns = columns[:self.columnStart]

        # Filtering pipeline over the free-text part of the record.
        # NOTE(review): calling .decode() on the text implies byte strings
        # (Python 2 ``str``) -- confirm before running under Python 3.
        body = " ".join(columns[self.columnStart:]).strip()
        body = Filters.filter_url(body)
        body = Filters.filter_accents(body.decode('utf8', 'ignore'))
        body = Filters.filter_punct(body)
        tokens = Filters.filter_charRepetition(body).split()

        kept = [
            token for token in tokens
            if '@' not in token
            and not token.isdigit()
            and len(token) > minWordLen
        ]

        output_tokens = []
        for token in kept:
            # Tokens starting with an upper-case character bypass the
            # dictionary lookup and are emitted verbatim.
            replacement = (
                token if token[0].isupper()
                else dictionary.getWord(token, False, False)
            )

            # The histogram counts the original token, not its replacement.
            if token not in self.histogram:
                self.histogram[token] = 0
            self.histogram[token] += 1

            output_tokens.append(replacement)

        rebuilt = "\t".join(leading_columns) + "\t" + " ".join(output_tokens)
        # strip() also removes the separator tab when no token survived.
        self.inputContent[index] = rebuilt.strip()