def clean_text(self, text):
    """
    Normalize and tokenize a string of text.

    Lowercases the text, splits it on whitespace, and runs each term
    through the processor pipeline built by ``self.construct_pipeline``.
    Terms the pipeline rejects (falsy results) are dropped.

    :param text: a string of text to be cleaned.
    :return: a list of cleaned terms; an empty list if ``text`` is falsy.
    """
    if not text:
        # Fix: the original returned '' here, contradicting the documented
        # list return type; return an empty list so callers always get a list.
        return []
    cleaner_pipeline = self.construct_pipeline(TermPipeline())
    cleaned = []
    for term in text.lower().split():
        clean_result = cleaner_pipeline.process(term)
        if clean_result:
            cleaned.append(clean_result)
    return cleaned
def setUp(self):
    """Build the individual term processors and assemble the pipeline under test."""
    self.logger = logging.getLogger("TestTermPipeline")
    self.ltp = LengthTermProcessor()
    self.tp = TermProcessor()
    self.stp = StopwordTermProcessor(stopwordfile='stopwords_test.txt')
    self.ptp = PunctuationTermProcessor()
    self.atp = AlphaTermProcessor()
    self.sctp = SpecialCharProcessor()
    self.pipeline = TermPipeline()
    # Processors are registered in this fixed order: special-char, term,
    # length, punctuation, stopword, then alpha.
    for processor in (self.sctp, self.tp, self.ltp,
                      self.ptp, self.stp, self.atp):
        self.pipeline.add_processor(processor)