Example #1
0
    def clean_text(self, text):
        """Normalize *text* into a list of cleaned terms.

        Lowercases the text, splits it on whitespace, and runs each term
        through the term-processing pipeline built by
        ``self.construct_pipeline``.

        :param text: a string of text to be cleaned (may be empty or None).
        :return: a list of cleaned terms; an empty list when *text* is falsy.
        """
        if not text:
            # Bug fix: this branch previously returned '' (a string), which
            # broke the documented contract of always returning a list.
            return []

        cleaner_pipeline = self.construct_pipeline(TermPipeline())

        cleaned = []
        for term in text.lower().split():
            clean_result = cleaner_pipeline.process(term)
            # A processor may drop a term by returning a falsy value;
            # only keep terms that survive the whole pipeline.
            if clean_result:
                cleaned.append(clean_result)
        return cleaned
Example #2
0
    def setUp(self):
        """Build the test fixture: the individual term processors and a
        TermPipeline populated with them in processing order."""
        self.logger = logging.getLogger("TestTermPipeline")

        self.ltp = LengthTermProcessor()
        self.tp = TermProcessor()
        self.stp = StopwordTermProcessor(stopwordfile='stopwords_test.txt')
        self.ptp = PunctuationTermProcessor()
        self.atp = AlphaTermProcessor()
        self.sctp = SpecialCharProcessor()

        self.pipeline = TermPipeline()
        # Registration order matters: terms flow through processors in
        # the order they are added here.
        for processor in (self.sctp, self.tp, self.ltp,
                          self.ptp, self.stp, self.atp):
            self.pipeline.add_processor(processor)