def process(self, text, doc_id):
    '''
    Calls the TextProcessor class for text processing and tokenization
    of documents. Each URL is assigned a unique id number; the id is
    incremented at the end for each new URL. Returns the updated index.
    '''
    processor = TextProcessor(text, self.index, doc_id)
    processor.process_text()
    return processor.get_index()
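A minimal sketch of how process() might be driven, assuming it lives on an indexer class whose self.index is an inverted index; the Indexer name, the add_page method, and the index shape are hypothetical, not part of the original code:

class Indexer:
    def __init__(self):
        self.index = {}   # assumed shape: token -> set of doc ids
        self.doc_id = 0   # unique id per URL

    # process(self, text, doc_id) defined as above

    def add_page(self, url, text):
        # Hypothetical driver: index the page, then bump the id for the
        # next URL ("id is increased at the end for new url").
        self.index = self.process(text, self.doc_id)
        self.doc_id += 1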
def run(self):
    logger.info('Creating text processor')
    text_processor = TextProcessor()
    for file in self.input().keys():
        logger.info('Reading %s file: "%s"', file, self.input()[file].path)
        df = pd.read_csv(self.input()[file].path)
        logger.info('It has %s lines', df.shape[0])
        logger.info('Start processing %s...', file)
        # Tokenize each name (Russian), then join the tokens back into a string
        df.name = df.name.map(lambda x: text_processor.process_text(x, lang='ru'))
        df.name = df.name.map(lambda x: ' '.join(x))
        logger.info('Processing of %s succeeded, writing it to "%s"', file, self.output()[file].path)
        df.to_csv(self.output()[file].path)
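The self.input()/self.output() dictionary convention suggests this run() belongs to a Luigi task. For context, a minimal sketch of such a task; the ProcessNames name, the upstream FetchTrain/FetchTest tasks, and the output paths are assumptions:

import logging

import luigi
import pandas as pd

logger = logging.getLogger(__name__)

class ProcessNames(luigi.Task):  # hypothetical task name
    def requires(self):
        # Hypothetical upstream tasks, keyed the same way self.input()
        # is indexed in run() above.
        return {'train': FetchTrain(), 'test': FetchTest()}

    def output(self):
        return {'train': luigi.LocalTarget('data/train_processed.csv'),
                'test': luigi.LocalTarget('data/test_processed.csv')}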
import pprint

# NaiveBayes, find, and nlp are assumed to be imported from the project's modules.

if __name__ == "__main__":
    pp = pprint.PrettyPrinter(indent=4, depth=2)

    # Initialize classifier
    classifier = NaiveBayes()

    # Train
    for f in find("data/1/training"):
        f = f.strip()
        if not f.endswith(".txt"):
            continue
        with open(f) as doc:
            text = doc.read()
            sentences = nlp.process_text(text)
            label = "movie" if "movie" in f else "play"
            classifier.train(sentences, label=label)

    # Test
    for f in find("data/1/testing"):
        f = f.strip()
        if not f.endswith(".txt"):
            continue
        with open(f) as doc:
            text = doc.read()
            sentences = nlp.process_text(text)
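The find() helper used above is not shown; a plausible stand-in that walks a directory tree and yields file paths (the recursive walk is an assumption, and the caller above does its own ".txt" filtering):

import os

def find(root):
    # Yield the path of every file under `root`, recursively.
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            yield os.path.join(dirpath, name)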