Exemplo n.º 1
0
 def process(self, text, doc_id):
     """Tokenize one document and fold it into the shared index.

     Each url is assigned a unique document id by the caller; this method
     hands the raw text, the current index, and that id to TextProcessor,
     runs the tokenization pass, and returns the updated index.

     Args:
         text: raw document text to process.
         doc_id: unique id number designated for this url's document.

     Returns:
         The index updated with this document's tokens.
     """
     text_processor = TextProcessor(text, self.index, doc_id)
     text_processor.process_text()
     return text_processor.get_index()
Exemplo n.º 2
0
    def run(self):
        """Normalize the `name` column of every input CSV and write it out.

        For each input target: read the CSV, run Russian-language text
        processing over the `name` column, join the resulting tokens back
        into a single space-separated string, and write the frame to the
        matching output target.
        """
        logger.info('Creating text processor')
        processor = TextProcessor()

        for key, target in self.input().items():
            logger.info('Reading %s file: "%s"', key, target.path)
            frame = pd.read_csv(target.path)

            logger.info('Its %s lines', frame.shape[0])
            logger.info('Start processing %s...', key)

            # Tokenize and re-join in one pass over the column.
            frame.name = frame.name.map(
                lambda raw: ' '.join(processor.process_text(raw, lang='ru'))
            )

            logger.info('Processing of %s succeed, writing it to "%s"', key, self.output()[key].path)

            frame.to_csv(self.output()[key].path)
Exemplo n.º 3
0
if __name__ == "__main__":
    # Pretty-printer for inspecting nested classifier state while debugging.
    pp = pprint.PrettyPrinter(indent=4, depth=2)

    # Initialize classifier
    classifier = NaiveBayes()

    # Train: read every .txt file under the training directory, split it
    # into sentences with the nlp helper, and feed the sentences to the
    # classifier with a label inferred from the file path.
    for f in find("data/1/training"):
        f = f.strip()
        if not f.endswith(".txt"):
            continue

        with open(f) as doc:
            text = doc.read()

        sentences = nlp.process_text(text)

        # Label comes from the path: anything containing "movie" is
        # labeled "movie", everything else "play".
        label = "movie" if "movie" in f else "play"

        classifier.train(sentences, label=label)

    # Test: identical preprocessing over the testing directory.
    # NOTE(review): this loop continues beyond the visible excerpt —
    # presumably classifying `sentences`; confirm against the full file.
    for f in find("data/1/testing"):
        f = f.strip()
        if not f.endswith(".txt"):
            continue

        with open(f) as doc:
            text = doc.read()

        sentences = nlp.process_text(text)