def process(self, text, doc_id):
    '''
    Calls the TextProcessor class for text processing and tokenization
    of documents. Each URL is assigned a unique id number; the id is
    incremented at the end for each new URL. Returns the updated index.
    '''
    processor = TextProcessor(text, self.index, doc_id)
    processor.process_text()
    return processor.get_index()
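A minimal sketch of how process() might be driven, assuming it lives on an indexer class whose self.index is an inverted index; the Indexer name, the add_page method, and the index shape are hypothetical, not part of the original code:

class Indexer:
    def __init__(self):
        self.index = {}   # assumed shape: token -> set of doc ids
        self.doc_id = 0   # unique id per URL

    # process(self, text, doc_id) defined as above

    def add_page(self, url, text):
        # Hypothetical driver: index the page, then bump the id for the
        # next URL ("id is increased at the end for new url").
        self.index = self.process(text, self.doc_id)
        self.doc_id += 1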
def run(self):
    logger.info('Creating text processor')
    text_processor = TextProcessor()
    for file in self.input().keys():
        logger.info('Reading %s file: "%s"', file, self.input()[file].path)
        df = pd.read_csv(self.input()[file].path)
        logger.info('It has %s lines', df.shape[0])
        logger.info('Start processing %s...', file)
        # Tokenize each name (Russian), then join the tokens back into a string
        df.name = df.name.map(lambda x: text_processor.process_text(x, lang='ru'))
        df.name = df.name.map(lambda x: ' '.join(x))
        logger.info('Processing of %s succeeded, writing it to "%s"', file, self.output()[file].path)
        df.to_csv(self.output()[file].path)
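The self.input()/self.output() dictionary convention suggests this run() belongs to a Luigi task. For context, a minimal sketch of such a task; the ProcessNames name, the upstream FetchTrain/FetchTest tasks, and the output paths are assumptions:

import logging

import luigi
import pandas as pd

logger = logging.getLogger(__name__)

class ProcessNames(luigi.Task):  # hypothetical task name
    def requires(self):
        # Hypothetical upstream tasks, keyed the same way self.input()
        # is indexed in run() above.
        return {'train': FetchTrain(), 'test': FetchTest()}

    def output(self):
        return {'train': luigi.LocalTarget('data/train_processed.csv'),
                'test': luigi.LocalTarget('data/test_processed.csv')}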
import pprint

# NaiveBayes, find, and nlp are assumed to be imported from the project's modules.

if __name__ == "__main__":
    pp = pprint.PrettyPrinter(indent=4, depth=2)

    # Initialize classifier
    classifier = NaiveBayes()

    # Train
    for f in find("data/1/training"):
        f = f.strip()
        if not f.endswith(".txt"):
            continue
        with open(f) as doc:
            text = doc.read()
            sentences = nlp.process_text(text)
            label = "movie" if "movie" in f else "play"
            classifier.train(sentences, label=label)

    # Test
    for f in find("data/1/testing"):
        f = f.strip()
        if not f.endswith(".txt"):
            continue
        with open(f) as doc:
            text = doc.read()
            sentences = nlp.process_text(text)
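The find() helper used above is not shown; a plausible stand-in that walks a directory tree and yields file paths (the recursive walk is an assumption, and the caller above does its own ".txt" filtering):

import os

def find(root):
    # Yield the path of every file under `root`, recursively.
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            yield os.path.join(dirpath, name)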