import logging
import multiprocessing
import os
import pickle

import nltk

# Note: page_generator, first_pass_worker, first_pass_writer, and the
# constants PUNKT_FNAME, MAX_QUEUE_ITEMS, NUMBER_OF_PROCESSES, and
# CHUNK_SIZE are defined elsewhere in this script.


def create_punkt_sent_detector(fname, progress_count, max_pages=25000):
    """Makes a pass through the corpus to train a Punkt sentence segmenter."""
    logger = logging.getLogger('create_punkt_sent_detector')
    punkt = nltk.tokenize.punkt.PunktTrainer()
    logger.info("Training punkt sentence detector")
    wiki_size = os.path.getsize(fname)
    page_count = 0
    try:
        with open(fname, mode='rb') as wiki_dump:
            pages = page_generator(wiki_dump)
            for page in pages:
                page.preprocess()
                punkt.train(page.text, finalize=False, verbose=False)
                page_count += 1
                if page_count == max_pages:
                    break
                if page_count % progress_count == 0:
                    print(page_count, page.start,
                          (page.start / wiki_size * 100),
                          # taskq.qsize() if taskq is not None else 'n/a',
                          # doneq.qsize() if doneq is not None else 'n/a',
                          page.ID, page.title)
    except KeyboardInterrupt:
        print('KeyboardInterrupt: Stopping the reading of the dump early!')
    logger.info('Now finalizing Punkt training.')
    punkt.finalize_training(verbose=True)
    learned = punkt.get_params()
    sbd = nltk.tokenize.punkt.PunktSentenceTokenizer(learned)
    with open(PUNKT_FNAME, mode='wb') as f:
        pickle.dump(sbd, f, protocol=pickle.HIGHEST_PROTOCOL)
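
# Usage sketch (not part of the original script): after training, later runs
# can load the pickled tokenizer instead of re-reading the dump. The helper
# below and the dump filename in the example are hypothetical; PUNKT_FNAME is
# the module-level constant used above.
def load_punkt_sent_detector():
    """Load the previously pickled Punkt tokenizer (hypothetical helper)."""
    with open(PUNKT_FNAME, mode='rb') as f:
        return pickle.load(f)

# Example:
#     create_punkt_sent_detector('enwiki-latest-pages-articles.xml', 1000)
#     sbd = load_punkt_sent_detector()
#     sbd.tokenize('Dr. No is a 1962 film. It stars Sean Connery.')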
def first_pass(fname, progress_count=None, max_pages=None):
    """Extract a Dictionary and create a plain-text version of the corpus."""
    logger = logging.getLogger('first_pass')
    wiki_size = os.path.getsize(fname)

    # Page task queues for parallel processing
    taskq = multiprocessing.Queue(MAX_QUEUE_ITEMS)
    doneq = multiprocessing.Queue(MAX_QUEUE_ITEMS)

    # Start worker processes
    logger.info('Starting workers')
    workers = []
    for i in range(NUMBER_OF_PROCESSES):
        p = multiprocessing.Process(target=first_pass_worker,
                                    args=(taskq, doneq))
        p.start()
        workers.append(p)

    # Start the writer process that drains doneq
    p = multiprocessing.Process(target=first_pass_writer,
                                args=(doneq, fname))
    p.start()
    workers.append(p)

    # Process the XML dump, batching pages into fixed-size chunks
    logger.info('Beginning XML parse')
    page_count = 0
    task_buff = []
    try:
        with open(fname, mode='rb') as wiki_dump:
            pages = page_generator(wiki_dump)
            for page in pages:
                task_buff.append(page)
                if len(task_buff) == CHUNK_SIZE:
                    taskq.put(task_buff)
                    task_buff = []
                page_count += 1
                if page_count == max_pages:
                    break
                if progress_count and page_count % progress_count == 0:
                    print(page_count, page.start,
                          (page.start / wiki_size * 100),
                          taskq.qsize(), doneq.qsize(),
                          page.ID, page.title)
    except KeyboardInterrupt:
        print('KeyboardInterrupt: Stopping the reading of the dump early!')
    finally:
        # Flush the partial task buffer
        taskq.put(task_buff)
        task_buff = []
        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            taskq.put(None)
    logger.info('All done! Processed %s total pages.', page_count)
    # Wait for all child processes to stop (especially that writer!)
    for p in workers:
        p.join()
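
# Sketch of the consumer side of the queue protocol (hypothetical; the real
# first_pass_worker and first_pass_writer are defined elsewhere in this
# script). Workers pull CHUNK_SIZE lists of pages from taskq until they see
# the None sentinel that first_pass enqueues once per process, then exit;
# results go to doneq for the single writer process to serialize.
def example_first_pass_worker(taskq, doneq):
    while True:
        chunk = taskq.get()
        if chunk is None:  # sentinel from first_pass: no more work
            break
        for page in chunk:
            page.preprocess()  # same per-page step used during Punkt training
            doneq.put(page)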