import logging
import multiprocessing
import os

import gensim

# Note: plain_page_generator, first_pass, second_pass_worker,
# second_pass_writer, create_punkt_sent_detector, sent_detector,
# IndexLoadError, CHUNK_SIZE, NUMBER_OF_PROCESSES, and MAX_QUEUE_ITEMS are
# module-level names defined elsewhere in this module.


def check_plain_corpus(base_fname):
    """Attempts to make sure the plain-text corpus is available."""
    try:
        with open(base_fname + '.txt') as wiki_dump:
            pages = plain_page_generator(wiki_dump)
            if not next(pages):
                raise IndexLoadError
    except (IOError, StopIteration):
        # Missing file or an empty corpus both mean the plain-text
        # version must be regenerated.
        raise IndexLoadError
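# The record type yielded by plain_page_generator is defined elsewhere; the
# hypothetical stand-in below (note the _Sketch prefix) only documents the
# minimal interface that check_plain_corpus and create_index rely on. This
# is an assumption drawn from the attributes actually read in this module,
# not the real definition.

import collections

# `start` is assumed to be the page's byte offset in the plain-text corpus
# file, which is what lets find_page() seek straight to a page later on.
_SketchPage = collections.namedtuple('_SketchPage', ['ID', 'title', 'start'])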
def create_index(fname, progress_count=None, max_pages=None):
    """Processes a corpus to create a corresponding Index object."""
    logger = logging.getLogger('create_index')

    if sent_detector is None:
        # Train the sentence detector on at most 25,000 pages.
        create_punkt_sent_detector(fname=fname,
                                   progress_count=CHUNK_SIZE,
                                   max_pages=(25000 if max_pages is None
                                              else min(25000, max_pages)))

    # Set params
    if progress_count is None:
        progress_count = CHUNK_SIZE * NUMBER_OF_PROCESSES

    # First pass: create the Dictionary and plain-text version of the corpus,
    # unless both already exist on disk.
    try:
        dictionary = gensim.corpora.dictionary.Dictionary.load_from_text(
            fname + '.dict')
        if not dictionary:
            raise IndexLoadError
        check_plain_corpus(fname)  # Raises IndexLoadError if missing/empty.
    except (IOError, IndexLoadError):
        first_pass(fname, progress_count, max_pages)
    else:
        del dictionary

    # Page task queues for parallel processing
    taskq = multiprocessing.Queue(MAX_QUEUE_ITEMS)
    doneq = multiprocessing.Queue(MAX_QUEUE_ITEMS)

    # Start worker processes
    logger.info('Starting workers')
    workers = []
    for i in range(NUMBER_OF_PROCESSES):
        p = multiprocessing.Process(target=second_pass_worker,
                                    args=(taskq, doneq))
        p.start()
        workers.append(p)

    # Start log writer process
    p = multiprocessing.Process(target=second_pass_writer,
                                args=(doneq, fname))
    p.start()
    workers.append(p)

    # We are now working with the plain-text corpus generated in the 1st pass.
    fname += '.txt'
    wiki_size = os.path.getsize(fname)

    # Process the plain-text dump
    logger.info('Beginning plain-text parse')
    page_count = 0
    task_buff = []
    try:
        with open(fname, mode='rb') as wiki_dump:
            pages = plain_page_generator(wiki_dump)
            for page in pages:
                task_buff.append(page)
                if len(task_buff) == CHUNK_SIZE:
                    taskq.put(task_buff)
                    task_buff = []
                page_count += 1
                if page_count == max_pages:
                    break
                if page_count % progress_count == 0:
                    print(page_count, page.start,
                          page.start / wiki_size * 100,
                          taskq.qsize(), doneq.qsize(),
                          page.ID, page.title)
    except KeyboardInterrupt:
        print('KeyboardInterrupt: Stopping the reading of the dump early!')
    finally:
        # Flush task buffer
        taskq.put(task_buff)
        task_buff = []
        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            taskq.put(None)

    logger.info('All done! Processed %s total pages.', page_count)

    # Wait for all child processes to stop (especially that writer!)
    for p in workers:
        p.join()
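# The real second_pass_worker and second_pass_writer are defined elsewhere in
# this module. The hypothetical helpers below (note the _sketch_ prefix) only
# illustrate the queue protocol create_index relies on: workers pull
# CHUNK_SIZE-sized lists of pages from taskq until they receive the None
# sentinel, and the writer drains doneq until every worker has reported in.
# The per-page work and the output path are stand-ins, not the real ones.


def _sketch_second_pass_worker(taskq, doneq):
    """Illustrative worker loop; the tuple put on doneq stands in for the
    real per-page result."""
    while True:
        chunk = taskq.get()
        if chunk is None:        # Sentinel from create_index: no more work.
            doneq.put(None)      # Propagate it so the writer can count us out.
            break
        for page in chunk:
            doneq.put((page.ID, page.title))  # Real processing would go here.


def _sketch_second_pass_writer(doneq, fname):
    """Illustrative writer loop: runs until all workers have signalled."""
    finished = 0
    with open(fname + '.index.sketch', 'w') as out:  # Hypothetical path.
        while finished < NUMBER_OF_PROCESSES:
            item = doneq.get()
            if item is None:
                finished += 1
            else:
                out.write('%s\t%s\n' % item)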
def find_page(wiki_dump, start):
    """Returns the page beginning at byte offset `start` of the open
    plain-text corpus file `wiki_dump`."""
    wiki_dump.seek(start)
    pages = plain_page_generator(wiki_dump)
    return next(pages)
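# Example usage (an assumed invocation; the base file name and page counts
# are illustrative, not from this module):
#
#     create_index('enwiki-latest', progress_count=10000, max_pages=100000)
#     with open('enwiki-latest.txt', mode='rb') as wiki_dump:
#         page = find_page(wiki_dump, 0)   # First page of the plain corpus.
#         print(page.ID, page.title)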