Example #1
def check_plain_corpus(base_fname):
    """Attempts to make sure the plain-text corpus is available."""
    try:
        with open(base_fname + '.txt') as wiki_dump:
            pages = plain_page_generator(wiki_dump)
            if not next(pages):
                raise IndexLoadError
    except IOError:
        raise IndexLoadError
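
check_plain_corpus is useful only for its side effect: it raises IndexLoadError when the plain-text corpus is missing or yields no pages. A caller would typically wrap it in try/except to decide whether the corpus needs to be rebuilt. The sketch below is only illustrative and assumes, as in create_index further down, that first_pass can be called positionally to regenerate the .txt file.

def ensure_plain_corpus(base_fname):
    """Illustrative sketch: rebuild the plain-text corpus only when unusable."""
    try:
        check_plain_corpus(base_fname)
    except IndexLoadError:
        # The .txt corpus is absent or empty; regenerate it (positional call
        # signature assumed to match the one used in create_index).
        first_pass(base_fname, CHUNK_SIZE, None)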
Example #2
def create_index(fname, progress_count=None, max_pages=None):
    """Processes a corpus to create a corresponding Index object."""
    logger = logging.getLogger('create_index')

    if sent_detector is None:
        # Cap the sentence-detector training run at 25,000 pages
        # (max_pages may be None, meaning "no page limit" for create_index).
        create_punkt_sent_detector(fname=fname,
                                   progress_count=CHUNK_SIZE,
                                   max_pages=min(25000, max_pages or 25000))

    # Set params
    if progress_count is None:
        progress_count = CHUNK_SIZE * NUMBER_OF_PROCESSES

    # First pass, create Dictionary and plain-text version of corpus.
    try:
        dictionary = (gensim.corpora.dictionary.Dictionary().
                      load_from_text(fname + '.dict'))
        # check_plain_corpus raises IndexLoadError itself when the .txt corpus
        # is missing or empty; its return value is always None.
        if not dictionary or check_plain_corpus(fname):
            raise IndexLoadError
    except (IOError, IndexLoadError):
        first_pass(fname, progress_count, max_pages)
    else:
        del dictionary

    # Page task queues for parallel processing
    taskq = multiprocessing.Queue(MAX_QUEUE_ITEMS)
    doneq = multiprocessing.Queue(MAX_QUEUE_ITEMS)

    # Start worker processes
    logger.info('Starting workers')
    workers = []
    for i in range(NUMBER_OF_PROCESSES):
        p = multiprocessing.Process(target=second_pass_worker,
                                    args=(taskq, doneq))
        p.start()
        workers.append(p)

    # Start log writer process
    p = multiprocessing.Process(target=second_pass_writer, args=(doneq, fname))
    p.start()
    workers.append(p)

    # We are now working with the plain-text corpus generated in the 1st pass.
    fname += '.txt'

    wiki_size = os.path.getsize(fname)

    # Process the plain-text dump produced by the first pass
    logger.info('Beginning plain-text parse')

    page_count = 0

    task_buff = []
    try:
        with open(fname, mode='rb') as wiki_dump:
            pages = plain_page_generator(wiki_dump)
            for page in pages:
                task_buff.append(page)
                if len(task_buff) == CHUNK_SIZE:
                    taskq.put(task_buff)
                    task_buff = []
                page_count += 1
                if page_count == max_pages:
                    break
                if page_count % progress_count == 0:
                    print(page_count, page.start,
                          (page.start / wiki_size * 100),
                          taskq.qsize(), doneq.qsize(),
                          page.ID, page.title)
    except KeyboardInterrupt:
        print('KeyboardInterrupt: Stopping the reading of the dump early!')
    finally:
        # Flush task buffer
        taskq.put(task_buff)
        task_buff = []
        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            taskq.put(None)

    logger.info('All done! Processed %s total pages.', page_count)

    # Wait for all child processes to stop (especially that writer!)
    for p in workers:
        p.join()
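
create_index drives a simple producer/consumer protocol: the parent puts lists of at most CHUNK_SIZE pages on taskq, workers push their results onto doneq for the writer process, and one None sentinel per worker signals shutdown. The worker itself is not shown in this example; the following is only a rough sketch of a consumer that fits that protocol, with process_page standing in as a hypothetical placeholder for whatever per-page work second_pass_worker actually performs.

def queue_consumer_sketch(taskq, doneq):
    """Hypothetical consumer matching the queue protocol used by create_index."""
    while True:
        chunk = taskq.get()
        if chunk is None:
            # Sentinel from the parent process: no more work is coming.
            break
        # process_page is a placeholder for the real per-page processing.
        doneq.put([process_page(page) for page in chunk])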
Example #3
def find_page(start):
    # wiki_dump is an open plain-text corpus file from the enclosing scope.
    wiki_dump.seek(start)
    pages = plain_page_generator(wiki_dump)
    return next(pages)
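
find_page closes over wiki_dump, an already-open plain-text corpus file, so seeking to a stored byte offset returns a single page without rescanning the whole dump. An equivalent inline usage might look like the following, where saved_offset is a hypothetical page.start value recorded earlier (for example, from the progress output in create_index).

# saved_offset is assumed to be a page.start value saved from an earlier run.
with open(fname + '.txt', mode='rb') as wiki_dump:
    wiki_dump.seek(saved_offset)
    page = next(plain_page_generator(wiki_dump))
    print(page.ID, page.title)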