import logging
import multiprocessing
import os
import pickle

import nltk.tokenize.punkt

# page_generator, first_pass_worker, first_pass_writer, PUNKT_FNAME,
# MAX_QUEUE_ITEMS, NUMBER_OF_PROCESSES and CHUNK_SIZE are defined elsewhere
# in the surrounding module (not shown here).


def create_punkt_sent_detector(fname, progress_count, max_pages=25000):
    """Makes a pass through the corpus to train a Punkt sentence segmenter."""
    logger = logging.getLogger('create_punkt_sent_detector')

    punkt = nltk.tokenize.punkt.PunktTrainer()

    logger.info("Training punkt sentence detector")

    wiki_size = os.path.getsize(fname)
    page_count = 0

    try:
        with open(fname, mode='rb') as wiki_dump:
            pages = page_generator(wiki_dump)
            for page in pages:
                page.preprocess()
                punkt.train(page.text, finalize=False, verbose=False)
                page_count += 1
                if page_count == max_pages:
                    break
                if page_count % progress_count == 0:
                    print(
                        page_count,
                        page.start,
                        (page.start / wiki_size * 100),
                        # taskq.qsize() if taskq is not None else 'n/a',
                        # doneq.qsize() if doneq is not None else 'n/a',
                        page.ID,
                        page.title)
    except KeyboardInterrupt:
        print('KeyboardInterrupt: Stopping the reading of the dump early!')

    logger.info('Now finalizing Punkt training.')

    punkt.finalize_training(verbose=True)
    learned = punkt.get_params()
    sbd = nltk.tokenize.punkt.PunktSentenceTokenizer(learned)
    with open(PUNKT_FNAME, mode='wb') as f:
        pickle.dump(sbd, f, protocol=pickle.HIGHEST_PROTOCOL)
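

# A minimal usage sketch (not from the original module): once training has
# finished, the pickled tokenizer can be loaded back and applied to raw text.
# The punkt_path argument below is a hypothetical stand-in for PUNKT_FNAME.
def _load_and_split_example(punkt_path):
    with open(punkt_path, mode='rb') as f:
        sbd = pickle.load(f)
    text = ('Dr. Watson met Mr. Holmes at 221B Baker St. '
            'They discussed the case over tea. It was raining.')
    # PunktSentenceTokenizer.tokenize() segments raw text into sentences.
    for sentence in sbd.tokenize(text):
        print(sentence)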
def first_pass(fname, progress_count=None, max_pages=None):
    """Extract a Dictionary and create plain-text version of corpus."""
    logger = logging.getLogger('first_pass')

    wiki_size = os.path.getsize(fname)

    # Page task queues for parallel processing
    taskq = multiprocessing.Queue(MAX_QUEUE_ITEMS)
    doneq = multiprocessing.Queue(MAX_QUEUE_ITEMS)

    # Start worker processes
    logger.info('Starting workers')
    workers = []
    for i in range(NUMBER_OF_PROCESSES):
        p = multiprocessing.Process(target=first_pass_worker,
                                    args=(taskq, doneq))
        p.start()
        workers.append(p)

    # Start log writer process
    p = multiprocessing.Process(target=first_pass_writer, args=(doneq, fname))
    p.start()
    workers.append(p)

    # Process XML dump
    logger.info('Beginning XML parse')

    page_count = 0

    task_buff = []
    try:
        with open(fname, mode='rb') as wiki_dump:
            pages = page_generator(wiki_dump)
            for page in pages:
                task_buff.append(page)
                if len(task_buff) == CHUNK_SIZE:
                    taskq.put(task_buff)
                    task_buff = []
                page_count += 1
                if page_count == max_pages:
                    break
                if progress_count and page_count % progress_count == 0:
                    print(page_count, page.start,
                          (page.start / wiki_size * 100),
                          taskq.qsize(), doneq.qsize(),
                          page.ID, page.title)
    except KeyboardInterrupt:
        print('KeyboardInterrupt: Stopping the reading of the dump early!')
    finally:
        # Flush any partially filled task buffer
        if task_buff:
            taskq.put(task_buff)
        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            taskq.put(None)

    logger.info('All done! Processed %s total pages.', page_count)

    # Wait for all child processes to stop (especially that writer!)
    for p in workers:
        p.join()
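

# The bodies of first_pass_worker and first_pass_writer are not shown above.
# The following is only a rough sketch of the queue protocol that first_pass
# appears to assume: taskq carries lists of pages (up to CHUNK_SIZE long) plus
# one None sentinel per worker, and results go onto doneq for the writer
# process. The function name and the result tuple layout here are assumptions,
# not the module's actual code.
def _example_worker_sketch(taskq, doneq):
    while True:
        chunk = taskq.get()
        if chunk is None:
            # Shutdown sentinel sent by first_pass in its finally block.
            break
        for page in chunk:
            page.preprocess()  # assumed to mirror the Punkt training pass
            doneq.put((page.ID, page.title, page.text))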
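

# Hypothetical driver (not part of the original source) showing how the two
# passes might be invoked. The dump path and progress interval are
# placeholders; the __main__ guard matters because multiprocessing may
# re-import this module in child processes on spawn-based platforms.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    dump_path = 'enwiki-latest-pages-articles.xml'  # placeholder filename
    create_punkt_sent_detector(dump_path, progress_count=1000, max_pages=25000)
    first_pass(dump_path, progress_count=1000)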