def __init__(self, ix, procs=None, batchsize=100, subargs=None,
             multisegment=False, **kwargs):
    # This is the "main" writer that will aggregate the results created by
    # the sub-tasks
    SegmentWriter.__init__(self, ix, **kwargs)

    self.procs = procs or cpu_count()
    # The maximum number of documents in each job file submitted to the
    # sub-tasks
    self.batchsize = batchsize
    # You can use keyword arguments or the "subargs" argument to pass
    # keyword arguments to the sub-writers
    self.subargs = subargs if subargs else kwargs
    # If multisegment is True, don't merge the segments created by the
    # sub-writers, just add them directly to the TOC
    self.multisegment = multisegment

    # A list to hold the sub-task Process objects
    self.tasks = []
    # A queue to pass the filenames of job files to the sub-tasks
    self.jobqueue = Queue(self.procs * 4)
    # A queue to get back the final results of the sub-tasks
    self.resultqueue = Queue()
    # A buffer for documents before they are flushed to a job file
    self.docbuffer = []

    self._grouping = 0
    self._added_sub = False
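# Usage sketch (illustrative, not part of the class above): "indexdir" and the
# limitmb value are assumptions for the example; normally you obtain this
# writer through ix.writer(procs=...) rather than constructing it directly.
# Because subargs is not given here, the extra keyword argument (limitmb) is
# forwarded to every sub-writer as well as to the parent writer.
from whoosh import index
from whoosh.multiproc import MpWriter

ix = index.open_dir("indexdir")
writer = MpWriter(ix, procs=4, batchsize=100, limitmb=128)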
def commit(self, mergetype=None, optimize=None, merge=None):
    if self._added_sub:
        # If documents have been added to sub-writers, use the parallel
        # merge commit code
        self._commit(mergetype, optimize, merge)
    else:
        # Otherwise, just do a regular old commit
        SegmentWriter.commit(self, mergetype=mergetype, optimize=optimize,
                             merge=merge)
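# Continuing the hedged usage sketch above: after adding documents, an
# explicit commit() either runs the parallel merge path (if any documents
# reached the sub-writers) or falls back to the plain SegmentWriter commit.
# The "content" field name is an assumption about the schema.
writer.add_document(content=u"hello world")
writer.commit(optimize=True)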
def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs):
    SegmentWriter.__init__(self, ix, **kwargs)

    self.procs = procs or cpu_count()
    self.batchsize = batchsize
    self.subargs = subargs if subargs else kwargs
    # Instead of spawning processes, create the sub-writers directly in this
    # process. _lk=False tells each sub-writer not to try to lock the index
    # (this parent writer already holds the lock)
    self.tasks = [SegmentWriter(ix, _lk=False, **self.subargs)
                  for _ in xrange(self.procs)]
    # Index of the sub-writer that will receive the next document
    self.pointer = 0
    self._added_sub = False
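# Hedged sketch (not necessarily the library's code): with the in-process
# sub-writers set up above, documents can be distributed round-robin using
# self.pointer, which is how the fields initialized here are meant to be used.
def add_document(self, **fields):
    self.tasks[self.pointer].add_document(**fields)
    self.pointer = (self.pointer + 1) % len(self.tasks)
    self._added_sub = True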
def writer(self, procs=1, **kwargs):
    if procs > 1:
        from whoosh.multiproc import MpWriter
        return MpWriter(self, procs=procs, **kwargs)
    else:
        from whoosh.writing import SegmentWriter
        return SegmentWriter(self, **kwargs)
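# Usage sketch for the factory method above (assumes an existing index at
# "indexdir" whose schema has a "content" field): asking for more than one
# process returns the multiprocessing writer, otherwise a plain SegmentWriter.
# The with-block commits the writer on a clean exit.
from whoosh import index

ix = index.open_dir("indexdir")
with ix.writer(procs=4, batchsize=256) as w:
    w.add_document(content=u"indexed in parallel")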
def run(self):
    # This is the main loop of the process. OK, so the way this works is
    # kind of brittle and stupid, but I had to figure out how to use the
    # multiprocessing module, work around bugs, and address performance
    # issues, so there is at least some reasoning behind some of this

    # The "parent" task farms individual documents out to the subtasks for
    # indexing. You could pickle the actual documents and put them in the
    # queue, but that is not very performant. Instead, we assume the tasks
    # share a filesystem and use that to pass the information around. The
    # parent task writes a certain number of documents to a file, then puts
    # the filename on the "job queue". A subtask gets the filename off the
    # queue and reads through the file processing the documents.

    jobqueue = self.jobqueue
    resultqueue = self.resultqueue
    multisegment = self.multisegment

    # Open a placeholder object representing the index
    ix = self.storage.open_index(self.indexname)
    # Open a writer for the index. The _lk=False parameter means to not try
    # to lock the index (the parent object that started me takes care of
    # locking the index)
    writer = self.writer = SegmentWriter(ix, _lk=False, **self.kwargs)

    # If the parent task calls cancel() on me, it will set self.running to
    # False, so I'll notice the next time through the loop
    while self.running:
        # Take an object off the job queue
        jobinfo = jobqueue.get()
        # If the object is None, it means the parent task wants me to
        # finish up
        if jobinfo is None:
            break
        # The object from the queue is a tuple of (filename,
        # number_of_docs_in_file). Pass those two pieces of information as
        # arguments to _process_file().
        self._process_file(*jobinfo)
        # jobqueue.task_done()

    if not self.running:
        # I was cancelled, so I'll cancel my underlying writer
        writer.cancel()
    else:
        if multisegment:
            # Actually finish the segment and return it with no run
            runname = None
            fieldnames = writer.pool.fieldnames
            segment = writer._finalize_segment()
        else:
            # Merge all runs in the writer's pool into one run, close the
            # segment, and return the run name and the segment
            k = self.kwargs.get("k", 64)
            runname, fieldnames, segment = finish_subsegment(writer, k)

        # Put the results (the run filename and the segment object) on the
        # result queue
        resultqueue.put((runname, fieldnames, segment), timeout=5)
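# Hedged sketch of the parent side of the protocol described above (not the
# library's actual implementation): a batch of buffered documents is written
# to a temporary file and only the (filename, doc_count) tuple travels over
# the job queue, so a worker can read the documents back from the shared
# filesystem instead of receiving them pickled through the queue.
import os
import pickle
import tempfile

def enqueue_batch(jobqueue, docbuffer):
    fd, path = tempfile.mkstemp(suffix=".docs")
    with os.fdopen(fd, "wb") as f:
        for doc in docbuffer:
            pickle.dump(doc, f)
    # The worker's _process_file(filename, doc_count) counterpart would
    # unpickle doc_count documents from this file and index them.
    jobqueue.put((path, len(docbuffer)))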
def cancel(self):
    try:
        for task in self.tasks:
            task.cancel()
    finally:
        # Cancel the parent writer even if cancelling a sub-task fails
        SegmentWriter.cancel(self)