Example #1
    def __init__(self, ix, procs=None, batchsize=100, subargs=None,
                 multisegment=False, **kwargs):
        # This is the "main" writer that will aggregate the results created by
        # the sub-tasks
        SegmentWriter.__init__(self, ix, **kwargs)

        self.procs = procs or cpu_count()
        # The maximum number of documents in each job file submitted to the
        # sub-tasks
        self.batchsize = batchsize
        # You can use keyword arguments or the "subargs" argument to pass
        # keyword arguments to the sub-writers
        self.subargs = subargs if subargs else kwargs
        # If multisegment is True, don't merge the segments created by the
        # sub-writers, just add them directly to the TOC
        self.multisegment = multisegment

        # A list to hold the sub-task Process objects
        self.tasks = []
        # A queue to pass the filenames of job files to the sub-tasks
        self.jobqueue = Queue(self.procs * 4)
        # A queue to get back the final results of the sub-tasks
        self.resultqueue = Queue()
        # A buffer for documents before they are flushed to a job file
        self.docbuffer = []

        self._grouping = 0
        self._added_sub = False
Example #2
    def __init__(self,
                 ix,
                 procs=None,
                 batchsize=100,
                 subargs=None,
                 multisegment=False,
                 **kwargs):
        # This is the "main" writer that will aggregate the results created by
        # the sub-tasks
        SegmentWriter.__init__(self, ix, **kwargs)

        self.procs = procs or cpu_count()
        # The maximum number of documents in each job file submitted to the
        # sub-tasks
        self.batchsize = batchsize
        # You can use keyword arguments or the "subargs" argument to pass
        # keyword arguments to the sub-writers
        self.subargs = subargs if subargs else kwargs
        # If multisegment is True, don't merge the segments created by the
        # sub-writers, just add them directly to the TOC
        self.multisegment = multisegment

        # A list to hold the sub-task Process objects
        self.tasks = []
        # A queue to pass the filenames of job files to the sub-tasks
        self.jobqueue = Queue(self.procs * 4)
        # A queue to get back the final results of the sub-tasks
        self.resultqueue = Queue()
        # A buffer for documents before they are flushed to a job file
        self.docbuffer = []

        self._grouping = 0
        self._added_sub = False
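The constructor in both excerpts only wires up the queues and buffers; neither shows how documents actually flow into them. A minimal sketch of the add_document side these fields imply, assuming a hypothetical _enqueue helper that dumps the buffer to a job file and puts the filename on jobqueue:

    def add_document(self, *args, **kwargs):
        # Buffer the document rather than indexing it immediately
        self.docbuffer.append((args, kwargs))
        # Record that at least one document went to the sub-writers, so
        # commit() knows to use the parallel merge path
        self._added_sub = True
        # Once the buffer holds batchsize documents, flush it to a job
        # file and queue the filename (hypothetical _enqueue helper)
        if len(self.docbuffer) >= self.batchsize:
            self._enqueue()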
Example #3
 def commit(self, mergetype=None, optimize=False, merge=True):
     if self._added_sub:
         # If documents have been added to sub-writers, use the parallel
         # merge commit code
         self._commit(mergetype, optimize, merge)
     else:
         # Otherwise, just do a regular-old commit
         SegmentWriter.commit(self, mergetype=mergetype, optimize=optimize, merge=merge)
Example #4
    def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs):
        SegmentWriter.__init__(self, ix, **kwargs)

        self.procs = procs or cpu_count()
        self.batchsize = batchsize
        self.subargs = subargs if subargs else kwargs
        self.tasks = [SegmentWriter(ix, _lk=False, **self.subargs) for _ in range(self.procs)]
        self.pointer = 0
        self._added_sub = False
Example #5
 def commit(self, mergetype=None, optimize=False, merge=True):
     if self._added_sub:
         # If documents have been added to sub-writers, use the parallel
         # merge commit code
         self._commit(mergetype, optimize, merge)
     else:
         # Otherwise, just do a regular-old commit
         SegmentWriter.commit(self,
                              mergetype=mergetype,
                              optimize=optimize,
                              merge=merge)
Example #6
    def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs):
        SegmentWriter.__init__(self, ix, **kwargs)

        self.procs = procs or cpu_count()
        self.batchsize = batchsize
        self.subargs = subargs if subargs else kwargs
        self.tasks = [
            SegmentWriter(ix, _lk=False, **self.subargs)
            for _ in range(self.procs)
        ]
        self.pointer = 0
        self._added_sub = False
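Here the sub-tasks are plain SegmentWriter objects rather than processes, and the pointer field suggests documents are dealt out round-robin. A hedged sketch of what that dispatch could look like (the method is not shown in the excerpt):

    def add_document(self, *args, **kwargs):
        # Hand the document to the next sub-writer in round-robin order
        self.tasks[self.pointer].add_document(*args, **kwargs)
        self.pointer = (self.pointer + 1) % len(self.tasks)
        self._added_sub = True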
Example #7
 def writer(self, procs=1, **kwargs):
     if procs > 1:
         from whoosh.filedb.multiproc2 import MpWriter
         return MpWriter(self, **kwargs)
     else:
         from whoosh.filedb.filewriting import SegmentWriter
         return SegmentWriter(self, **kwargs)
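From the caller's point of view, this factory method makes multiprocess indexing opt-in through a single argument. A short usage sketch, assuming an existing index directory "indexdir" whose schema has title and content fields; note that in this excerpt procs only selects the writer class, and MpWriter then picks its own process count via cpu_count():

    from whoosh.index import open_dir

    ix = open_dir("indexdir")
    # procs > 1 selects MpWriter; other keyword arguments such as
    # batchsize are passed through to it
    writer = ix.writer(procs=4, batchsize=256)
    writer.add_document(title="Hello", content="example document")
    writer.commit()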
Example #8
    def run(self):
        # This is the main loop of the process. OK, so the way this works is
        # kind of brittle and stupid, but I had to figure out how to use the
        # multiprocessing module, work around bugs, and address performance
        # issues, so there is at least some reasoning behind some of this

        # The "parent" task farms individual documents out to the subtasks for
        # indexing. You could pickle the actual documents and put them in the
        # queue, but that is not very performant. Instead, we assume the tasks
        # share a filesystem and use that to pass the information around. The
        # parent task writes a certain number of documents to a file, then puts
        # the filename on the "job queue". A subtask gets the filename off the
        # queue and reads through the file processing the documents.

        jobqueue = self.jobqueue
        resultqueue = self.resultqueue
        multisegment = self.multisegment

        # Open a placeholder object representing the index
        ix = self.storage.open_index(self.indexname)
        # Open a writer for the index. The _lk=False parameter means to not try
        # to lock the index (the parent object that started me takes care of
        # locking the index)
        writer = self.writer = SegmentWriter(ix, _lk=False, **self.kwargs)

        # If the parent task calls cancel() on me, it will set self.running to
        # False, so I'll notice the next time through the loop
        while self.running:
            # Take an object off the job queue
            jobinfo = jobqueue.get()
            # If the object is None, it means the parent task wants me to
            # finish up
            if jobinfo is None:
                break
            # The object from the queue is a tuple of (filename,
            # number_of_docs_in_file). Pass those two pieces of information as
            # arguments to _process_file().
            self._process_file(*jobinfo)

        if not self.running:
            # I was cancelled, so I'll cancel my underlying writer
            writer.cancel()
        else:
            if multisegment:
                # Actually finish the segment and return it with no run
                runname = None
                writer._flush_segment()
                writer._close_segment()
                writer._assemble_segment()
                segment = writer.get_segment()
            else:
                # Merge all runs in the writer's pool into one run, close the
                # segment, and return the run name and the segment
                k = self.kwargs.get("k", 64)
                runname, segment = finish_subsegment(writer, k)

            # Put the results (the run filename and the segment object) on the
            # result queue
            resultqueue.put((runname, segment), timeout=5)
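The comments above describe the producer side only in prose: the parent writes a batch of documents to a file on the shared filesystem and passes just the filename and count through the queue. A hedged sketch of that parent-side step, assuming pickle serialization and a temp file (the helper name and details are illustrative):

    import os
    import pickle
    import tempfile

    def _enqueue(self):
        # Dump the buffered documents to a job file on the shared
        # filesystem instead of pickling them onto the queue itself
        fd, jobname = tempfile.mkstemp(suffix=".jobfile")
        with os.fdopen(fd, "wb") as f:
            pickle.dump(self.docbuffer, f, -1)
        # Only the filename and the document count travel through the
        # (much cheaper) job queue
        self.jobqueue.put((jobname, len(self.docbuffer)))
        self.docbuffer = []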
Example #9
 def run(self):
     # Older variant of the sub-task loop: instead of job files, the
     # parent pickles each document's keyword arguments directly onto a
     # "posting queue"
     pqueue = self.postingqueue

     index = self.storage.open_index(self.indexname)
     writer = SegmentWriter(index, name=self.segmentname, lock=False,
                            **self.kwargs)

     while self.running:
         args = pqueue.get()
         # None on the queue means the parent wants me to finish up
         if args is None:
             break

         writer.add_document(**args)

     if not self.running:
         # I was cancelled, so throw away the partial work
         writer.cancel()
         self.terminate()
     else:
         # Flush the pool and stash the finished segment for the parent
         # to collect
         writer.pool.finish(writer.docnum, writer.lengthfile,
                            writer.termsindex, writer.postwriter)
         self._segment = writer._getsegment()
Example #10
    def run(self):
        jobqueue = self.jobqueue
        resultqueue = self.resultqueue
        ix = self.storage.open_index(self.indexname)
        writer = self.writer = SegmentWriter(ix, _lk=False, **self.kwargs)

        while self.running:
            # Each job is a (filename, doc_count) tuple; None means the
            # parent wants me to finish up
            jobinfo = jobqueue.get()
            if jobinfo is None:
                break
            self._process_file(*jobinfo)

        if not self.running:
            # I was cancelled, so cancel the underlying writer
            writer.cancel()
        else:
            # Merge everything in the pool down to a single run
            writer.pool.save()
            writer.pool.reduce_to(1, self.kwargs.get("k", 64))
            runname = writer.pool.runs[0]
            # Write the field lengths to a temp file, then hand the run
            # name, document count, and lengths filename to the parent
            doccount = writer.doc_count()
            lenname, lenfile = self.storage.create_temp()
            writer.lengths.to_file(lenfile, doccount)
            resultqueue.put((runname, doccount, lenname), timeout=5)
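On the other side of resultqueue, the parent presumably shuts the sub-tasks down and collects one result tuple from each. A minimal sketch of that collection step, assuming tasks holds the Process objects from Examples #1 and #2:

    # One None per sub-task acts as the "finish up" signal seen in run()
    for task in self.tasks:
        self.jobqueue.put(None)
    # Each sub-task answers with exactly one result tuple
    results = [self.resultqueue.get() for _ in self.tasks]
    for task in self.tasks:
        task.join()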
Example #11
    def run(self):
        jobqueue = self.jobqueue
        ix = self.storage.open_index(self.indexname)
        writer = self.writer = SegmentWriter(ix,
                                             _lk=False,
                                             name=self.segname,
                                             **self.kwargs)

        # This variant can be handed an initial job file before the
        # process starts pulling from the queue
        if self.firstjob:
            self._add_file(self.firstjob)

        while self.running:
            # None on the queue means the parent wants me to finish up
            args = jobqueue.get()
            if args is None:
                break
            self._add_file(args)

        if not self.running:
            # I was cancelled, so cancel the underlying writer
            writer.cancel()
        else:
            # Finish the pool, close the segment files, and send the
            # completed segment back on the result queue
            writer.pool.finish(writer.termswriter, writer.docnum,
                               writer.lengthfile)
            writer._close_all()
            self.resultqueue.put(writer._getsegment())
Example #12
 def cancel(self):
     try:
         # Cancel all the sub-tasks first
         for task in self.tasks:
             task.cancel()
     finally:
         # Always cancel the "main" writer as well, even if cancelling a
         # sub-task raises
         SegmentWriter.cancel(self)
Example #13
 def commit(self, mergetype=None, optimize=False, merge=True):
     if self._added_sub:
         # If documents have been added to sub-writers, use the parallel
         # merge commit code
         self._commit(mergetype, optimize, merge)
     else:
         # Otherwise, just do a regular-old commit
         SegmentWriter.commit(self, mergetype=mergetype, optimize=optimize,
                              merge=merge)
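Taken together, commit() and cancel() support the usual writer lifecycle: add documents, commit on success, cancel on failure. A short usage sketch (the documents iterable is illustrative):

    writer = ix.writer(procs=4)
    try:
        for fields in documents:
            writer.add_document(**fields)
        writer.commit(optimize=True)
    except Exception:
        # Tear down the sub-tasks and the main writer
        writer.cancel()
        raise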