def __init__(self, ix, procs=None, batchsize=100, subargs=None,
             multisegment=False, **kwargs):
    # This is the "main" writer that will aggregate the results created by
    # the sub-tasks
    SegmentWriter.__init__(self, ix, **kwargs)

    self.procs = procs or cpu_count()
    # The maximum number of documents in each job file submitted to the
    # sub-tasks
    self.batchsize = batchsize
    # You can use keyword arguments or the "subargs" argument to pass
    # keyword arguments to the sub-writers
    self.subargs = subargs if subargs else kwargs
    # If multisegment is True, don't merge the segments created by the
    # sub-writers, just add them directly to the TOC
    self.multisegment = multisegment

    # A list to hold the sub-task Process objects
    self.tasks = []
    # A queue to pass the filenames of job files to the sub-tasks
    self.jobqueue = Queue(self.procs * 4)
    # A queue to get back the final results of the sub-tasks
    self.resultqueue = Queue()
    # A buffer for documents before they are flushed to a job file
    self.docbuffer = []

    self._grouping = 0
    self._added_sub = False
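# Usage sketch (illustrative, not part of the class above): "indexdir" and the
# limitmb value are assumptions for the example; normally you obtain this
# writer through ix.writer(procs=...) rather than constructing it directly.
# Because subargs is not given here, the extra keyword argument (limitmb) is
# forwarded to every sub-writer as well as to the parent writer.
from whoosh import index
from whoosh.multiproc import MpWriter

ix = index.open_dir("indexdir")
writer = MpWriter(ix, procs=4, batchsize=100, limitmb=128)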
def commit(self, mergetype=None, optimize=None, merge=None):
    if self._added_sub:
        # If documents have been added to sub-writers, use the parallel
        # merge commit code
        self._commit(mergetype, optimize, merge)
    else:
        # Otherwise, just do a regular old commit
        SegmentWriter.commit(self, mergetype=mergetype, optimize=optimize,
                             merge=merge)
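# Continuing the hedged usage sketch above: after adding documents, an
# explicit commit() either runs the parallel merge path (if any documents
# reached the sub-writers) or falls back to the plain SegmentWriter commit.
# The "content" field name is an assumption about the schema.
writer.add_document(content=u"hello world")
writer.commit(optimize=True)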
def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs):
    SegmentWriter.__init__(self, ix, **kwargs)

    self.procs = procs or cpu_count()
    self.batchsize = batchsize
    self.subargs = subargs if subargs else kwargs
    # Instead of spawning processes, create the sub-writers directly in this
    # process. _lk=False tells each sub-writer not to try to lock the index
    # (this parent writer already holds the lock)
    self.tasks = [SegmentWriter(ix, _lk=False, **self.subargs)
                  for _ in xrange(self.procs)]
    # Index of the sub-writer that will receive the next document
    self.pointer = 0
    self._added_sub = False
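# Hedged sketch (not necessarily the library's code): with the in-process
# sub-writers set up above, documents can be distributed round-robin using
# self.pointer, which is how the fields initialized here are meant to be used.
def add_document(self, **fields):
    self.tasks[self.pointer].add_document(**fields)
    self.pointer = (self.pointer + 1) % len(self.tasks)
    self._added_sub = True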
def writer(self, procs=1, **kwargs):
    if procs > 1:
        from whoosh.multiproc import MpWriter
        return MpWriter(self, procs=procs, **kwargs)
    else:
        from whoosh.writing import SegmentWriter
        return SegmentWriter(self, **kwargs)
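# Usage sketch for the factory method above (assumes an existing index at
# "indexdir" whose schema has a "content" field): asking for more than one
# process returns the multiprocessing writer, otherwise a plain SegmentWriter.
# The with-block commits the writer on a clean exit.
from whoosh import index

ix = index.open_dir("indexdir")
with ix.writer(procs=4, batchsize=256) as w:
    w.add_document(content=u"indexed in parallel")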
def run(self):
    # This is the main loop of the process. OK, so the way this works is
    # kind of brittle and stupid, but I had to figure out how to use the
    # multiprocessing module, work around bugs, and address performance
    # issues, so there is at least some reasoning behind some of this

    # The "parent" task farms individual documents out to the subtasks for
    # indexing. You could pickle the actual documents and put them in the
    # queue, but that is not very performant. Instead, we assume the tasks
    # share a filesystem and use that to pass the information around. The
    # parent task writes a certain number of documents to a file, then puts
    # the filename on the "job queue". A subtask gets the filename off the
    # queue and reads through the file processing the documents.

    jobqueue = self.jobqueue
    resultqueue = self.resultqueue
    multisegment = self.multisegment

    # Open a placeholder object representing the index
    ix = self.storage.open_index(self.indexname)
    # Open a writer for the index. The _lk=False parameter means to not try
    # to lock the index (the parent object that started me takes care of
    # locking the index)
    writer = self.writer = SegmentWriter(ix, _lk=False, **self.kwargs)

    # If the parent task calls cancel() on me, it will set self.running to
    # False, so I'll notice the next time through the loop
    while self.running:
        # Take an object off the job queue
        jobinfo = jobqueue.get()
        # If the object is None, it means the parent task wants me to
        # finish up
        if jobinfo is None:
            break
        # The object from the queue is a tuple of (filename,
        # number_of_docs_in_file). Pass those two pieces of information as
        # arguments to _process_file().
        self._process_file(*jobinfo)
        # jobqueue.task_done()

    if not self.running:
        # I was cancelled, so I'll cancel my underlying writer
        writer.cancel()
    else:
        if multisegment:
            # Actually finish the segment and return it with no run
            runname = None
            fieldnames = writer.pool.fieldnames
            segment = writer._finalize_segment()
        else:
            # Merge all runs in the writer's pool into one run, close the
            # segment, and return the run name and the segment
            k = self.kwargs.get("k", 64)
            runname, fieldnames, segment = finish_subsegment(writer, k)

        # Put the results (the run filename and the segment object) on the
        # result queue
        resultqueue.put((runname, fieldnames, segment), timeout=5)
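# Hedged sketch of the parent side of the protocol described above (not the
# library's actual implementation): a batch of buffered documents is written
# to a temporary file and only the (filename, doc_count) tuple travels over
# the job queue, so a worker can read the documents back from the shared
# filesystem instead of receiving them pickled through the queue.
import os
import pickle
import tempfile

def enqueue_batch(jobqueue, docbuffer):
    fd, path = tempfile.mkstemp(suffix=".docs")
    with os.fdopen(fd, "wb") as f:
        for doc in docbuffer:
            pickle.dump(doc, f)
    # The worker's _process_file(filename, doc_count) counterpart would
    # unpickle doc_count documents from this file and index them.
    jobqueue.put((path, len(docbuffer)))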
def cancel(self):
    try:
        for task in self.tasks:
            task.cancel()
    finally:
        # Cancel the parent writer even if cancelling a sub-task fails
        SegmentWriter.cancel(self)