def commit(self, **kwargs):
    """Flush any buffered documents, shut down the worker tasks, and fold
    their partial results (length files and posting runs) back into the
    parent writer.
    """
    writer = self.writer
    pool = writer.pool

    # Push the documents still sitting in the doc buffer out to the workers
    self._enqueue()

    # One sentinel per worker tells each task to finish up...
    for _ in self.tasks:
        self.jobqueue.put(None)
    # ...then wait for every task to exit
    for t in self.tasks:
        t.join()

    # Collect one (runname, doccount, lenname) triple per worker
    results = [self.resultqueue.get(timeout=5) for _ in self.tasks]

    if results:
        # Fold each sub-task's field lengths into the writer's lengths,
        # then discard the temporary length file
        for runname, doccount, lenname in results:
            lenfile = writer.storage.open_file(lenname)
            writer.lengths.add_other(Lengths.from_file(lenfile, doccount))
            writer.storage.delete_file(lenname)

        # The first run keeps its document numbers as-is; each later run is
        # renumbered upward by the running document-count offset
        offset = results[0][1]
        runreaders = [pool._read_run(results[0][0])]
        for runname, doccount, lenname in results[1:]:
            runreaders.append(self._read_and_renumber_run(runname, offset))
            offset += doccount

        # Merge-sort the runs into the terms writer
        writer.termswriter.add_iter(imerge(runreaders), writer.lengths)
def _merge_subsegments(self, results, sources, mergetype, optimize, merge):
    """Merge the sub-segments produced by worker tasks into this writer.

    Copies per-document information (stored fields, field lengths, vectors)
    from each sub-segment, appends renumbered posting-run iterators to
    ``sources``, and merges everything into the field writer.

    :param results: sequence of (runname, segment) pairs from the sub-tasks.
    :param sources: list of posting iterators to extend in place.
    """
    schema = self.schema
    storage = self.storage
    codec = self.codec
    fieldnames = list(schema.names())

    # Merge per-document information
    pdw = self.perdocwriter
    # Names of fields that store term vectors
    vnames = set(schema.vector_names())
    basedoc = self.docnum
    # A list to remember field length readers for each sub-segment (we'll
    # re-use them below)
    lenreaders = [pdw.lengths_reader()]
    for _, segment in results:
        # Create a field length reader for the sub-segment
        lenreader = codec.lengths_reader(storage, segment)
        # Remember it in the list for later
        lenreaders.append(lenreader)

        # Vector reader for the sub-segment. FIX: only open a vector reader
        # if the schema actually has vectored fields -- a segment without
        # vectors has no vector file, so opening unconditionally could fail
        # (matches the guarded variant of this method elsewhere in the file)
        vreader = None
        if schema.has_vectored_fields():
            vreader = codec.vector_reader(storage, segment)

        # Stored field reader for the sub-segment
        sfreader = codec.stored_fields_reader(storage, segment)
        # Iterating on the stored field reader yields a dictionary of
        # stored fields for *every* document in the segment (even if the
        # document has no stored fields it should yield {})
        for i, fs in enumerate(sfreader):
            # Add the base doc count to the sub-segment doc num
            pdw.start_doc(basedoc + i)
            # Call add_field to store the field values and lengths
            for fieldname in fieldnames:
                value = fs.get(fieldname)
                length = lenreader.doc_field_length(i, fieldname)
                pdw.add_field(fieldname, schema[fieldname], value, length)

            # Copy over the vectors. TODO: would be much faster to bulk-
            # copy the postings
            if vreader:
                for fieldname in vnames:
                    if (i, fieldname) in vreader:
                        field = schema[fieldname]
                        vmatcher = vreader.matcher(i, fieldname, field.vector)
                        pdw.add_vector_matcher(fieldname, field, vmatcher)
            pdw.finish_doc()
        basedoc += segment.doccount

    # Create a list of iterators from the run filenames, renumbering each
    # run's documents by the running base document number
    basedoc = self.docnum
    for runname, segment in results:
        items = self._read_and_renumber_run(runname, basedoc)
        sources.append(items)
        basedoc += segment.doccount

    # Create a MultiLengths object combining the length files from the
    # subtask segments
    mlens = base.MultiLengths(lenreaders)
    # Merge the iterators into the field writer
    self.fieldwriter.add_postings(schema, mlens, imerge(sources))
    self.docnum = basedoc
def _merge_subsegments(self, results, mergetype, optimize, merge):
    """Merge the sub-segments produced by worker tasks into this writer.

    Copies per-document information (stored fields, field lengths, term
    vectors) from each sub-segment into the per-document writer, then
    merges the posting runs (plus any postings added directly to this
    writer) into the field writer.

    :param results: sequence of (runname, segment) pairs from the sub-tasks.
    """
    schema = self.schema
    storage = self.storage
    codec = self.codec
    fieldnames = list(schema.names())

    # Merge per-document information
    pdw = self.perdocwriter
    # Names of fields that store term vectors
    vnames = set(schema.vector_names())
    basedoc = self.docnum
    # A list to remember field length readers for each sub-segment (we'll
    # re-use them below)
    lenreaders = [pdw.lengths_reader()]
    for _, segment in results:
        # Create a field length reader for the sub-segment
        lenreader = codec.lengths_reader(storage, segment)
        # Remember it in the list for later
        lenreaders.append(lenreader)

        # Vector reader for the sub-segment -- only opened when the schema
        # has vectored fields (a segment without vectors has no vector file)
        vreader = None
        if schema.has_vectored_fields():
            vreader = codec.vector_reader(storage, segment)

        # Stored field reader for the sub-segment
        sfreader = codec.stored_fields_reader(storage, segment)
        # Iterating on the stored field reader yields a dictionary of
        # stored fields for *every* document in the segment (even if the
        # document has no stored fields it should yield {})
        for i, fs in enumerate(sfreader):
            # Add the base doc count to the sub-segment doc num
            pdw.start_doc(basedoc + i)
            # Call add_field to store the field values and lengths
            for fieldname in fieldnames:
                value = fs.get(fieldname)
                length = lenreader.doc_field_length(i, fieldname)
                pdw.add_field(fieldname, schema[fieldname], value, length)

            # Copy over the vectors. TODO: would be much faster to bulk-
            # copy the postings
            if vreader:
                for fieldname in vnames:
                    if (i, fieldname) in vreader:
                        field = schema[fieldname]
                        vformat = field.vector
                        vmatcher = vreader.matcher(i, fieldname, vformat)
                        pdw.add_vector_matcher(fieldname, field, vmatcher)
            pdw.finish_doc()
        basedoc += segment.doccount

    # If information was added to this writer through the conventional
    # methods (e.g. add_reader or merging segments), add it as an extra
    # source of postings
    if self._added:
        sources = [self.pool.iter_postings()]
    else:
        sources = []

    # Add iterators from the run filenames, renumbering each run's
    # documents by the running base document number
    basedoc = self.docnum
    for runname, segment in results:
        items = self._read_and_renumber_run(runname, basedoc)
        sources.append(items)
        basedoc += segment.doccount

    # Create a MultiLengths object combining the length files from the
    # subtask segments
    mlens = base.MultiLengths(lenreaders)
    # Merge the iterators into the field writer
    self.fieldwriter.add_postings(schema, mlens, imerge(sources))
    self.docnum = basedoc
    self._added = True