def test_stored_fields():
    with TempStorage("storedfields") as st:
        sf = st.create_file("test.sf")
        sfw = StoredFieldWriter(sf, ["a", "b"])
        sfw.append({"a": "hello", "b": "there"})
        sfw.append({"a": "one", "b": "two"})
        sfw.append({"a": "alfa", "b": "bravo"})
        sfw.close()

        sf = st.open_file("test.sf")
        sfr = StoredFieldReader(sf)
        assert_equal(sfr[0], {"a": "hello", "b": "there"})
        assert_equal(sfr[2], {"a": "alfa", "b": "bravo"})
        assert_equal(sfr[1], {"a": "one", "b": "two"})
        sfr.close()
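
# A minimal round-trip sketch complementing the test above. It is hedged: it
# relies only on the calls exercised there (TempStorage, StoredFieldWriter's
# append()/close(), StoredFieldReader indexing); the file name, field name,
# and function name are arbitrary illustrations, not part of the library API.
def example_stored_fields_roundtrip():
    with TempStorage("storedexample") as st:
        f = st.create_file("docs.sf")
        w = StoredFieldWriter(f, ["title"])
        for i in range(3):
            # Each append() records one document's stored-field dict, in
            # document-number order.
            w.append({"title": "doc%d" % i})
        w.close()

        r = StoredFieldReader(st.open_file("docs.sf"))
        # Documents are addressable by position, in any access order.
        assert r[1] == {"title": "doc1"}
        assert r[0] == {"title": "doc0"}
        r.close()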
class SegmentWriter(IndexWriter):
    def __init__(self, ix, poolclass=None, procs=0, blocklimit=128,
                 timeout=0.0, delay=0.1, name=None, _l=True, **poolargs):
        self.writelock = None
        if _l:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(self.writelock.acquire, timeout=timeout,
                           delay=delay):
                raise LockError
        self.readlock = ix.lock("READLOCK")

        info = ix._read_toc()
        self.schema = info.schema
        self.segments = info.segments
        self.storage = ix.storage
        self.indexname = ix.indexname
        self.is_closed = False

        self.blocklimit = blocklimit
        self.segment_number = info.segment_counter + 1
        self.generation = info.generation + 1

        self._doc_offsets = []
        base = 0
        for s in self.segments:
            self._doc_offsets.append(base)
            base += s.doc_count_all()

        self.name = name or Segment.basename(self.indexname,
                                             self.segment_number)
        self.docnum = 0
        self.fieldlength_totals = defaultdict(int)
        self._added = False
        self._unique_cache = {}

        # Create a temporary segment to use its .*_filename attributes
        segment = Segment(self.name, self.generation, 0, None, None)

        # Terms index
        tf = self.storage.create_file(segment.termsindex_filename)
        ti = TermIndexWriter(tf)
        # Term postings file
        pf = self.storage.create_file(segment.termposts_filename)
        pw = FilePostingWriter(pf, blocklimit=blocklimit)
        # Terms writer
        self.termswriter = TermsWriter(self.schema, ti, pw)

        if self.schema.has_vectored_fields():
            # Vector index
            vf = self.storage.create_file(segment.vectorindex_filename)
            self.vectorindex = TermVectorWriter(vf)

            # Vector posting file
            vpf = self.storage.create_file(segment.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpf, stringids=True)
        else:
            self.vectorindex = None
            self.vpostwriter = None

        # Stored fields file
        sf = self.storage.create_file(segment.storedfields_filename)
        self.storedfields = StoredFieldWriter(sf, self.schema.stored_names())

        # Field lengths file
        self.lengthfile = self.storage.create_file(segment.fieldlengths_filename)

        # Create the pool
        if poolclass is None:
            if procs > 1:
                from whoosh.filedb.multiproc import MultiPool
                poolclass = MultiPool
            else:
                poolclass = TempfilePool
        self.pool = poolclass(self.schema, procs=procs, **poolargs)

    def _check_state(self):
        if self.is_closed:
            raise IndexingError("This writer is closed")

    def add_field(self, fieldname, fieldspec):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).add_field(fieldname, fieldspec)

    def remove_field(self, fieldname):
        self._check_state()
        if self._added:
            raise Exception("Can't modify schema after adding data to writer")
        super(SegmentWriter, self).remove_field(fieldname)

    def _document_segment(self, docnum):
        # Returns the position in self.segments of the segment containing the
        # given document number (not the Segment object itself; see
        # _segment_and_docnum for that).
        offsets = self._doc_offsets
        if len(offsets) == 1:
            return 0
        return bisect_right(offsets, docnum) - 1

    def _segment_and_docnum(self, docnum):
        # Returns an (index.Segment, segment_docnum) pair for the segment
        # containing the given document number.
        segmentnum = self._document_segment(docnum)
        offset = self._doc_offsets[segmentnum]
        segment = self.segments[segmentnum]
        return segment, docnum - offset
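
    # A worked example of the offset arithmetic above (illustrative only):
    # with three segments of 5, 10, and 7 documents, self._doc_offsets is
    # [0, 5, 15]. For docnum=12, bisect_right([0, 5, 15], 12) - 1 == 1, so
    # the document lives in segment 1 at segment-relative number 12 - 5 == 7.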
""" return any(s.has_deletions() for s in self.segments) def delete_document(self, docnum, delete=True): self._check_state() if docnum >= sum(seg.doccount for seg in self.segments): raise IndexingError("No document ID %r in this index" % docnum) segment, segdocnum = self._segment_and_docnum(docnum) segment.delete_document(segdocnum, delete=delete) def deleted_count(self): """ :returns: the total number of deleted documents in the index. """ return sum(s.deleted_count() for s in self.segments) def is_deleted(self, docnum): segment, segdocnum = self._segment_and_docnum(docnum) return segment.is_deleted(segdocnum) def reader(self, reuse=None): self._check_state() from whoosh.filedb.fileindex import FileIndex return FileIndex._reader(self.storage, self.schema, self.segments, self.generation, reuse=reuse) def add_reader(self, reader): self._check_state() startdoc = self.docnum has_deletions = reader.has_deletions() if has_deletions: docmap = {} fieldnames = set(self.schema.names()) # Add stored documents, vectors, and field lengths for docnum in reader.all_doc_ids(): if (not has_deletions) or (not reader.is_deleted(docnum)): d = dict(item for item in reader.stored_fields(docnum).iteritems() if item[0] in fieldnames) # We have to append a dictionary for every document, even if # it's empty. self.storedfields.append(d) if has_deletions: docmap[docnum] = self.docnum for fieldname, length in reader.doc_field_lengths(docnum): if fieldname in fieldnames: self.pool.add_field_length(self.docnum, fieldname, length) for fieldname in reader.schema.vector_names(): if (fieldname in fieldnames and reader.has_vector(docnum, fieldname)): vpostreader = reader.vector(docnum, fieldname) self._add_vector_reader(self.docnum, fieldname, vpostreader) self.docnum += 1 for fieldname, text, _, _ in reader: if fieldname in fieldnames: postreader = reader.postings(fieldname, text) while postreader.is_active(): docnum = postreader.id() valuestring = postreader.value() if has_deletions: newdoc = docmap[docnum] else: newdoc = startdoc + docnum self.pool.add_posting(fieldname, text, newdoc, postreader.weight(), valuestring) postreader.next() self._added = True def add_document(self, **fields): #from whoosh.util import now #t = now() self._check_state() schema = self.schema # Sort the keys fieldnames = sorted([name for name in fields.keys() if not name.startswith("_")]) # Check if the caller gave us a bogus field for name in fieldnames: if name not in schema: raise UnknownFieldError("No field named %r in %s" % (name, schema)) storedvalues = {} docnum = self.docnum for fieldname in fieldnames: value = fields.get(fieldname) if value is not None: field = schema[fieldname] if field.indexed: self.pool.add_content(docnum, fieldname, field, value) vformat = field.vector if vformat: vlist = sorted((w, weight, valuestring) for w, freq, weight, valuestring in vformat.word_values(value, mode="index")) self._add_vector(docnum, fieldname, vlist) if field.stored: # Caller can override the stored value by including a key # _stored_<fieldname> storedvalue = value storedname = "_stored_" + fieldname if storedname in fields: storedvalue = fields[storedname] storedvalues[fieldname] = storedvalue self._added = True self.storedfields.append(storedvalues) self.docnum += 1 #print "%f" % (now() - t) #def update_document(self, **fields): def _add_vector(self, docnum, fieldname, vlist): vpostwriter = self.vpostwriter offset = vpostwriter.start(self.schema[fieldname].vector) for text, weight, valuestring in vlist: assert isinstance(text, unicode), "%r 
is not unicode" % text vpostwriter.write(text, weight, valuestring, 0) vpostwriter.finish() self.vectorindex.add((docnum, fieldname), offset) def _add_vector_reader(self, docnum, fieldname, vreader): vpostwriter = self.vpostwriter offset = vpostwriter.start(self.schema[fieldname].vector) while vreader.is_active(): # text, weight, valuestring, fieldlen vpostwriter.write(vreader.id(), vreader.weight(), vreader.value(), 0) vreader.next() vpostwriter.finish() self.vectorindex.add((docnum, fieldname), offset) def _close_all(self): self.is_closed = True self.termswriter.close() self.storedfields.close() if not self.lengthfile.is_closed: self.lengthfile.close() if self.vectorindex: self.vectorindex.close() if self.vpostwriter: self.vpostwriter.close() def _getsegment(self): return Segment(self.name, self.generation, self.docnum, self.pool.fieldlength_totals(), self.pool.fieldlength_maxes()) def commit(self, mergetype=None, optimize=False, merge=True): """Finishes writing and saves all additions and changes to disk. There are four possible ways to use this method:: # Merge small segments but leave large segments, trying to # balance fast commits with fast searching: writer.commit() # Merge all segments into a single segment: writer.commit(optimize=True) # Don't merge any existing segments: writer.commit(merge=False) # Use a custom merge function writer.commit(mergetype=my_merge_function) :param mergetype: a custom merge function taking a Writer object and segment list as arguments, and returning a new segment list. If you supply a ``mergetype`` function, the values of the ``optimize`` and ``merge`` arguments are ignored. :param optimize: if True, all existing segments are merged with the documents you've added to this writer (and the value of the ``merge`` argument is ignored). :param merge: if False, do not merge small segments. """ self._check_state() try: if mergetype: pass elif optimize: mergetype = OPTIMIZE elif not merge: mergetype = NO_MERGE else: mergetype = MERGE_SMALL # Call the merge policy function. The policy may choose to merge other # segments into this writer's pool new_segments = mergetype(self, self.segments) # Tell the pool we're finished adding information, it should add its # accumulated data to the lengths, terms index, and posting files. if self._added: self.pool.finish(self.termswriter, self.docnum, self.lengthfile) # Create a Segment object for the segment created by this writer and # add it to the list of remaining segments returned by the merge policy # function new_segments.append(self._getsegment()) else: self.pool.cleanup() # Close all files, write a new TOC with the new segment list, and # release the lock. self._close_all() from whoosh.filedb.fileindex import _write_toc, _clean_files _write_toc(self.storage, self.schema, self.indexname, self.generation, self.segment_number, new_segments) self.readlock.acquire(True) try: _clean_files(self.storage, self.indexname, self.generation, new_segments) finally: self.readlock.release() finally: if self.writelock: self.writelock.release() def cancel(self): self._check_state() try: self.pool.cancel() self._close_all() finally: if self.writelock: self.writelock.release()