def test_stored_fields():
    """Round-trip stored field values through the default codec's
    per-document writer/reader, including an empty document and
    out-of-order access."""
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()

        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()

        # Document with no stored fields at all
        dw.start_doc(2)
        dw.finish_doc()

        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()

        dw.close()
        seg.set_doc_count(4)

        pdr = codec.per_document_reader(st, seg)
        assert pdr.doc_count_all() == 4
        assert pdr.stored_fields(0) == {"a": "hello", "b": "there"}
        # Note: access out of order
        # BUG FIX: the original `assert x, y` asserted a (always-truthy)
        # tuple, so the expected value was never actually compared.
        assert pdr.stored_fields(3) == {"a": "alfa", "b": "bravo"}
        assert pdr.stored_fields(1) == {"a": "one", "b": "two", "c": "three"}

        sfs = list(pdr.all_stored_fields())
        assert len(sfs) == 4
        assert sfs == [
            {"a": "hello", "b": "there"},
            {"a": "one", "b": "two", "c": "three"},
            {},
            {"a": "alfa", "b": "bravo"},
        ]
        pdr.close()
def test_stored_fields():
    # Round-trip stored field values through the default codec's
    # per-document writer and stored-fields reader, covering random
    # access (including out of order), an empty document, and
    # sequential iteration over all documents.
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        # Write four documents; doc 2 has no stored fields.
        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()
        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()
        dw.start_doc(2)
        dw.finish_doc()
        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()
        dw.close()

        # Random access via __getitem__ on the stored-fields reader.
        dr = codec.stored_fields_reader(st, seg)
        assert_equal(dr[0], {"a": "hello", "b": "there"})
        # Note: access out of order
        assert_equal(dr[3], {"a": "alfa", "b": "bravo"})
        assert_equal(dr[1], {"a": "one", "b": "two", "c": "three"})
        dr.close()

        # Re-open and iterate all documents in order; the empty doc
        # must come back as an empty dict in position 2.
        dr = codec.stored_fields_reader(st, seg)
        sfs = list(dr)
        assert_equal(sfs, [
            {"a": "hello", "b": "there"},
            {"a": "one", "b": "two", "c": "three"},
            {},
            {"a": "alfa", "b": "bravo"},
        ])
        dr.close()
def __init__(self, ix, poolclass=None, timeout=0.0, delay=0.1, _lk=True,
             limitmb=128, docbase=0, codec=None, compound=True, **kwargs):
    """Set up a segment writer on index ``ix``.

    :param ix: the index to write to.
    :param timeout: how long (in seconds) to keep retrying the write
        lock before raising ``LockError``.
    :param delay: pause (in seconds) between lock-acquisition retries.
    :param _lk: if False, skip taking the write lock entirely.
    :param limitmb: memory limit (MB) passed to the posting pool.
    :param docbase: starting document number for this writer.
    :param codec: codec to use for the new segment; defaults to
        ``whoosh.codec.default_codec()``.
    :param compound: whether to assemble the segment into a compound
        file (only if the new segment says it should be assembled).
    """
    # Lock the index
    self.writelock = None
    if _lk:
        # Retry acquiring the lock for up to `timeout` seconds,
        # sleeping `delay` between attempts.
        self.writelock = ix.lock("WRITELOCK")
        if not try_for(
            self.writelock.acquire, timeout=timeout, delay=delay):
            raise LockError

    if codec is None:
        from whoosh.codec import default_codec
        codec = default_codec()
    self.codec = codec

    # Get info from the index (read the TOC after taking the lock so
    # we see the latest generation/schema/segments)
    self.storage = ix.storage
    self.indexname = ix.indexname
    info = ix._read_toc()
    self.generation = info.generation + 1
    self.schema = info.schema
    self.segments = info.segments
    self.docnum = self.docbase = docbase
    self._setup_doc_offsets()

    # Internals
    self._tempstorage = self.storage.temp_storage("%s.tmp" % self.indexname)
    newsegment = codec.new_segment(self.storage, self.indexname)
    self.newsegment = newsegment
    # Only assemble a compound file if both the caller asked for it
    # and the segment type supports/wants assembly.
    self.compound = compound and newsegment.should_assemble()
    self.is_closed = False
    self._added = False
    self.pool = PostingPool(self._tempstorage, self.newsegment,
                            limitmb=limitmb)

    # Set up writers
    self.perdocwriter = codec.per_document_writer(self.storage, newsegment)
    self.fieldwriter = codec.field_writer(self.storage, newsegment)

    self.merge = True
    self.optimize = False
    self.mergetype = None
def test_stored_fields():
    """Verify stored-field round-tripping via the per-document
    writer/reader API, including an empty document, out-of-order
    reads, and full iteration."""
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()

        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()

        # Document with no stored fields
        dw.start_doc(2)
        dw.finish_doc()

        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()

        dw.close()
        seg.set_doc_count(4)

        pdr = codec.per_document_reader(st, seg)
        assert pdr.doc_count_all() == 4
        assert pdr.stored_fields(0) == {"a": "hello", "b": "there"}
        # Note: access out of order
        # BUG FIX: the original `assert x, {...}` asserted a two-element
        # tuple (always truthy); the comparison never happened.
        assert pdr.stored_fields(3) == {"a": "alfa", "b": "bravo"}
        assert pdr.stored_fields(1) == {"a": "one", "b": "two", "c": "three"}

        sfs = list(pdr.all_stored_fields())
        assert len(sfs) == 4
        assert sfs == [{"a": "hello", "b": "there"},
                       {"a": "one", "b": "two", "c": "three"},
                       {},
                       {"a": "alfa", "b": "bravo"},
                       ]
        pdr.close()
def __init__(self, ix, poolclass=None, timeout=0.0, delay=0.1, _lk=True,
             limitmb=128, docbase=0, codec=None, compound=True, **kwargs):
    """Set up a segment writer on index ``ix``.

    :param ix: the index to write to.
    :param timeout: how long (in seconds) to keep retrying the write
        lock before raising ``LockError``.
    :param delay: pause (in seconds) between lock-acquisition retries.
    :param _lk: if False, skip taking the write lock entirely.
    :param limitmb: memory limit (MB) passed to the posting pool.
    :param docbase: starting document number for this writer.
    :param codec: codec to use for the new segment; defaults to
        ``whoosh.codec.default_codec()``.
    :param compound: whether to assemble the segment into a compound
        file (only if the new segment says it should be assembled).
    """
    # Lock the index
    self.writelock = None
    if _lk:
        # Retry acquiring the lock for up to `timeout` seconds,
        # sleeping `delay` between attempts.
        self.writelock = ix.lock("WRITELOCK")
        if not try_for(self.writelock.acquire, timeout=timeout,
                       delay=delay):
            raise LockError

    if codec is None:
        from whoosh.codec import default_codec
        codec = default_codec()
    self.codec = codec

    # Get info from the index (read the TOC after taking the lock so
    # we see the latest generation/schema/segments)
    self.storage = ix.storage
    self.indexname = ix.indexname
    info = ix._read_toc()
    self.generation = info.generation + 1
    self.schema = info.schema
    self.segments = info.segments
    self.docnum = self.docbase = docbase
    self._setup_doc_offsets()

    # Internals
    self._tempstorage = self.storage.temp_storage("%s.tmp" % self.indexname)
    newsegment = codec.new_segment(self.storage, self.indexname)
    self.newsegment = newsegment
    # Only assemble a compound file if both the caller asked for it
    # and the segment type supports/wants assembly.
    self.compound = compound and newsegment.should_assemble()
    self.is_closed = False
    self._added = False
    self.pool = PostingPool(self._tempstorage, self.newsegment,
                            limitmb=limitmb)

    # Set up writers
    self.perdocwriter = codec.per_document_writer(self.storage, newsegment)
    self.fieldwriter = codec.field_writer(self.storage, newsegment)

    self.merge = True
    self.optimize = False
    self.mergetype = None
def __init__(self, storage, schema, segment, generation=None, codec=None):
    """Set up a reader over a single on-disk segment.

    :param storage: the storage object the segment lives in.
    :param schema: the index schema.
    :param segment: the segment object to read.
    :param generation: the index generation this reader corresponds to
        (may be None).
    :param codec: codec used to open the segment's sub-readers;
        defaults to ``whoosh.codec.default_codec()``.
    """
    self.storage = storage
    self.schema = schema
    self.segment = segment
    self._gen = generation
    self.is_closed = False

    # Copy info from underlying segment (snapshot these counts so we
    # don't re-query the segment on every call)
    self._has_deletions = segment.has_deletions()
    self._dc = segment.doc_count()
    self._dc_all = segment.doc_count_all()

    # Not every segment implementation exposes segment_id(); fall back
    # to generating a random ID.
    if hasattr(self.segment, "segment_id"):
        self.segid = self.segment.segment_id()
    else:
        from whoosh.codec.base import Segment
        self.segid = Segment._random_id()

    # self.files is a storage object from which to load the segment files.
    # This is different from the general storage (which will be used for
    # caches) if the segment is in a compound file.
    if segment.is_compound():
        # Use an overlay here instead of just the compound storage because
        # in rare circumstances a segment file may be added after the
        # segment is written
        self.files = OverlayStorage(segment.open_compound_file(storage),
                                    self.storage)
    else:
        self.files = storage

    # Get microreaders from codec
    if codec is None:
        from whoosh.codec import default_codec
        codec = default_codec()
    self._codec = codec
    self._terms = codec.terms_reader(self.files, self.segment)
    self._lengths = codec.lengths_reader(self.files, self.segment)
    self._stored = codec.stored_fields_reader(self.files, self.segment)
    self._vectors = None  # Lazy open with self._open_vectors()
    self._graph = None  # Lazy open with self._open_dawg()

    self.set_caching_policy()
def _roundtrip(content, format_, astype, ana=None):
    """Index ``content`` into a fresh segment and read the terms back.

    Returns a list of ``(term_text, value)`` pairs, where each value is
    decoded from the matcher via ``value_as(astype)``.
    """
    with TempStorage("roundtrip") as st:
        codec = default_codec()
        segment = codec.new_segment(st, "")
        analyzer = ana or analysis.StandardAnalyzer()
        fieldobj = fields.FieldType(format=format_, analyzer=analyzer)

        # Write every indexed term (in sorted order) as a posting for doc 0
        writer = codec.field_writer(st, segment)
        writer.start_field("f1", fieldobj)
        for termbytes, _, weight, valuestring in sorted(fieldobj.index(content)):
            writer.start_term(termbytes)
            writer.add(0, weight, valuestring, None)
            writer.finish_term()
        writer.finish_field()
        writer.close()

        # Read the terms back and decode each posting value
        reader = codec.terms_reader(st, segment)
        pairs = [
            (fieldobj.from_bytes(btext),
             reader.matcher(fname, btext, format_).value_as(astype))
            for fname, btext in reader.terms()
        ]
        reader.close()
        return pairs
def _make_codec(**kwargs):
    """Create a RAM-backed storage, a default codec built with
    ``kwargs``, and a new segment named "test"; return all three."""
    storage = RamStorage()
    cdc = default_codec(**kwargs)
    segment = cdc.new_segment(storage, "test")
    return storage, cdc, segment