def __init__(self, ix, postlimit, blocklimit, name=None):
    """Set up the on-disk structures for writing a new segment.

    :param ix: the Index object in which to write the new segment.
    :param postlimit: the maximum size for a run in the posting pool.
    :param blocklimit: the maximum number of postings in a posting block.
    :param name: the name of the segment.
    """
    self.index = ix
    self.schema = ix.schema
    # Bind to a local too: `storage` is reused several times below.
    self.storage = storage = ix.storage
    # If no explicit name was given, ask the index for the next one.
    self.name = name or ix._next_segment_name()
    self.max_doc = 0
    self.pool = postpool.PostingPool(postlimit)
    # Create mappings of field numbers to the position of that field in the
    # lists of scorable and stored fields. For example, consider a schema
    # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
    # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
    # would then map B -> 0, D -> 1, and E -> 2.
    self._scorable_to_pos = dict(
        (fnum, i) for i, fnum in enumerate(self.schema.scorable_fields()))
    self._stored_to_pos = dict(
        (fnum, i) for i, fnum in enumerate(self.schema.stored_fields()))
    # Create a temporary segment object just so we can access its
    # *_filename attributes (so if we want to change the naming convention,
    # we only have to do it in one place).
    tempseg = Segment(self.name, 0, 0, None)
    self.termtable = create_terms(storage, tempseg)
    self.docslist = create_storedfields(storage, tempseg)
    # Only create a document-lengths table if something is scorable.
    self.doclengths = None
    if self.schema.scorable_fields():
        self.doclengths = create_doclengths(storage, tempseg,
                                            len(self._scorable_to_pos))
    postfile = storage.create_file(tempseg.posts_filename)
    self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)
    self.vectortable = None
    if self.schema.has_vectored_fields():
        # Table associating document fields with (postoffset, postcount)
        self.vectortable = create_vectors(storage, tempseg)
        vpostfile = storage.create_file(tempseg.vectorposts_filename)
        # NOTE(review): vector postings use string ids — confirm against
        # the FilePostingWriter implementation.
        self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)
    # Keep track of the total number of tokens (across all docs)
    # in each field
    self.field_length_totals = defaultdict(int)
def test_huge_postfile():
    """Posting reads/writes must still work at offsets beyond 4 GB
    (guards against 32-bit offset truncation)."""
    with TempStorage("hugeindex") as st:
        postfile = st.create_file("test.pst")

        # Seek 5 GB into the (sparse) file and write a marker there.
        big = 5 * 1024 * 1024 * 1024
        postfile.seek(big)
        postfile.write("\x00\x00\x00\x00")
        assert_equal(postfile.tell(), big + 4)

        writer = FilePostingWriter(postfile)
        fmt = formats.Frequency(None)
        offset = writer.start(fmt)
        for n in xrange(10):
            writer.write(n, float(n), struct.pack("!I", n), 10)
        assert_equal(writer.finish(), 10)
        writer.close()

        # Read the postings back from the huge offset and verify each one.
        postfile = st.open_file("test.pst")
        reader = FilePostingReader(postfile, offset, fmt)
        expected = 0
        while reader.is_active():
            assert_equal(reader.id(), expected)
            assert_equal(reader.weight(), float(expected))
            assert_equal(reader.value(), struct.pack("!I", expected))
            reader.next()
            expected += 1
        postfile.close()
def roundtrip(self, postings, format, astype):
    """Write `postings` through a FilePostingWriter, read them back with
    a FilePostingReader as `astype`, and return the decoded list.
    The temp file is always deleted, even on failure."""
    postfile = self.make_file(astype)
    readback = None
    try:
        writer = FilePostingWriter(postfile, blocklimit=8)
        writer.start(format)
        for docnum, data in postings:
            writer.write(docnum, format.encode(data))
        writer.close()

        postfile = self.open_file(astype)
        reader = FilePostingReader(postfile, 0, format)
        readback = list(reader.all_as(astype))
        reader.close()
    finally:
        self.delete_file(astype)
    return readback
def roundtrip(postings, format, astype):
    """Write `postings` to a temp storage, read them back as `astype`,
    and return the decoded list."""
    with TempStorage("roundtrip") as st:
        postfile = st.create_file(astype)
        # Decoder that extracts the weight from an encoded value.
        getweight = format.decoder("weight")

        writer = FilePostingWriter(postfile, blocklimit=8)
        writer.start(format)
        for docnum, data in postings:
            encoded = format.encode(data)
            writer.write(docnum, getweight(encoded), encoded, 0)
        writer.finish()
        writer.close()

        postfile = st.open_file(astype)
        reader = FilePostingReader(postfile, 0, format)
        readback = list(reader.items_as(astype))
        postfile.close()
        return readback
def test_readwrite():
    """Postings written with Frequency format must read back unchanged."""
    with TempStorage("readwrite") as st:
        format = Frequency()
        postings = make_postings()

        postfile = st.create_file("readwrite")
        writer = FilePostingWriter(postfile, blocklimit=8)
        writer.start(format)
        for docnum, freq in postings:
            writer.write(docnum, float(freq), format.encode(freq), 0)
        writer.finish()
        writer.close()

        postfile = st.open_file("readwrite")
        reader = FilePostingReader(postfile, 0, format)
        assert_equal(postings, list(reader.items_as("frequency")))
        postfile.close()
def test_readwrite(self):
    """Round-trip Frequency postings through the posting file; the temp
    file is cleaned up even if the write or read fails."""
    format = Frequency(None)
    postings = self.make_postings()
    postfile = self.make_file("readwrite")
    try:
        writer = FilePostingWriter(postfile, blocklimit=8)
        writer.start(format)
        for docnum, freq in postings:
            writer.write(docnum, format.encode(freq))
        writer.close()

        postfile = self.open_file("readwrite")
        reader = FilePostingReader(postfile, 0, format)
        #self.assertEqual(postings, list(fpr.items_as("frequency")))
        reader.close()
    finally:
        self.delete_file("readwrite")
def test_lowlevel_block_writing():
    """Writing 8 postings through a writer with blocklimit=4 must yield
    correct aggregate term statistics."""
    st = RamStorage()
    f = st.create_file("postfile")
    fpw = FilePostingWriter(f, blocklimit=4)
    fmt = formats.Frequency()
    fpw.start(fmt)

    # (docnum, weight, field length) — same sequence as before, now
    # table-driven instead of eight literal write() calls.
    data = [(0, 1.0, 1), (1, 2.0, 2), (2, 12.0, 6), (5, 6.5, 420),
            (11, 1.5, 1), (12, 2.5, 2), (26, 100.5, 21), (50, 8.0, 1020)]
    for docnum, weight, length in data:
        fpw.write(docnum, weight, fmt.encode(weight), length)
    ti = fpw.finish()

    assert_equal(ti.weight(), 134.0)
    assert_equal(ti.doc_frequency(), 8)
    # Lengths are stored byte-compressed, so round-trip the expectations
    # through the same encoding.
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(1020)))
    assert_equal(ti.max_weight(), 100.5)
    assert_equal(ti.max_wol(), 100.5 / byte_to_length(length_to_byte(21)))
def __init__(self, ix, poolclass=None, procs=0, blocklimit=128,
             timeout=0.0, delay=0.1, name=None, _l=True, **poolargs):
    """Set up a writer for a new segment of the given index.

    :param ix: the Index object to write into.
    :param poolclass: class to use for the posting pool; defaults to
        MultiPool when procs > 1, else TempfilePool.
    :param procs: number of processes for a multiprocessing pool.
    :param blocklimit: maximum number of postings per posting block.
    :param timeout: how long to try acquiring the write lock.
    :param delay: delay between write-lock acquisition attempts.
    :param name: segment name; derived from the index name if omitted.
    :param _l: if false, skip acquiring the write lock (internal use).
    :param poolargs: extra keyword arguments passed to the pool class.
    :raises LockError: if the write lock cannot be acquired in time.
    """
    self.writelock = None
    if _l:
        # Take the index-wide write lock before touching anything.
        self.writelock = ix.lock("WRITELOCK")
        if not try_for(self.writelock.acquire, timeout=timeout,
                       delay=delay):
            raise LockError
    self.readlock = ix.lock("READLOCK")

    # Snapshot the current table of contents.
    info = ix._read_toc()
    self.schema = info.schema
    self.segments = info.segments
    self.storage = ix.storage
    self.indexname = ix.indexname
    self.is_closed = False
    self.blocklimit = blocklimit
    # This writer produces the next segment/generation after the TOC's.
    self.segment_number = info.segment_counter + 1
    self.generation = info.generation + 1

    # Cumulative doc-count offsets: _doc_offsets[i] is the first global
    # document number belonging to segment i.
    self._doc_offsets = []
    base = 0
    for s in self.segments:
        self._doc_offsets.append(base)
        base += s.doc_count_all()

    self.name = name or Segment.basename(self.indexname,
                                         self.segment_number)
    self.docnum = 0
    self.fieldlength_totals = defaultdict(int)
    self._added = False
    self._unique_cache = {}

    # Create a temporary segment to use its .*_filename attributes
    segment = Segment(self.name, self.generation, 0, None, None)

    # Terms index
    tf = self.storage.create_file(segment.termsindex_filename)
    ti = TermIndexWriter(tf)
    # Term postings file
    pf = self.storage.create_file(segment.termposts_filename)
    pw = FilePostingWriter(pf, blocklimit=blocklimit)
    # Terms writer
    self.termswriter = TermsWriter(self.schema, ti, pw)

    if self.schema.has_vectored_fields():
        # Vector index
        vf = self.storage.create_file(segment.vectorindex_filename)
        self.vectorindex = TermVectorWriter(vf)
        # Vector posting file (vector postings use string ids)
        vpf = self.storage.create_file(segment.vectorposts_filename)
        self.vpostwriter = FilePostingWriter(vpf, stringids=True)
    else:
        self.vectorindex = None
        self.vpostwriter = None

    # Stored fields file
    sf = self.storage.create_file(segment.storedfields_filename)
    self.storedfields = StoredFieldWriter(sf, self.schema.stored_names())

    # Field lengths file
    self.lengthfile = self.storage.create_file(
        segment.fieldlengths_filename)

    # Create the pool
    if poolclass is None:
        if procs > 1:
            # Imported lazily so single-process use doesn't pull in
            # multiprocessing machinery.
            from whoosh.filedb.multiproc import MultiPool
            poolclass = MultiPool
        else:
            poolclass = TempfilePool
    self.pool = poolclass(self.schema, procs=procs, **poolargs)