def test_block():
    """Round-trip a small posting block through a RAM file.

    Builds a four-posting block, checks its in-memory statistics and the
    statistics accumulated by a ``FileTermInfo``, writes the block with
    compression level 3, then reads it back and checks the same values.
    """
    # (doc id, weight, value string, field length) for each posting.
    postings = [
        (0, 1.0, '', 1),
        (1, 2.0, '', 2),
        (2, 12.0, '', 6),
        (5, 6.5, '', 420),
    ]
    expected_ids = [entry[0] for entry in postings]
    expected_weights = [entry[1] for entry in postings]

    storage = RamStorage()
    postfile = storage.create_file("postfile")

    b = current(postfile, 0)
    for docid, weight, valuestring, length in postings:
        b.append(docid, weight, valuestring, length)

    # --- In-memory block ---------------------------------------------------
    assert b
    assert_equal(len(b), 4)
    assert_equal(list(b.ids), expected_ids)
    assert_equal(list(b.weights), expected_weights)
    assert_equal(b.values, None)
    assert_equal(b.min_length(), 1)
    # The expected maximum is the length after a byte encode/decode round
    # trip, not the raw 420 that was appended.
    assert_equal(b.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(b.max_weight(), 12.0)
    assert_equal(b.max_wol(), 2.0)

    # --- Term info accumulated from the block ------------------------------
    terminfo = FileTermInfo()
    terminfo.add_block(b)
    assert_equal(terminfo.weight(), 21.5)
    assert_equal(terminfo.doc_frequency(), 4)
    assert_equal(terminfo.min_length(), 1)
    assert_equal(terminfo.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(terminfo.max_weight(), 12.0)
    assert_equal(terminfo.max_wol(), 2.0)

    # --- Write, reopen and read back ---------------------------------------
    b.write(compression=3)
    postfile.close()

    postfile = storage.open_file("postfile")
    bb = current.from_file(postfile, 0)

    bb.read_ids()
    assert_equal(list(bb.ids), expected_ids)

    bb.read_weights()
    assert_equal(list(bb.weights), expected_weights)

    bb.read_values()
    # NOTE(review): this checks the original block ``b``, not the re-read
    # block ``bb`` -- possibly a typo; confirm what read_values() stores
    # before changing it.
    assert_equal(b.values, None)

    assert_equal(bb.min_length(), 1)
    assert_equal(bb.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(bb.max_weight(), 12.0)
    assert_equal(bb.max_wol(), 2.0)
class FilePostingWriter(PostingWriter):
    """Writes the postings of one term at a time to a posting file.

    Postings are buffered in a block object (``blockclass``) and flushed to
    the file whenever the block reaches ``blocklimit`` postings. The
    expected call sequence for each term is ``start(format)``, any number of
    ``write(...)`` calls, then ``finish()``, which returns the accumulated
    term info. ``close()`` closes the underlying file.
    """

    blockclass = postblocks.current

    def __init__(self, postfile, stringids=False, blocklimit=128,
                 compression=3):
        """
        :param postfile: the file object to write posting blocks to.
        :param stringids: passed through to the block class constructor.
        :param blocklimit: number of postings at which the current block is
            flushed to the file; must be between 1 and 255 inclusive.
        :param compression: passed as ``compression`` to the block's
            ``write()`` method when a block is flushed.
        :raises ValueError: if ``blocklimit`` is outside 1..255.
        """
        self.postfile = postfile
        self.stringids = stringids

        # NOTE(review): the 255 ceiling presumably exists because the block
        # format stores its posting count in one byte -- confirm against
        # the block class.
        if blocklimit > 255:
            raise ValueError("blocklimit argument must be <= 255")
        elif blocklimit < 1:
            raise ValueError("blocklimit argument must be > 0")
        self.blocklimit = blocklimit
        self.compression = compression
        # None means "not between start() and finish()".
        self.block = None

    def _reset_block(self):
        """Replace the current block with a fresh, empty one."""
        self.block = self.blockclass(self.postfile, self.format.posting_size,
                                     stringids=self.stringids)

    def start(self, format):
        """Begin a new list of posting blocks and return its start offset.

        Writes the block class's magic number followed by a placeholder
        block count, which ``finish()`` overwrites when blocks were written.

        :param format: object whose ``posting_size`` is given to new blocks.
        :raises Exception: if a previous ``start()`` has not been finished.
        """
        if self.block is not None:
            raise Exception("Called start() in a block")

        self.format = format
        self.blockcount = 0
        self.startoffset = self.postfile.tell()
        self.terminfo = FileTermInfo()

        # Magic number
        self.postfile.write_int(self.blockclass.magic)
        # Placeholder for block count
        self.postfile.write_uint(0)

        self._reset_block()
        return self.startoffset

    def write(self, id, weight, valuestring, dfl):
        """Append one posting, flushing the block if it is now full.

        The four arguments are passed unchanged to the block's ``append()``.

        :raises Exception: if called outside a ``start()``/``finish()`` pair.
        """
        # Fail with a clear state error rather than an AttributeError on
        # None, matching the checks in start() and finish().
        if self.block is None:
            raise Exception("Called write() when not in a block")

        self.block.append(id, weight, valuestring, dfl)
        if len(self.block) >= self.blocklimit:
            self._write_block()

    def finish(self, inlinelimit=1):
        """Finish the current posting list and return its term info.

        If no block has been flushed yet and the pending block is non-empty
        with at most ``inlinelimit`` postings, the postings are not written
        as a block; instead ``terminfo.postings`` is set to a tuple
        ``(ids, weights, values)`` (``values`` is None when the block has no
        values). Otherwise any pending postings are flushed, the block count
        placeholder written by ``start()`` is overwritten with the real
        count, and ``terminfo.postings`` is set to the list's start offset.

        :param inlinelimit: maximum number of postings to return inline.
        :raises TypeError: if ``inlinelimit`` is not an integer.
        :raises Exception: if called when ``start()`` has not been called.
        """
        # Explicit check instead of ``assert`` so the validation is not
        # stripped when Python runs with -O.
        if not isinstance(inlinelimit, integer_types):
            raise TypeError("inlinelimit must be an integer, not %r"
                            % (inlinelimit,))
        if self.block is None:
            raise Exception("Called finish() when not in a block")

        block = self.block
        terminfo = self.terminfo

        if self.blockcount < 1 and block and len(block) <= inlinelimit:
            # Small enough to inline: hand the postings back directly.
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            if block:
                self._write_block()

            # Seek back to the start of this list of posting blocks and
            # write the number of blocks
            pf = self.postfile
            pf.flush()
            offset = pf.tell()
            # Skip over the magic number to reach the placeholder.
            pf.seek(self.startoffset + _INT_SIZE)
            pf.write_uint(self.blockcount)
            pf.seek(offset)
            postings = self.startoffset

        self.block = None

        terminfo.postings = postings
        return terminfo

    def close(self):
        """Close the posting file.

        :raises Exception: if the current block is truthy, i.e. postings
            were written without a matching ``finish()``.
        """
        # NOTE(review): this tests truthiness rather than ``is not None``,
        # so a started block that is falsy (presumably an empty one) does
        # not raise here -- confirm whether that leniency is intended.
        if self.block:
            raise Exception("Closed posting writer without finishing")
        self.postfile.close()

    def block_stats(self):
        """Return ``stats()`` of the current block.

        Only meaningful between ``start()`` and ``finish()``, while a block
        exists.
        """
        return self.block.stats()

    def _write_block(self):
        """Flush the current block to the file and start a new one."""
        self.block.write(compression=self.compression)
        # Fold the flushed block's statistics into the term info.
        self.terminfo.add_block(self.block)
        self._reset_block()
        self.blockcount += 1