def to_bytes(self):
    isinlined = self.is_inlined()

    # Encode the lengths as 0-255 values
    minlength = 0 if self._minlength is None else length_to_byte(self._minlength)
    maxlength = length_to_byte(self._maxlength)
    # Convert None values to the out-of-band NO_ID constant so they can be
    # stored as unsigned ints
    minid = 0xFFFFFFFF if self._minid is None else self._minid
    maxid = 0xFFFFFFFF if self._maxid is None else self._maxid

    # Pack the term info into bytes
    st = self._struct.pack(isinlined, self._weight, self._df, minlength,
                           maxlength, self._maxweight, minid, maxid)

    if isinlined:
        # Postings are inlined - dump them using the pickle protocol
        postbytes = dumps(self._inlined, 2)
    else:
        postbytes = pack_long(self._offset) + pack_int(self._length)
    st += postbytes
    return st
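
# A minimal sketch of a struct layout consistent with the eight values packed
# above. self._struct itself is not shown in this snippet, so the format
# string below is an assumption for illustration, not the library's
# definition: a flag byte, total weight, doc frequency, min/max length bytes,
# max weight, and min/max doc IDs.
import struct

TERMINFO_STRUCT = struct.Struct("!BfIBBfII")  # hypothetical layout

packed = TERMINFO_STRUCT.pack(False, 1.5, 3, 0, 255, 1.0, 0, 0xFFFFFFFF)
assert TERMINFO_STRUCT.unpack(packed) == (0, 1.5, 3, 0, 255, 1.0, 0, 0xFFFFFFFF)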
def _write_block(self, last=False):
    # Write the buffered block to the postings file

    # If this is the first block, write a small header first
    if not self._blockcount:
        self._postfile.write(WHOOSH3_HEADER_MAGIC)

    # Add this block's statistics to the terminfo object, which tracks the
    # overall statistics for all term postings
    self._terminfo.add_block(self)

    # Minify the IDs, weights, and values, and put them in a tuple
    data = (self._mini_ids(), self._mini_weights(), self._mini_values())
    # Pickle the tuple
    databytes = dumps(data)
    # If the pickle is less than 20 bytes, don't bother compressing
    if len(databytes) < 20:
        comp = 0
    else:
        # Compress the pickle (if self._compression > 0)
        comp = self._compression
    if comp:
        databytes = zlib.compress(databytes, comp)

    # Make a tuple of block info. The posting reader can check this info
    # and decide whether to skip the block without having to decompress the
    # full block data
    #
    # - Number of postings in block
    # - Last ID in block
    # - Maximum weight in block
    # - Compression level
    # - Minimum length byte
    # - Maximum length byte
    ids = self._ids
    infobytes = dumps((len(ids), ids[-1], self._maxweight, comp,
                       length_to_byte(self._minlength),
                       length_to_byte(self._maxlength)))

    # Write block length
    postfile = self._postfile
    blocklength = len(infobytes) + len(databytes)
    if last:
        # If this is the last block, use a negative number
        blocklength *= -1
    postfile.write_int(blocklength)
    # Write block info
    postfile.write(infobytes)
    # Write block data
    postfile.write(databytes)

    self._blockcount += 1
    # Reset block buffer
    self._new_block()
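
# Hypothetical reader-side counterpart (a sketch only; read_int() is assumed
# to mirror write_int() above). Because the final block is written with a
# negated length, the reader recovers both the block size and the
# "last block" flag from a single stored int:
def _read_block_header(postfile):
    blocklength = postfile.read_int()
    return abs(blocklength), blocklength < 0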
def add_field(self, fieldname, fieldobj, value, length):
    if value is not None:
        self._storedfields[fieldname] = value
    if length:
        # Add byte to length column
        lenfield = _lenfield(fieldname)
        lb = length_to_byte(length)
        self.add_column_value(lenfield, LENGTHS_COLUMN, lb)
        # Add length to total field length
        self._fieldlengths[fieldname] += length
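
# _lenfield() is referenced above but not shown in this snippet. A minimal
# sketch of such a helper, assuming the per-field length column is named by
# decorating the field name (the exact naming scheme is an assumption):
def _lenfield(fieldname):
    return "_%s_len" % fieldname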
def test_many_lengths():
    domain = u("alfa bravo charlie delta echo").split()
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for i, word in enumerate(domain):
        length = (i + 1) ** 6
        w.add_document(text=" ".join(word for _ in xrange(length)))
    w.commit()

    s = ix.searcher()
    for i, word in enumerate(domain):
        target = byte_to_length(length_to_byte((i + 1) ** 6))
        ti = s.term_info("text", word)
        assert ti.min_length() == target
        assert ti.max_length() == target
def test_lengths():
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        items = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            w.add_document(f2=u(" ").join(islice(cycle(items), length)))
        w.commit()

        with ix.reader() as dr:
            ls1 = [dr.doc_field_length(i, "f1")
                   for i in xrange(0, len(lengths))]
            assert ls1 == [0] * len(lengths)
            ls2 = [dr.doc_field_length(i, "f2")
                   for i in xrange(0, len(lengths))]
            assert ls2 == [byte_to_length(length_to_byte(l)) for l in lengths]
def test_length_byte():
    source = list(range(11))
    xform = [length_to_byte(n) for n in source]
    result = [byte_to_length(n) for n in xform]
    assert source == result
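
# length_to_byte()/byte_to_length() themselves are not shown in these
# snippets. A minimal sketch of the idea they implement - lossy logarithmic
# quantization of a field length onto a single 0-255 byte - using a log base
# chosen purely for illustration (1.03 here; the real encoding may differ):
import math

def length_to_byte_sketch(length):
    # None means "no length recorded"; everything else lands on 0-255
    if length is None:
        return 0
    return min(255, int(round(math.log(length + 1, 1.03))))

def byte_to_length_sketch(b):
    # Invert the quantization; large lengths come back only approximately
    return int(round(1.03 ** b)) - 1

# Small lengths survive the round trip exactly, which is what
# test_length_byte() above verifies for 0..10:
assert all(byte_to_length_sketch(length_to_byte_sketch(n)) == n
           for n in range(11))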
def _byten(n):
    # Round-trip a length through the lossy byte encoding to get the
    # quantized value the index will actually store
    return byte_to_length(length_to_byte(n))
def _discreet(length):
    # Same round-trip helper as _byten() above, under a different name
    return byte_to_length(length_to_byte(length))