def from_bytes(cls, s):
    st = cls._struct
    vals = st.unpack(s[:st.size])
    terminfo = cls()

    flags = vals[0]
    terminfo._weight = vals[1]
    terminfo._df = vals[2]
    terminfo._minlength = byte_to_length(vals[3])
    terminfo._maxlength = byte_to_length(vals[4])
    terminfo._maxweight = vals[5]
    terminfo._minid = None if vals[6] == 0xffffffff else vals[6]
    terminfo._maxid = None if vals[7] == 0xffffffff else vals[7]

    if flags:
        # Postings are stored inline
        terminfo._inlined = loads(s[st.size:])
    else:
        # Last bytes are pointer into posting file and length
        offpos = st.size
        lenpos = st.size + _LONG_SIZE
        terminfo._offset = unpack_long(s[offpos:lenpos])[0]
        terminfo._length = unpack_int(s[lenpos:lenpos + _INT_SIZE])
    return terminfo
def _goto(self, position):
    # Read the posting block at the given position
    postfile = self._postfile

    # Reset block data -- we'll lazy load the data from the new block as
    # needed
    self._data = None
    self._ids = None
    self._weights = None
    self._values = None
    # Reset pointer into the block
    self._i = 0

    # Seek to the start of the block
    postfile.seek(position)
    # Read the block length
    length = postfile.read_int()
    # If the block length is negative, that means this is the last block
    if length < 0:
        self._lastblock = True
        length *= -1

    # Remember the offset of the next block
    self._nextoffset = position + _INT_SIZE + length
    # Read the pickled block info tuple
    info = postfile.read_pickle()
    # Remember the offset of the block's data
    self._dataoffset = postfile.tell()

    # Decompose the info tuple to set the current block info
    (self._blocklength, self._maxid, self._maxweight, self._compression,
     mnlen, mxlen) = info
    self._minlength = byte_to_length(mnlen)
    self._maxlength = byte_to_length(mxlen)
def _minmax_length(self, fieldname, op, cache):
    if fieldname in cache:
        return cache[fieldname]

    lenfield = _lenfield(fieldname)
    reader = self._cached_reader(lenfield, LENGTHS_COLUMN)
    length = byte_to_length(op(reader))
    cache[fieldname] = length
    return length
def doc_field_length(self, docnum, fieldname, default=0):
    if docnum > self._doccount:
        raise IndexError("Asked for docnum %r of %d"
                         % (docnum, self._doccount))

    lenfield = _lenfield(fieldname)
    reader = self._cached_reader(lenfield, LENGTHS_COLUMN)
    if reader is None:
        return default

    lbyte = reader[docnum]
    if lbyte:
        return byte_to_length(lbyte)
    # No stored length byte for this document: fall back to the default
    return default
def test_many_lengths():
    domain = u("alfa bravo charlie delta echo").split()
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for i, word in enumerate(domain):
        length = (i + 1) ** 6
        w.add_document(text=" ".join(word for _ in xrange(length)))
    w.commit()

    s = ix.searcher()
    for i, word in enumerate(domain):
        target = byte_to_length(length_to_byte((i + 1) ** 6))
        ti = s.term_info("text", word)
        assert ti.min_length() == target
        assert ti.max_length() == target
def test_lengths():
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        items = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            w.add_document(f2=u(" ").join(islice(cycle(items), length)))
        w.commit()

        with ix.reader() as dr:
            ls1 = [dr.doc_field_length(i, "f1")
                   for i in xrange(0, len(lengths))]
            assert ls1 == [0] * len(lengths)
            ls2 = [dr.doc_field_length(i, "f2")
                   for i in xrange(0, len(lengths))]
            assert ls2 == [byte_to_length(length_to_byte(l))
                           for l in lengths]
def read_min_and_max_length(cls, dbfile, datapos):
    lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE
    ml = byte_to_length(dbfile.get_byte(lenpos))
    xl = byte_to_length(dbfile.get_byte(lenpos + 1))
    return ml, xl
def _byten(n):
    return byte_to_length(length_to_byte(n))
def test_length_byte():
    source = list(range(11))
    xform = [length_to_byte(n) for n in source]
    result = [byte_to_length(n) for n in xform]
    assert source == result
def _discreet(length):
    return byte_to_length(length_to_byte(length))
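# The helpers above (_byten, _discreet) and test_length_byte all rely on the
# same property: length_to_byte() quantizes a field length into a single byte
# and byte_to_length() maps that byte back to an approximate length, so a
# round trip returns the nearest representable length (exactly for small
# values such as 0..10, per test_length_byte). A minimal usage sketch follows;
# the import path is an assumption about where Whoosh keeps these helpers, and
# demo_roundtrip is a hypothetical name used only for illustration.
from whoosh.util.numeric import byte_to_length, length_to_byte


def demo_roundtrip():
    # Small lengths survive the round trip exactly; larger ones are snapped
    # to the nearest length representable in one byte.
    for n in (1, 10, 420, 10 ** 6):
        print(n, "->", byte_to_length(length_to_byte(n)))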