def _read_header(self, dbfile, doccount):
    first = dbfile.read(4)  # Magic
    assert first == self.magic
    version = dbfile.read_int()  # Version number
    assert version == 1

    dc = dbfile.read_uint()  # Number of documents saved
    if doccount is None:
        doccount = dc
    assert dc == doccount, "read=%s argument=%s" % (dc, doccount)
    self._count = doccount

    fieldcount = dbfile.read_ushort()  # Number of fields
    # Read per-field info
    for i in xrange(fieldcount):
        fieldname = dbfile.read_string().decode('utf-8')
        self.totals[fieldname] = dbfile.read_long()
        self.minlens[fieldname] = byte_to_length(dbfile.read_byte())
        self.maxlens[fieldname] = byte_to_length(dbfile.read_byte())
        self.starts[fieldname] = i * doccount

    # Add header length to per-field offsets
    eoh = dbfile.tell()  # End of header
    for fieldname in self.starts:
        self.starts[fieldname] += eoh
def from_file(file, stringids=False):
    here = file.tell()
    encoded_header = file.read(BlockInfo._struct.size)
    header = BlockInfo._struct.unpack(encoded_header)
    (flags, _, _, nextoffset, idslen, weightslen, postcount,
     maxweight, maxwol, _, minlength) = header

    if not flags:
        # unpack_long returns a tuple, so take the first element
        nextoffset = unpack_long(encoded_header[:8])[0]
    else:
        nextoffset = here + nextoffset

    assert postcount > 0
    minlength = byte_to_length(minlength)

    if stringids:
        maxid = utf8decode(file.read_string())[0]
    else:
        maxid = file.read_uint()

    dataoffset = file.tell()
    return BlockInfo(flags=flags, nextoffset=nextoffset,
                     postcount=postcount, maxweight=maxweight,
                     maxwol=maxwol, maxid=maxid, minlength=minlength,
                     dataoffset=dataoffset, idslen=idslen,
                     weightslen=weightslen)
def get(self, docnum, fieldname, default=0):
    try:
        arry = self.lengths[fieldname]
    except KeyError:
        return default
    if docnum >= len(arry):
        return default
    return byte_to_length(arry[docnum])
def doc_field_length(self, docnum, fieldname, default=0):
    try:
        arry = self.lengths[fieldname]
    except KeyError:
        return default
    if docnum >= len(arry):
        return default
    return byte_to_length(arry[docnum])
def load_old_lengths(obj, dbfile, doccount):
    fieldcount = dbfile.read_ushort()  # Number of fields
    for _ in xrange(fieldcount):
        fieldname = dbfile.read_string().decode("utf-8")
        obj.lengths[fieldname] = dbfile.read_array("B", doccount)
        # Old format didn't store totals, so fake it by adding up the codes
        obj.totals[fieldname] = sum(byte_to_length(b)
                                    for b in obj.lengths[fieldname])
    dbfile.close()
def from_file(cls, postfile, postingsize, stringids=False):
    start = postfile.tell()
    block = cls(postingsize, stringids=stringids)
    block.postfile = postfile

    header = cls._struct.unpack(postfile.read(cls._struct.size))
    block.nextoffset = start + header[0]
    block.cmp = header[1]
    block.count = header[2]
    block.idcode = header[3].decode("Latin1")
    block.idslen = header[5]
    block.wtslen = header[6]
    block.maxweight = header[7]
    block.maxlength = byte_to_length(header[11])
    block.minlength = byte_to_length(header[12])

    block.maxid = load(postfile) if stringids else postfile.read_uint()
    block.dataoffset = postfile.tell()
    return block
def from_file(cls, postfile, postingsize, stringids=False):
    start = postfile.tell()
    block = cls(postingsize, stringids=stringids)
    block.postfile = postfile

    header = cls._struct.unpack(postfile.read(cls._struct.size))
    block.nextoffset = start + header[0]
    block.cmp = header[1]
    block.count = header[2]
    block.idcode = header[3]
    block.idslen = header[5]
    block.wtslen = header[6]
    block.maxweight = header[7]
    block.maxlength = byte_to_length(header[11])
    block.minlength = byte_to_length(header[12])

    block.maxid = load(postfile) if stringids else postfile.read_uint()
    block.dataoffset = postfile.tell()
    return block
def test_block():
    st = RamStorage()
    f = st.create_file("postfile")

    b = current(f, 0)
    b.append(0, 1.0, '', 1)
    b.append(1, 2.0, '', 2)
    b.append(2, 12.0, '', 6)
    b.append(5, 6.5, '', 420)

    assert b
    assert_equal(len(b), 4)
    assert_equal(list(b.ids), [0, 1, 2, 5])
    assert_equal(list(b.weights), [1.0, 2.0, 12.0, 6.5])
    assert_equal(b.values, None)
    assert_equal(b.min_length(), 1)
    assert_equal(b.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(b.max_weight(), 12.0)
    assert_equal(b.max_wol(), 2.0)

    ti = FileTermInfo()
    ti.add_block(b)
    assert_equal(ti.weight(), 21.5)
    assert_equal(ti.doc_frequency(), 4)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(ti.max_weight(), 12.0)
    assert_equal(ti.max_wol(), 2.0)

    b.write(compression=3)
    f.close()

    f = st.open_file("postfile")
    bb = current.from_file(f, 0)

    bb.read_ids()
    assert_equal(list(bb.ids), [0, 1, 2, 5])
    bb.read_weights()
    assert_equal(list(bb.weights), [1.0, 2.0, 12.0, 6.5])
    bb.read_values()
    assert_equal(b.values, None)
    assert_equal(bb.min_length(), 1)
    assert_equal(bb.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(bb.max_weight(), 12.0)
    assert_equal(bb.max_wol(), 2.0)
def from_string(cls, s):
    assert isinstance(s, bytes_type)

    if isinstance(s, string_type):
        hbyte = ord(s[0])  # Python 2.x - str
    else:
        hbyte = s[0]  # Python 3 - bytes

    if hbyte < 2:
        st = cls.struct
        # Weight, Doc freq, min len, max len, max w, unused, min ID, max ID
        w, df, ml, xl, xw, _, mid, xid = st.unpack(s[1:st.size + 1])
        mid = None if mid == NO_ID else mid
        xid = None if xid == NO_ID else xid
        # Postings
        pstr = s[st.size + 1:]
        if hbyte == 0:
            p = unpack_long(pstr)[0]
        else:
            p = loads(pstr + b("."))
    else:
        # Old format was encoded as a variable length pickled tuple
        v = loads(s + b("."))
        if len(v) == 1:
            w = df = 1
            p = v[0]
        elif len(v) == 2:
            w = df = v[1]
            p = v[0]
        else:
            w, p, df = v
        # Fake values for stats which weren't stored before
        ml = 1
        xl = 255
        xw = 999999999
        mid = -1
        xid = -1

    ml = byte_to_length(ml)
    xl = byte_to_length(xl)
    obj = cls(w, df, ml, xl, xw, mid, xid)
    obj.postings = p
    return obj
def _minmax(self, fieldname, op, cache):
    if fieldname in cache:
        return cache[fieldname]
    else:
        ls = self.lengths[fieldname]
        if ls:
            result = byte_to_length(op(ls))
        else:
            result = 0
        cache[fieldname] = result
        return result
def test_lowlevel_block_writing():
    st = RamStorage()
    f = st.create_file("postfile")
    fpw = FilePostingWriter(f, blocklimit=4)
    fmt = formats.Frequency()
    fpw.start(fmt)
    fpw.write(0, 1.0, fmt.encode(1.0), 1)
    fpw.write(1, 2.0, fmt.encode(2.0), 2)
    fpw.write(2, 12.0, fmt.encode(12.0), 6)
    fpw.write(5, 6.5, fmt.encode(6.5), 420)

    fpw.write(11, 1.5, fmt.encode(1.5), 1)
    fpw.write(12, 2.5, fmt.encode(2.5), 2)
    fpw.write(26, 100.5, fmt.encode(100.5), 21)
    fpw.write(50, 8.0, fmt.encode(8.0), 1020)
    ti = fpw.finish()

    assert_equal(ti.weight(), 134.0)
    assert_equal(ti.doc_frequency(), 8)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(1020)))
    assert_equal(ti.max_weight(), 100.5)
    assert_equal(ti.max_wol(), 100.5 / byte_to_length(length_to_byte(21)))
def from_file(cls, postfile, postingsize, stringids=False):
    block = cls(postingsize, stringids=stringids)
    block.postfile = postfile

    delta = postfile.read_uint()
    block.nextoffset = postfile.tell() + delta
    info = postfile.read_pickle()
    block.dataoffset = postfile.tell()

    for key, value in zip(cls.infokeys, info):
        if key in ("minlength", "maxlength"):
            value = byte_to_length(value)
        setattr(block, key, value)

    return block
def test_many_lengths():
    domain = u("alfa bravo charlie delta echo foxtrot golf hotel").split()
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for i, word in enumerate(domain):
        length = (i + 1) ** 6
        w.add_document(text=" ".join(word for _ in xrange(length)))
    w.commit()

    s = ix.searcher()
    for i, word in enumerate(domain):
        target = byte_to_length(length_to_byte((i + 1) ** 6))
        ti = s.term_info("text", word)
        assert_equal(ti.min_length(), target)
        assert_equal(ti.max_length(), target)
def test_lengths():
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        tokens = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            w.add_document(f2=u(" ").join(islice(cycle(tokens), length)))
        w.commit()

        with ix.reader() as dr:
            ls1 = [dr.doc_field_length(i, "f1")
                   for i in xrange(0, len(lengths))]
            assert_equal(ls1, [0] * len(lengths))
            ls2 = [dr.doc_field_length(i, "f2")
                   for i in xrange(0, len(lengths))]
            assert_equal(ls2, [byte_to_length(length_to_byte(l))
                               for l in lengths])
def from_file(cls, postfile, stringids=False):
    pos = postfile.tell()
    block = cls(postfile, stringids=stringids)
    block.postfile = postfile

    header = cls._struct.unpack(postfile.read(cls._struct.size))
    block.nextoffset = pos + header[3]
    block.idslen = header[4]
    block.wtslen = header[5]
    block.count = header[6]
    block.maxweight = header[7]
    block.minlength = byte_to_length(header[10])

    if stringids:
        block.maxid = utf8decode(postfile.read_string())[0]
    else:
        block.maxid = postfile.read_uint()

    block.dataoffset = postfile.tell()
    return block
def add_field_length(self, docnum, fieldname, length):
    self._fieldlength_totals[fieldname] += length

    bytelength = length_to_byte(length)
    normalized = byte_to_length(bytelength)
    if normalized < self._fieldlength_mins.get(fieldname, 999999999):
        self._fieldlength_mins[fieldname] = normalized
    if normalized > self._fieldlength_maxes.get(fieldname, 0):
        self._fieldlength_maxes[fieldname] = normalized

    if fieldname not in self.length_arrays:
        self.length_arrays[fieldname] = array("B")
    arry = self.length_arrays[fieldname]

    if len(arry) <= docnum:
        for _ in xrange(docnum - len(arry) + 1):
            arry.append(0)
    arry[docnum] = bytelength
def append(self, id, weight, valuestring, dfl):
    self.ids.append(id)
    self.weights.append(weight)
    if weight > self._maxweight:
        self._maxweight = weight
    if valuestring:
        if self.values is None:
            self.values = []
        self.values.append(valuestring)
    if dfl:
        length_byte = length_to_byte(dfl)
        if self._minlength is None or length_byte < self._minlength:
            self._minlength = length_byte
        # Compare byte codes so min and max are tracked on the same scale
        # (comparing the raw dfl against a stored code could let the max
        # decrease, since codes are much smaller than large lengths)
        if length_byte > self._maxlength:
            self._maxlength = length_byte
        wol = weight / byte_to_length(length_byte)
        if wol > self._maxwol:
            self._maxwol = wol
def from_file(cls, postfile, stringids=False):
    start = postfile.tell()
    block = cls(postfile, stringids=stringids)
    header = cls._struct.unpack(postfile.read(cls._struct.size))

    block.nextoffset = start + header[0]
    block.compression = header[1]
    block.postcount = header[2]
    block.typecode = header[3]
    block.idslen = header[5]
    block.weightslen = header[6]
    block.maxweight = header[7]
    block.maxwol = header[8]
    block.minlen = byte_to_length(header[10])

    if stringids:
        block.maxid = load(postfile)
    else:
        block.maxid = postfile.read_uint()

    block.dataoffset = postfile.tell()
    return block
def from_file(cls, postfile, stringids=False):
    pos = postfile.tell()
    block = cls(postfile, stringids=stringids)

    encoded_header = postfile.read(cls._struct.size)
    header = cls._struct.unpack(encoded_header)
    (flags, _, _, nextoffset, block.idslen, block.weightslen,
     block.postcount, block.maxweight, block.maxwol, _, minlength) = header

    block.nextoffset = pos + nextoffset
    block.minlength = byte_to_length(minlength)
    assert block.postcount > 0, "postcount=%r" % block.postcount

    if stringids:
        block.maxid = utf8decode(postfile.read_string())[0]
    else:
        block.maxid = postfile.read_uint()

    block.dataoffset = postfile.tell()
    return block
def blen(n):
    return byte_to_length(length_to_byte(n))
def test_length_byte():
    source = list(range(11))
    xform = [length_to_byte(n) for n in source]
    result = [byte_to_length(n) for n in xform]
    assert_equal(source, result)
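This round trip works because length_to_byte / byte_to_length form a lossy but monotonic one-byte codec: small lengths are stored exactly, larger ones are snapped to a canonical value, and decoding a code always returns the same canonical length. The following is a minimal sketch of such a codec for illustration only; it is not Whoosh's actual implementation (the real table is built differently, and the cutoff of 16 and the 4% growth factor here are assumed values):

from bisect import bisect_left

# Hypothetical 256-entry table of canonical lengths: exact below the
# cutoff, then growing by roughly 4% per code so that a single byte can
# cover a wide range of field lengths with decreasing precision.
_TABLE = list(range(16))
while len(_TABLE) < 256:
    _TABLE.append(max(_TABLE[-1] + 1, int(_TABLE[-1] * 1.04)))

def sketch_length_to_byte(length):
    # Smallest code whose canonical length is >= the given length,
    # clamped to 255 for very long fields.
    if length >= _TABLE[-1]:
        return 255
    return bisect_left(_TABLE, length)

def sketch_byte_to_length(code):
    # Decoding is a plain table lookup.
    return _TABLE[code]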
def get(self, docnum, fieldname, default=0):
    lengths = self.lengths
    if fieldname not in lengths:
        return default
    byte = lengths[fieldname][docnum] or default
    return byte_to_length(byte)
def read_min_and_max_length(cls, dbfile, datapos):
    lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE
    ml = byte_to_length(dbfile.get_byte(lenpos))
    xl = byte_to_length(dbfile.get_byte(lenpos + 1))
    return ml, xl
def min_length(self):
    return byte_to_length(self._minlength)
def max_length(self):
    return byte_to_length(self._maxlength)
def doc_field_length(self, docnum, fieldname, default=0):
    try:
        start = self.starts[fieldname]
    except KeyError:
        return default
    return byte_to_length(self.dbfile.get_byte(start + docnum))
def get(self, docnum, fieldname, default=0):
    try:
        start = self.starts[fieldname]
    except KeyError:
        return default
    return byte_to_length(self.dbfile.get_byte(start + docnum))
def _discreet(length):
    return byte_to_length(length_to_byte(length))
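Helpers like blen and _discreet exist because stored statistics have already been through this quantization, so a raw length must be normalized the same way before it can be compared against them. With the sketch codec above (hypothetical names, not part of Whoosh), the key property is that the round trip is idempotent:

# Once a length has been quantized, re-encoding it changes nothing,
# so normalized values can be compared directly and repeatedly.
for n in (3, 42, 420, 100000):
    q = sketch_byte_to_length(sketch_length_to_byte(n))
    assert sketch_byte_to_length(sketch_length_to_byte(q)) == q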