def to_string(self):
    """Serialize this term info to a byte string.

    Layout: a one-byte "inlined" flag, followed by the packed fixed-size
    stats (weight, doc frequency, min/max length bytes, max weight, a zero
    placeholder, min/max ID), followed by either the pickled inlined
    postings or a packed long pointer to the postings on disk.
    """
    # Encode the lengths as 0-255 values
    ml = 0 if self._minlength is None else length_to_byte(self._minlength)
    xl = length_to_byte(self._maxlength)
    # Convert None values to the out-of-band NO_ID constant so they can be
    # stored as unsigned ints
    mid = NO_ID if self._minid is None else self._minid
    xid = NO_ID if self._maxid is None else self._maxid

    # Pack the term info into bytes
    st = self.struct.pack(self._weight, self._df, ml, xl, self._maxweight,
                          0, mid, xid)

    if isinstance(self.postings, tuple):
        # Postings are inlined - dump them using the pickle protocol
        isinlined = 1
        # NOTE(review): the [2:-1] slice strips the pickle protocol header
        # and trailing STOP byte — presumably re-added on load; confirm the
        # reader reverses this exactly.
        st += dumps(self.postings, -1)[2:-1]
    else:
        # Append postings pointer as long to end of term info bytes
        isinlined = 0
        # It's possible for a term info to not have a pointer to postings
        # on disk, in which case postings will be None. Convert a None
        # value to -1 so it can be stored as a long.
        p = -1 if self.postings is None else self.postings
        st += pack_long(p)

    # Prepend byte indicating whether the postings are inlined to the term
    # info bytes
    return pack("B", isinlined) + st
def to_file(self, file, stringids=False):
    """Write this block's header to *file*, remembering where the block
    starts so the pointer field can be patched later.
    """
    flags = 1
    self._blockstart = file.tell()
    self._pointer_pos = self._blockstart + 4
    header = self._struct.pack(
        flags,
        0, 0,  # unused B, H
        self.nextoffset,
        self.idslen,
        self.weightslen,
        self.postcount,
        self.maxweight,
        self.maxwol,
        0,
        length_to_byte(self.minlength))
    file.write(header)

    # The maximum ID goes after the fixed header. It has to be written
    # separately because it might be a string (in the case of a vector)
    # rather than an unsigned int.
    if stringids:
        file.write_string(utf8encode(self.maxid)[0])
    else:
        file.write_uint(self.maxid)
def test_block():
    """Round-trip a posting block through a RAM file and verify its stats
    both before writing and after reading it back.
    """
    st = RamStorage()
    f = st.create_file("postfile")
    b = current(f, 0)
    b.append(0, 1.0, '', 1)
    b.append(1, 2.0, '', 2)
    b.append(2, 12.0, '', 6)
    b.append(5, 6.5, '', 420)

    assert b
    assert_equal(len(b), 4)
    assert_equal(list(b.ids), [0, 1, 2, 5])
    assert_equal(list(b.weights), [1.0, 2.0, 12.0, 6.5])
    assert_equal(b.values, None)
    assert_equal(b.min_length(), 1)
    # Lengths are stored as one byte, so 420 only round-trips approximately
    assert_equal(b.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(b.max_weight(), 12.0)
    assert_equal(b.max_wol(), 2.0)

    ti = FileTermInfo()
    ti.add_block(b)
    assert_equal(ti.weight(), 21.5)
    assert_equal(ti.doc_frequency(), 4)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(ti.max_weight(), 12.0)
    assert_equal(ti.max_wol(), 2.0)

    b.write(compression=3)
    f.close()

    f = st.open_file("postfile")
    bb = current.from_file(f, 0)

    bb.read_ids()
    assert_equal(list(bb.ids), [0, 1, 2, 5])
    bb.read_weights()
    assert_equal(list(bb.weights), [1.0, 2.0, 12.0, 6.5])
    bb.read_values()
    # BUG FIX: this previously asserted on the in-memory block ``b`` instead
    # of ``bb``, the block read back from disk, so the read path was never
    # actually checked.
    assert_equal(bb.values, None)
    assert_equal(bb.min_length(), 1)
    assert_equal(bb.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(bb.max_weight(), 12.0)
    assert_equal(bb.max_wol(), 2.0)
def to_file(self, postfile, compression=3):
    """Serialize this block to *postfile*: a uint offset to the next block,
    then a pickled header tuple, then the minimized ID, weight, and value
    strings.
    """
    doc_ids = self.ids
    idcode, id_bytes = minimize_ids(doc_ids, self.stringids, compression)
    weight_bytes = minimize_weights(self.weights, compression)
    value_bytes = minimize_values(self.postingsize, self.values, compression)

    header = (len(doc_ids), doc_ids[-1], self.maxweight,
              length_to_byte(self.minlength), length_to_byte(self.maxlength),
              idcode, compression, len(id_bytes), len(weight_bytes))
    header_bytes = dumps(header, -1)

    # Write the offset to the next block first so readers can skip this
    # block without parsing it
    total = (len(header_bytes) + len(id_bytes) + len(weight_bytes)
             + len(value_bytes))
    postfile.write_uint(total)

    # ...then the block contents themselves
    postfile.write(header_bytes)
    postfile.write(id_bytes)
    postfile.write(weight_bytes)
    postfile.write(value_bytes)
def add_block(self, block):
    """Fold the statistics of a finished posting *block* into this term
    info's running totals.
    """
    self._weight += sum(block.weights)
    self._df += len(block)

    # Lengths are tracked in their one-byte encoded form
    min_byte = length_to_byte(block.min_length())
    if self._minlength is not None:
        min_byte = min(self._minlength, min_byte)
    self._minlength = min_byte

    self._maxlength = max(self._maxlength, length_to_byte(block.max_length()))
    self._maxweight = max(self._maxweight, block.max_weight())
    self._maxwol = max(self._maxwol, block.max_wol())

    # The first block seen supplies the minimum ID; every block pushes the
    # maximum ID forward
    if self._minid is None:
        self._minid = block.ids[0]
    self._maxid = block.ids[-1]
def to_file(self, postfile, compression=3):
    """Write this block to *postfile* as a skip offset followed by the
    pickled header and the minimized ID/weight/value strings.
    """
    ids = self.ids
    idcode, idstring = minimize_ids(ids, self.stringids, compression)
    wtstring = minimize_weights(self.weights, compression)
    vstring = minimize_values(self.postingsize, self.values, compression)

    info = (len(ids), ids[-1], self.maxweight,
            length_to_byte(self.minlength), length_to_byte(self.maxlength),
            idcode, compression, len(idstring), len(wtstring))
    infostring = dumps(info, -1)

    parts = (infostring, idstring, wtstring, vstring)
    # Offset to next block
    postfile.write_uint(sum(len(part) for part in parts))
    # Block contents
    for part in parts:
        postfile.write(part)
def to_file(self, dbfile, doccount):
    """Write the field-length data to *dbfile*: a magic header, per-field
    summary records, then one byte array per field. Closes the file.
    """
    self._pad_arrays(doccount)
    names = list(self.lengths.keys())

    dbfile.write(self.magic)
    dbfile.write_int(1)  # Format version number
    dbfile.write_uint(doccount)  # Number of documents
    dbfile.write_ushort(len(self.lengths))  # Number of fields

    # Per-field info: name, total length, encoded min/max lengths
    for name in names:
        dbfile.write_string(name.encode('utf-8'))  # Fieldname
        dbfile.write_long(self.field_length(name))
        dbfile.write_byte(length_to_byte(self.min_field_length(name)))
        dbfile.write_byte(length_to_byte(self.max_field_length(name)))

    # The per-document length byte arrays, in the same field order
    for name in names:
        dbfile.write_array(self.lengths[name])

    dbfile.close()
def add_field_length(self, docnum, fieldname, length):
    """Record the length of *fieldname* in document *docnum*, updating the
    per-field total and maximum and storing the one-byte encoded length.
    """
    self._fieldlength_totals[fieldname] += length
    if length > self._fieldlength_maxes.get(fieldname, 0):
        self._fieldlength_maxes[fieldname] = length

    if fieldname not in self.length_arrays:
        self.length_arrays[fieldname] = array("B")
    lengths = self.length_arrays[fieldname]

    # Zero-pad the array out to docnum so it can be indexed directly
    while len(lengths) <= docnum:
        lengths.append(0)
    lengths[docnum] = length_to_byte(length)
def test_lowlevel_block_writing():
    """Write eight postings through a FilePostingWriter (block limit 4) and
    check the aggregated term statistics.
    """
    st = RamStorage()
    f = st.create_file("postfile")
    fpw = FilePostingWriter(f, blocklimit=4)
    fmt = formats.Frequency()
    fpw.start(fmt)

    # (docnum, weight, field length) triples; the weight is also the value
    postings = [(0, 1.0, 1), (1, 2.0, 2), (2, 12.0, 6), (5, 6.5, 420),
                (11, 1.5, 1), (12, 2.5, 2), (26, 100.5, 21), (50, 8.0, 1020)]
    for docnum, weight, dfl in postings:
        fpw.write(docnum, weight, fmt.encode(weight), dfl)

    ti = fpw.finish()
    assert_equal(ti.weight(), 134.0)
    assert_equal(ti.doc_frequency(), 8)
    assert_equal(ti.min_length(), 1)
    # Lengths round-trip through a one-byte encoding, so compare against
    # the decoded value rather than the raw length
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(1020)))
    assert_equal(ti.max_weight(), 100.5)
    assert_equal(ti.max_wol(), 100.5 / byte_to_length(length_to_byte(21)))
def add(self, docnum, fieldname, length):
    """Record the byte-encoded *length* of *fieldname* in document
    *docnum*. Zero lengths are not stored (absent entries default to 0).
    """
    if not length:
        return

    if fieldname not in self.lengths:
        self._create_field(fieldname, docnum)
    arry = self.lengths[fieldname]

    # Zero-pad the array so docnum is a valid index
    count = docnum + 1
    while len(arry) < count:
        arry.append(0)
    if count > self._count:
        self._count = count

    arry[docnum] = length_to_byte(length)
    self.totals[fieldname] += length
def test_many_lengths():
    """Index documents with rapidly growing lengths and check that the
    stored per-term min/max lengths match the one-byte encoding.
    """
    domain = u("alfa bravo charlie delta echo foxtrot golf hotel").split()
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    for i, word in enumerate(domain):
        # Each document repeats a single word (i + 1) ** 6 times
        repeat = (i + 1) ** 6
        w.add_document(text=" ".join(word for _ in xrange(repeat)))
    w.commit()

    s = ix.searcher()
    for i, word in enumerate(domain):
        # The stored length is whatever the one-byte encoding decodes to
        target = byte_to_length(length_to_byte((i + 1) ** 6))
        ti = s.term_info("text", word)
        assert_equal(ti.min_length(), target)
        assert_equal(ti.max_length(), target)
def test_lengths():
    """Check stored per-document field lengths: an unindexed field reads
    back as 0 and an indexed field reads back its byte-encoded length.
    """
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        tokens = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            w.add_document(f2=u(" ").join(islice(cycle(tokens), length)))
        w.commit()

        with ix.reader() as dr:
            # f1 was never given content, so every document's length is 0
            ls1 = [dr.doc_field_length(docnum, "f1")
                   for docnum in xrange(0, len(lengths))]
            assert_equal(ls1, [0] * len(lengths))
            # f2 lengths round-trip through the one-byte encoding
            ls2 = [dr.doc_field_length(docnum, "f2")
                   for docnum in xrange(0, len(lengths))]
            expected = [byte_to_length(length_to_byte(l)) for l in lengths]
            assert_equal(ls2, expected)
def test_lengths():
    """Verify doc_field_length: zero for an empty field, byte-encoded
    round-trip values for a populated field.
    """
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        tokens = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            # Build a document body of exactly `length` tokens
            w.add_document(f2=u(" ").join(islice(cycle(tokens), length)))
        w.commit()

        doccount = len(lengths)
        with ix.reader() as dr:
            ls1 = [dr.doc_field_length(i, "f1") for i in xrange(0, doccount)]
            assert_equal(ls1, [0] * doccount)
            ls2 = [dr.doc_field_length(i, "f2") for i in xrange(0, doccount)]
            assert_equal(ls2,
                         [byte_to_length(length_to_byte(l)) for l in lengths])
def to_file(self, file, stringids=False):
    """Write this block header to *file* and note the block start and the
    position of the pointer field for later patching.
    """
    flags = 1
    self._blockstart = file.tell()
    self._pointer_pos = self._blockstart + 4

    packed = self._struct.pack(flags,
                               0, 0,  # unused B, H
                               self.nextoffset,
                               self.idslen,
                               self.weightslen,
                               self.postcount,
                               self.maxweight,
                               self.maxwol,
                               0,
                               length_to_byte(self.minlength))
    file.write(packed)

    # Write the maximum ID after the fixed-size header; it is written
    # separately because it may be a string (in the case of a vector)
    if stringids:
        file.write_string(utf8encode(self.maxid)[0])
    else:
        file.write_uint(self.maxid)
def add_field_length(self, docnum, fieldname, length):
    """Record the length of *fieldname* in document *docnum*. Min/max are
    tracked as the value the one-byte encoding decodes back to, so they
    agree with what a reader will later see.
    """
    self._fieldlength_totals[fieldname] += length

    bytelength = length_to_byte(length)
    normalized = byte_to_length(bytelength)

    mins = self._fieldlength_mins
    if normalized < mins.get(fieldname, 999999999):
        mins[fieldname] = normalized
    maxes = self._fieldlength_maxes
    if normalized > maxes.get(fieldname, 0):
        maxes[fieldname] = normalized

    if fieldname not in self.length_arrays:
        self.length_arrays[fieldname] = array("B")
    arry = self.length_arrays[fieldname]
    # Zero-pad so docnum is a valid index
    while len(arry) <= docnum:
        arry.append(0)
    arry[docnum] = bytelength
def append(self, id, weight, valuestring, dfl):
    """Add a posting to this block and update the running statistics.

    :param id: the document number (or string ID) of the posting.
    :param weight: the term weight in the document.
    :param valuestring: the encoded posting value; empty values are not
        stored.
    :param dfl: the document field length; 0/None means "no length".
    """
    self.ids.append(id)
    self.weights.append(weight)
    if weight > self._maxweight:
        self._maxweight = weight

    if valuestring:
        # The values list is created lazily on the first non-empty value
        if self.values is None:
            self.values = []
        self.values.append(valuestring)

    if dfl:
        # Lengths are tracked in their one-byte encoded form
        length_byte = length_to_byte(dfl)
        if self._minlength is None or length_byte < self._minlength:
            self._minlength = length_byte
        # BUG FIX: compare encoded byte with encoded byte. The original
        # compared the raw length (dfl) against the byte-encoded maximum,
        # which could overwrite a larger recorded maximum with a smaller
        # byte (length_to_byte is monotonic but compresses into 0-255).
        if length_byte > self._maxlength:
            self._maxlength = length_byte
        # Weight-over-length uses the decoded (lossy) length so it matches
        # what readers will compute
        wol = weight / byte_to_length(length_byte)
        if wol > self._maxwol:
            self._maxwol = wol
def blen(n):
    """Round-trip *n* through the one-byte length encoding, returning the
    (possibly lossy) value the encoded byte decodes back to.
    """
    encoded = length_to_byte(n)
    return byte_to_length(encoded)
def test_length_byte():
    """Small lengths (0-10) must round-trip exactly through the one-byte
    length encoding.
    """
    values = list(range(11))
    roundtripped = [byte_to_length(length_to_byte(v)) for v in values]
    assert_equal(values, roundtripped)
def to_file(self, postfile, posting_size, compression=3):
    """Serialize this posting block to *postfile*.

    Layout: packed fixed-size header, max-ID string, IDs string, weights
    string, values string. Small blocks (<= 4 postings), or builds without
    zlib (``can_compress``), are written uncompressed.

    :param postfile: the file object to write to.
    :param posting_size: fixed size of each encoded value; < 0 means
        variable-size (pickled list), 0 means no values.
    :param compression: zlib compression level (0 disables).
    """
    stringids = self.stringids
    ids = self.ids
    weights = self.weights
    values = self.values
    postcount = len(ids)
    maxweight, maxwol, minlength = self.stats()

    if postcount <= 4 or not can_compress:
        compression = 0

    # Max ID
    # NOTE(review): the [2:] slice appears to strip the pickle protocol
    # header bytes — confirm the reader re-adds them when loading.
    maxid = ids[-1]
    if stringids:
        maxid_string = dumps(maxid, -1)[2:]
    else:
        maxid_string = pack_uint(maxid)

    # IDs
    # Pick the smallest array typecode that can hold the largest ID;
    # "s" marks pickled string IDs
    typecode = "I"
    if stringids:
        ids_string = dumps(ids, -1)[2:]
        typecode = "s"
    else:
        if maxid <= 255:
            typecode = "B"
        elif maxid <= 65535:
            typecode = "H"
        if typecode != ids.typecode:
            ids = array(typecode, ids)
        # On-disk format is little-endian
        if not IS_LITTLE:
            ids.byteswap()
        ids_string = ids.tostring()
    if compression:
        ids_string = compress(ids_string, compression)

    # Weights
    # An all-1.0 weights list is stored as an empty string
    if all(w == 1.0 for w in weights):
        weights_string = ''
    else:
        if not IS_LITTLE:
            weights.byteswap()
        weights_string = weights.tostring()
    if weights_string and compression:
        weights_string = compress(weights_string, compression)

    # Values
    if posting_size < 0:
        # Variable-size values: pickle the whole list
        values_string = dumps(values, -1)[2:]
    elif posting_size == 0:
        # Format stores no values
        values_string = ''
    else:
        # Fixed-size values: concatenate directly
        values_string = "".join(values)
    if values_string and compression:
        values_string = compress(values_string, compression)

    # Header
    flags = 1 if compression else 0
    minlen_byte = length_to_byte(minlength)
    # Total size lets readers skip the block without parsing it
    blocksize = sum((self._struct.size, len(maxid_string), len(ids_string),
                     len(weights_string), len(values_string)))
    header = self._struct.pack(blocksize, flags, postcount, typecode, 0,
                               len(ids_string), len(weights_string),
                               maxweight, maxwol, 0, minlen_byte)

    postfile.write(header)
    postfile.write(maxid_string)
    postfile.write(ids_string)
    postfile.write(weights_string)
    postfile.write(values_string)
def _discreet(length):
    """Return the discretized form of *length*: the value it decodes back
    to after being squeezed into the one-byte length encoding.
    """
    return byte_to_length(length_to_byte(length))