def encode(self, poslist):
    """Delta-encode a list of ascending positions into a value string.

    The result is the position count (packed uint) followed by the pickled
    list of deltas with the pickle header and stop byte stripped.
    """
    prev = 0
    deltas = []
    for position in poslist:
        deltas.append(position - prev)
        prev = position
    return pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
def encode(self, positions):
    """Encode ascending positions as a packed count plus pickled deltas."""
    out = []
    last = 0
    for p in positions:
        out.append(p - last)
        last = p
    return pack_uint(len(out)) + dumps(out, -1)[2:-1]
def word_values(self, value, analyzer, **kwargs):
    """Tokenize ``value`` and yield ``(word, freq, weight, valuestring)``.

    The value string packs the occurrence count, the boosted summed boost,
    and pickled delta-coded (pos, startchar, endchar, boost) tuples.
    """
    boost_factor = self.field_boost
    kwargs["positions"] = True
    kwargs["chars"] = True
    kwargs["boosts"] = True
    occurrences = defaultdict(list)
    for token in tokens(value, analyzer, kwargs):
        occurrences[token.text].append((token.pos, token.startchar,
                                        token.endchar, token.boost))
    for word, occs in iteritems(occurrences):
        # occs = [(pos, startchar, endchar, boost), ...]
        prevpos = 0
        prevchar = 0
        totalboost = 0
        codes = []
        for pos, schar, echar, boost in occs:
            codes.append((pos - prevpos, schar - prevchar,
                          echar - schar, boost))
            prevpos = pos
            prevchar = echar
            totalboost += boost
        weight = totalboost * boost_factor
        vstring = (pack_uint(len(occs)) + pack_float(weight)
                   + dumps(codes, -1)[2:-1])
        yield (word, len(occs), weight, vstring)
def encode(self, positions):
    """Encode positions as a packed count followed by varint deltas."""
    # positions = [pos1, pos2, ...]
    pieces = []
    previous = 0
    for p in positions:
        pieces.append(varint(p - previous))
        previous = p
    return pack_uint(len(positions)) + "".join(pieces)
def encode(self, posns_chars):
    """Encode (pos, startchar, endchar) triples with delta compression.

    Positions are coded against the previous position, start chars against
    the previous end char, and end chars against their own start char.
    """
    # posns_chars = [(pos, startchar, endchar), ...]
    deltas = []
    lastpos = 0
    lastchar = 0
    for pos, sc, ec in posns_chars:
        deltas.append((pos - lastpos, sc - lastchar, ec - sc))
        lastpos = pos
        lastchar = ec
    return pack_uint(len(posns_chars)) + dumps(deltas, -1)[2:-1]
def encode(self, poslist):
    """Delta-encode (pos, startchar, endchar) triples into a value string."""
    out = []
    prev_pos = 0
    prev_end = 0
    for pos, start, end in poslist:
        out.append((pos - prev_pos, start - prev_end, end - start))
        prev_pos = pos
        prev_end = end
    return pack_uint(len(out)) + dumps(out, -1)[2:-1]
def encode(self, poses):
    """Encode (position, boost) pairs, prefixed by count and summed boost."""
    total = 0
    prev = 0
    pairs = []
    for position, boost in poses:
        total += boost
        pairs.append((position - prev, boost))
        prev = position
    return (pack_uint(len(poses)) + pack_float(total)
            + dumps(pairs, -1)[2:-1])
def encode(self, poslist):
    """Encode position/char-range triples as count + pickled deltas."""
    previous_pos = 0
    previous_end = 0
    encoded = []
    for pos, startchar, endchar in poslist:
        encoded.append((pos - previous_pos,
                        startchar - previous_end,
                        endchar - startchar))
        previous_pos = pos
        previous_end = endchar
    return pack_uint(len(encoded)) + dumps(encoded, -1)[2:-1]
def encode(self, posns_chars):
    """Varint-encode delta-coded (pos, startchar, endchar) triples."""
    # posns_chars = [(pos, startchar, endchar), ...]
    pieces = []
    prevpos = 0
    prevchar = 0
    for pos, startchar, endchar in posns_chars:
        # Three varints per occurrence: position delta, startchar delta
        # (from the previous end char), and char-range length
        pieces.append(varint(pos - prevpos)
                      + varint(startchar - prevchar)
                      + varint(endchar - startchar))
        prevpos = pos
        prevchar = endchar
    return pack_uint(len(posns_chars)) + "".join(pieces)
def encode(self, posns_chars):
    """Encode (pos, startchar, endchar) triples as a count plus varints."""
    # posns_chars = [(pos, startchar, endchar), ...]
    buf = []
    last_position = 0
    last_endchar = 0
    for position, startchar, endchar in posns_chars:
        buf.append(varint(position - last_position))
        buf.append(varint(startchar - last_endchar))
        buf.append(varint(endchar - startchar))
        last_position = position
        last_endchar = endchar
    return pack_uint(len(posns_chars)) + "".join(buf)
def valuecoder(self, data):
    """Encode a (weight, offset, doc-frequency) term info triple.

    Picks the most compact of four fixed formats based on the magnitude of
    the values: a bare offset for the common single-posting case, small
    byte-sized structs when weight/df fit in a byte, and wider structs
    otherwise.
    """
    weight, offset, df = data
    if offset >= _4GB:
        # Offset doesn't fit the compact formats
        return _terminfo_struct2.pack(weight, offset, df)
    if weight == 1 and df == 1:
        # Most common case: a single posting with weight 1
        return pack_uint(offset)
    intweight = int(weight)
    if weight == intweight and weight <= 255 and df <= 255:
        return _terminfo_struct0.pack(intweight, offset, df)
    return _terminfo_struct1.pack(weight, offset, df)
def write_tagint(self, i):
    """Writes a sometimes-compressed unsigned integer to the wrapped file.

    This is similar to the varint methods but uses a less compressed but
    faster format.
    """
    # Values 0-253 are stored directly in one byte. A leading byte of 254
    # means "an unsigned 16-bit int follows"; 255 means "an unsigned
    # 32-bit int follows".
    out = self.file
    if i > 65535:
        out.write("\xFF" + pack_uint(i))
    elif i > 253:
        out.write("\xFE" + pack_ushort(i))
    else:
        out.write(chr(i))
def word_values(self, value, analyzer, **kwargs):
    """Tokenize ``value`` and return ``(word, freq, weight, valuestring)``
    tuples for each unique token text.

    The weight is the summed token boost times the field boost; the value
    string is just the packed frequency.
    """
    fb = self.field_boost
    freqs = defaultdict(int)
    weights = defaultdict(float)
    kwargs["boosts"] = True
    # (the original also kept a running token count here, but it was
    # never used, so it has been removed)
    for t in tokens(value, analyzer, kwargs):
        freqs[t.text] += 1
        weights[t.text] += t.boost
    wvs = ((w, freq, weights[w] * fb, pack_uint(freq))
           for w, freq in iteritems(freqs))
    return wvs
def encode(self, posns_chars_boosts):
    """Encode (pos, startchar, endchar, boost) tuples with delta coding,
    prefixed by the occurrence count and the summed boost."""
    # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
    deltas = []
    prev_pos = 0
    prev_char = 0
    boost_total = 0
    for pos, startchar, endchar, boost in posns_chars_boosts:
        deltas.append((pos - prev_pos, startchar - prev_char,
                       endchar - startchar, boost))
        prev_pos = pos
        prev_char = endchar
        boost_total += boost
    return (pack_uint(len(posns_chars_boosts)) + pack_float(boost_total)
            + dumps(deltas, -1)[2:-1])
def encode(self, poses):
    """Encode (pos, startchar, endchar, boost) tuples.

    Returns a tuple of (value string, summed unboosted boost); the value
    string holds the count, the field-boosted summed boost, and the pickled
    delta codes.
    """
    boost_factor = self.field_boost
    # poses = [(pos, startchar, endchar, boost), ...]
    prevpos = 0
    prevchar = 0
    total = 0
    deltas = []
    for pos, startchar, endchar, boost in poses:
        deltas.append((pos - prevpos, startchar - prevchar,
                       endchar - startchar, boost))
        prevpos = pos
        prevchar = endchar
        total += boost
    # NOTE(review): unlike the sibling encoders this pickles with protocol 2
    # and keeps the full pickle framing (no [2:-1] slice) -- confirm the
    # matching decoder expects a complete pickle here.
    vstring = (pack_uint(len(poses)) + pack_float(total * boost_factor)
               + dumps(deltas, 2))
    return (vstring, total)
def word_values(self, value, analyzer, **kwargs):
    """Tokenize ``value`` and yield ``(word, freq, weight, valuestring)``
    where the value string is the packed count plus pickled position deltas."""
    boost_factor = self.field_boost
    kwargs["positions"] = True
    kwargs["boosts"] = True
    positions = defaultdict(list)
    boosts = defaultdict(float)
    for token in tokens(value, analyzer, kwargs):
        positions[token.text].append(token.pos)
        boosts[token.text] += token.boost
    for word, poslist in iteritems(positions):
        prev = 0
        deltas = []
        for position in poslist:
            deltas.append(position - prev)
            prev = position
        vstring = pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
        yield (word, len(poslist), boosts[word] * boost_factor, vstring)
def word_values(self, value, analyzer, **kwargs):
    """Tokenize ``value`` and yield ``(word, freq, weight, valuestring)``
    tuples, where the value string packs the occurrence count, the summed
    boost, and pickled delta-coded (position, boost) pairs.
    """
    fb = self.field_boost
    seen = defaultdict(list)
    kwargs["positions"] = True
    kwargs["boosts"] = True
    for t in tokens(value, analyzer, kwargs):
        seen[t.text].append((t.pos, t.boost))
    for w, poses in iteritems(seen):
        codes = []
        base = 0
        summedboost = 0
        for pos, boost in poses:
            summedboost += boost
            codes.append((pos - base, boost))
            base = pos
        value = (pack_uint(len(poses)) + pack_float(summedboost)
                 + dumps(codes, -1)[2:-1])
        # summedboost already equals sum(boost for _, boost in poses);
        # the original recomputed that sum here for no reason
        yield (w, len(poses), summedboost * fb, value)
def word_values(self, value, analyzer, **kwargs):
    """Tokenize ``value`` and yield ``(word, freq, weight, valuestring)``
    where the value string encodes delta-coded (pos, startchar, endchar)
    triples for the word's occurrences."""
    boost_factor = self.field_boost
    kwargs["positions"] = True
    kwargs["chars"] = True
    kwargs["boosts"] = True
    occurrences = defaultdict(list)
    summed = defaultdict(float)
    for token in tokens(value, analyzer, kwargs):
        occurrences[token.text].append((token.pos, token.startchar,
                                        token.endchar))
        summed[token.text] += token.boost
    for word, occs in iteritems(occurrences):
        prevpos = 0
        prevchar = 0
        deltas = []
        for pos, startchar, endchar in occs:
            deltas.append((pos - prevpos, startchar - prevchar,
                           endchar - startchar))
            prevpos = pos
            prevchar = endchar
        vstring = pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
        yield (word, len(occs), summed[word] * boost_factor, vstring)
def encode(self, freq):
    """Pack a term frequency as an unsigned int, coercing to int first."""
    return pack_uint(int(freq))
def combine(self, vs):
    """Sum the decoded frequencies of the given value strings and re-encode
    the total as a single packed value."""
    total = sum(self.decode_value(v) for v in vs)
    return pack_uint(total)
def write(self, compression=3):
    """Serialize this posting block (ids, weights, values) to the posting
    file, optionally zlib-compressing the sections.

    :param compression: compression level passed to ``compress``; forced to
        0 (off) for very small blocks or when compression is unavailable.
    """
    postfile = self.postfile
    stringids = self.stringids
    ids = self.ids
    weights = self.weights
    values = self.values
    postcount = len(ids)
    # Compressing tiny blocks isn't worth the overhead
    if postcount <= 4 or not can_compress:
        compression = 0

    # Max ID -- pickled for string IDs, packed uint for numeric IDs
    maxid = ids[-1]
    if stringids:
        maxid_string = dumps(maxid, -1)[2:]
    else:
        maxid_string = pack_uint(maxid)

    # IDs -- choose the narrowest array typecode that can hold the max ID
    typecode = "I"
    if stringids:
        ids_string = dumps(ids, -1)[2:]
        typecode = "s"
    else:
        if maxid <= 255:
            typecode = "B"
        elif maxid <= 65535:
            typecode = "H"
        if typecode != ids.typecode:
            ids = array(typecode, iter(ids))
        if not IS_LITTLE:
            # On-disk format is little-endian
            ids.byteswap()
        ids_string = ids.tostring()
    if compression:
        ids_string = compress(ids_string, compression)

    # Weights -- all-1.0 weights are stored as an empty string
    if all(w == 1.0 for w in weights):
        weights_string = b('')
    else:
        if not IS_LITTLE:
            weights.byteswap()
        weights_string = weights.tostring()
    if weights_string and compression:
        weights_string = compress(weights_string, compression)

    # Values -- pickled when variable-size (< 0), empty when size 0,
    # concatenated fixed-size strings otherwise
    postingsize = self.postingsize
    if postingsize < 0:
        values_string = dumps(values, -1)[2:]
    elif postingsize == 0:
        values_string = b('')
    else:
        values_string = b("").join(values)
    if values_string and compression:
        values_string = compress(values_string, compression)

    # Header
    flags = 1 if compression else 0
    blocksize = sum((self._struct.size, len(maxid_string), len(ids_string),
                     len(weights_string), len(values_string)))
    header = self._struct.pack(blocksize, flags, postcount,
                               typecode.encode('latin-1'), 0,
                               len(ids_string), len(weights_string),
                               self.max_weight(), self.max_wol(), 0, 0,
                               self._maxlength, self._minlength or 0)
    postfile.write(header)
    postfile.write(maxid_string)
    postfile.write(ids_string)
    postfile.write(weights_string)
    postfile.write(values_string)
def to_file(self, postfile, posting_size, compression=3):
    """Serialize this posting block to ``postfile``.

    :param postfile: file-like object to write the block to.
    :param posting_size: fixed byte size of each posting value; < 0 means
        variable-size (pickled), 0 means no values are stored.
    :param compression: compression level passed to ``compress``; forced to
        0 (off) for very small blocks or when compression is unavailable.
    """
    stringids = self.stringids
    ids = self.ids
    weights = self.weights
    values = self.values
    postcount = len(ids)
    maxweight, maxwol, minlength = self.stats()
    # Compressing tiny blocks isn't worth the overhead
    if postcount <= 4 or not can_compress:
        compression = 0

    # Max ID -- pickled for string IDs, packed uint for numeric IDs
    maxid = ids[-1]
    if stringids:
        maxid_string = dumps(maxid, -1)[2:]
    else:
        maxid_string = pack_uint(maxid)

    # IDs -- choose the narrowest array typecode that can hold the max ID
    typecode = "I"
    if stringids:
        ids_string = dumps(ids, -1)[2:]
        typecode = "s"
    else:
        if maxid <= 255:
            typecode = "B"
        elif maxid <= 65535:
            typecode = "H"
        if typecode != ids.typecode:
            ids = array(typecode, ids)
        if not IS_LITTLE:
            # On-disk format is little-endian
            ids.byteswap()
        ids_string = ids.tostring()
    if compression:
        ids_string = compress(ids_string, compression)

    # Weights -- all-1.0 weights are stored as an empty string
    if all(w == 1.0 for w in weights):
        weights_string = ''
    else:
        if not IS_LITTLE:
            weights.byteswap()
        weights_string = weights.tostring()
    if weights_string and compression:
        weights_string = compress(weights_string, compression)

    # Values -- pickled when variable-size (< 0), empty when size 0,
    # concatenated fixed-size strings otherwise
    if posting_size < 0:
        values_string = dumps(values, -1)[2:]
    elif posting_size == 0:
        values_string = ''
    else:
        values_string = "".join(values)
    if values_string and compression:
        values_string = compress(values_string, compression)

    # Header
    flags = 1 if compression else 0
    minlen_byte = length_to_byte(minlength)
    blocksize = sum((self._struct.size, len(maxid_string), len(ids_string),
                     len(weights_string), len(values_string)))
    header = self._struct.pack(blocksize, flags, postcount, typecode, 0,
                               len(ids_string), len(weights_string),
                               maxweight, maxwol, 0, minlen_byte)
    postfile.write(header)
    postfile.write(maxid_string)
    postfile.write(ids_string)
    postfile.write(weights_string)
    postfile.write(values_string)
def encode(self, freq_docboost):
    """Encode a (frequency, per-document boost) pair: a packed uint
    frequency followed by the boost compressed to a single byte."""
    frequency, docboost = freq_docboost
    return pack_uint(frequency) + float_to_byte(docboost)
def _write_node(self, uncnode):
    """Serialize an uncompiled graph node's arcs to the database file.

    Returns the file offset where the node starts, or ``None`` for an
    accepting node with no outgoing arcs (nothing needs to be stored).

    Raises an exception for a node with no arcs that is not accepting,
    since such a node would be meaningless.
    """
    vtype = self.vtype
    dbfile = self.dbfile
    arcs = uncnode.arcs
    numarcs = len(arcs)
    if not numarcs:
        if uncnode.accept:
            return None
        else:
            # What does it mean for an arc to stop but not be accepted?
            # (original raised a bare Exception; message added for debugging)
            raise Exception("Node has no arcs and is not accepting")
    self.node_count += 1

    buf = StructFile(BytesIO())
    nodestart = dbfile.tell()

    # Track whether every arc serializes to the same byte size; if so we
    # can emit a fixed-size header that allows random access to the arcs.
    fixedsize = -1
    arcstart = buf.tell()
    for i, arc in enumerate(arcs):
        self.arc_count += 1
        target = arc.target
        label = arc.label

        # Build the flag byte describing this arc
        flags = 0
        if len(label) > 1:
            flags += MULTIBYTE_LABEL
        if i == numarcs - 1:
            flags += ARC_LAST
        if arc.accept:
            flags += ARC_ACCEPT
        if target is None:
            flags += ARC_STOP
        if arc.value is not None:
            flags += ARC_HAS_VAL
        if arc.acceptval is not None:
            flags += ARC_HAS_ACCEPT_VAL

        buf.write(pack_byte(flags))
        if len(label) > 1:
            buf.write(varint(len(label)))
        buf.write(label)
        if target is not None:
            buf.write(pack_uint(target))
        if arc.value is not None:
            vtype.write(buf, arc.value)
        if arc.acceptval is not None:
            vtype.write(buf, arc.acceptval)

        here = buf.tell()
        thissize = here - arcstart
        arcstart = here
        if fixedsize == -1:
            fixedsize = thissize
        elif fixedsize > 0 and thissize != fixedsize:
            fixedsize = 0

    if fixedsize > 0:
        # Write a fake arc containing the fixed size and number of arcs
        dbfile.write_byte(255)  # FIXED_SIZE
        dbfile.write_int(fixedsize)
        dbfile.write_int(numarcs)
        self.fixed_count += 1

    dbfile.write(buf.file.getvalue())
    return nodestart
def write_uint(self, n):
    """Write ``n`` to the wrapped file as a packed unsigned int."""
    data = pack_uint(n)
    self.file.write(data)
def encode(self, freq):
    """Return the term frequency packed as an unsigned int value string."""
    encoded = pack_uint(freq)
    return encoded