class FieldedOrderedHashWriter(HashWriter):
    """On-disk hash writer that additionally records, for each field, a
    separate index of the file positions of that field's keys.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # fieldname -> (startpos, indexpos, length, typecode) for the
        # field's key-position index
        self.fieldmap = self.extras["fieldmap"] = {}
        # Last key written; keys must be added in strictly increasing order
        self.lastkey = emptybytes

    def start_field(self, fieldname):
        # Remember where this field's entries begin in the file
        self.fieldstart = self.dbfile.tell()
        self.fieldname = fieldname
        # Positions (relative to fieldstart) of each key in this field
        self.poses = GrowableArray("H")
        # Reset the ordering check for the new field
        self.lastkey = emptybytes

    def add(self, key, value):
        if key <= self.lastkey:
            raise ValueError("Keys must increase: %r..%r"
                             % (self.lastkey, key))
        self.poses.append(self.dbfile.tell() - self.fieldstart)
        HashWriter.add(self, key, value)
        self.lastkey = key

    def end_field(self):
        # Record this field's metadata, then append its position index
        endpos = self.dbfile.tell()
        self.fieldmap[self.fieldname] = (self.fieldstart, endpos,
                                         len(self.poses),
                                         self.poses.typecode)
        self.poses.to_file(self.dbfile)
class Writer(ColumnWriter):
    """Writes variable-length byte values back to back, followed by an
    array of their lengths and a trailing byte giving the array typecode.
    """

    def __init__(self, dbfile):
        assert isinstance(dbfile, StructFile)
        self._dbfile = dbfile
        # Number of documents added so far
        self._count = 0
        # Byte length of each stored value
        self._lengths = GrowableArray(allow_longs=False)

    def __repr__(self):
        return "<VarBytes.Writer>"

    def fill(self, docnum):
        # Pad with zero-length entries so every document up to (but not
        # including) docnum has a length recorded
        missing = docnum - self._count
        if missing > 0:
            self._lengths.extend(0 for _ in xrange(missing))

    def add(self, docnum, v):
        self.fill(docnum)
        self._dbfile.write(v)
        self._lengths.append(len(v))
        self._count = docnum + 1

    def finish(self, doccount):
        self.fill(doccount)
        arr = self._lengths.array
        self._dbfile.write_array(arr)
        # The column's final byte records the typecode of the lengths array
        self._dbfile.write_byte(ord(arr.typecode))
class OrderedHashWriter(HashWriter):
    """On-disk hash writer that requires keys to be added in increasing
    order. An :class:`OrderedHashReader` can then look up "nearest keys"
    based on the stored ordering.
    """

    def __init__(self, dbfile):
        HashWriter.__init__(self, dbfile)
        # File position of every key, in insertion order
        self.index = GrowableArray("H")
        # Most recently added key, used to enforce the ordering
        self.lastkey = emptybytes

    def add(self, key, value):
        if key <= self.lastkey:
            raise ValueError("Keys must increase: %r..%r"
                             % (self.lastkey, key))
        self.index.append(self.dbfile.tell())
        HashWriter.add(self, key, value)
        self.lastkey = key

    def _write_extras(self):
        # Record the index array's metadata in the extras dict...
        self.extras["indextype"] = self.index.typecode
        self.extras["indexlen"] = len(self.index)
        # ...write the extras...
        HashWriter._write_extras(self)
        # ...then append the position array itself
        self.index.to_file(self.dbfile)
def __init__(self, dbfile, allow_offsets=True, cutoff=2**15):
    """Prepare to write a variable-length byte column.

    :param dbfile: the StructFile to write to.
    :param allow_offsets: whether an offsets array may be written in
        addition to the lengths array.
    :param cutoff: document-count threshold above which offsets are
        written (presumably checked in finish() — confirm against caller).
    """

    assert isinstance(dbfile, StructFile)
    self._dbfile = dbfile
    # Per-document value lengths and starting offsets
    self._lengths = GrowableArray(allow_longs=False)
    self._offsets = GrowableArray(allow_longs=False)
    # Running byte total: the offset of the next value to be written
    self._offset_base = 0
    # Documents added so far
    self._count = 0
    self.allow_offsets = allow_offsets
    self.cutoff = cutoff
class Writer(ColumnWriter):
    """Writes variable-length byte values, followed by an array of value
    lengths, optionally an array of value offsets, and trailing typecode
    byte(s) identifying what was written.
    """

    def __init__(self, dbfile, allow_offsets=True, cutoff=2**15):
        """
        :param dbfile: the StructFile to write to.
        :param allow_offsets: whether finish() may write an offsets array.
        :param cutoff: document-count threshold above which offsets are
            written instead of being derived from the lengths at read time.
        """

        assert isinstance(dbfile, StructFile)
        self._dbfile = dbfile
        self._count = 0
        self._lengths = GrowableArray(allow_longs=False)
        self._offsets = GrowableArray(allow_longs=False)
        # Running total of bytes written; the offset of the next value
        self._offset_base = 0
        self.allow_offsets = allow_offsets
        self.cutoff = cutoff

    def __repr__(self):
        return "<VarBytes.Writer>"

    def fill(self, docnum):
        # Pad missing documents with zero-length values (all sharing the
        # current offset) so every document up to docnum has an entry
        base = self._offset_base
        if docnum > self._count:
            self._lengths.extend(0 for _ in xrange(docnum - self._count))
            self._offsets.extend(base for _ in xrange(docnum - self._count))

    def add(self, docnum, v):
        self.fill(docnum)
        self._dbfile.write(v)
        self._lengths.append(len(v))
        self._offsets.append(self._offset_base)
        self._offset_base += len(v)
        self._count = docnum + 1

    def finish(self, doccount):
        dbfile = self._dbfile
        self.fill(doccount)
        # FIX: take the underlying arrays only AFTER filling. Extending a
        # GrowableArray may replace its backing array (e.g. on a typecode
        # upgrade), which would leave a snapshot taken before fill() stale.
        lengths = self._lengths.array
        offsets = self._offsets.array

        dbfile.write_array(lengths)

        # Only write the offsets if there is a large number of items in the
        # column, otherwise it's fast enough to derive them from the lens
        write_offsets = self.allow_offsets and doccount > self.cutoff
        if write_offsets:
            dbfile.write_array(offsets)

        # Backwards compatibility: previous versions only wrote the lengths,
        # and the last byte of the column was the lengths type code...
        dbfile.write(lengths.typecode.encode("ascii"))
        # ...but if we wrote offsets, make the last byte "X" so we know
        if write_offsets:
            dbfile.write(offsets.typecode.encode("ascii"))
            dbfile.write("X".encode("ascii"))
def __init__(self, dbfile):
    """Prepare to write a byte column to *dbfile* (a StructFile)."""
    assert isinstance(dbfile, StructFile)
    # Byte length of each value written so far
    self._lengths = GrowableArray(allow_longs=False)
    # Documents added so far
    self._count = 0
    self._dbfile = dbfile
def __init__(self, dbfile):
    """Initialize the ordered hash writer over *dbfile*."""
    HashWriter.__init__(self, dbfile)
    # Sentinel: no key added yet; keys must strictly increase
    self.lastkey = emptybytes
    # File position of each added key, in insertion order
    self.index = GrowableArray("H")
def start_field(self, fieldname):
    """Begin accumulating keys for *fieldname*."""
    # Note where this field's data starts in the file
    self.fieldstart = self.dbfile.tell()
    self.fieldname = fieldname
    # Positions of this field's keys (relative to fieldstart)
    self.poses = GrowableArray("H")
    # Reset the key-ordering check for the new field
    self.lastkey = emptybytes
def __init__(self, child, fixedlen):
    # Wrapped writer that values are presumably forwarded to — confirm
    # against the rest of the class
    self._child = child
    self._fixedlen = fixedlen
    # Documents added so far
    self._count = 0
    # Length recorded for each value
    self._lengths = GrowableArray()