Example #1
def test_pickle_schema():
    from whoosh import analysis
    from whoosh.support.charset import accent_map
    from whoosh.compat import dumps

    freetext_analyzer = (analysis.StemmingAnalyzer()
                         | analysis.CharsetFilter(accent_map))

    schema = fields.Schema(path=fields.ID(stored=True, unique=True),
                           file_mtime=fields.DATETIME(stored=True),
                           name=fields.TEXT(stored=False, field_boost=2.0),
                           description=fields.TEXT(stored=False,
                                                   field_boost=1.5,
                                                   analyzer=freetext_analyzer),
                           content=fields.TEXT(analyzer=freetext_analyzer))

    # Try to make some sentences that will require stemming
    docs = [
        u"The rain in spain falls mainly in the plain",
        u"Plainly sitting on the plain",
        u"Imagine a greatly improved sentence here"
    ]

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for doc in docs:
                w.add_document(description=doc, content=doc)

        assert dumps(schema, 2)

        with ix.reader() as r:
            assert dumps(r.schema, 2)
Example #2
def test_pickle_schema():
    from whoosh import analysis
    from whoosh.support.charset import accent_map
    from whoosh.compat import dumps

    freetext_analyzer = (
        analysis.StemmingAnalyzer() |
        analysis.CharsetFilter(accent_map)
    )

    schema = fields.Schema(
        path=fields.ID(stored=True, unique=True),
        file_mtime=fields.DATETIME(stored=True),
        name=fields.TEXT(stored=False, field_boost=2.0),
        description=fields.TEXT(stored=False, field_boost=1.5,
                                analyzer=freetext_analyzer),
        content=fields.TEXT(analyzer=freetext_analyzer)
    )

    # Try to make some sentences that will require stemming
    docs = [
        u"The rain in spain falls mainly in the plain",
        u"Plainly sitting on the plain",
        u"Imagine a greatly improved sentence here"
    ]

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for doc in docs:
                w.add_document(description=doc, content=doc)

        assert dumps(schema, 2)

        with ix.reader() as r:
            assert dumps(r.schema, 2)
Example #3
def test_charset_pickeability():
    from whoosh.support import charset
    charmap = charset.charset_table_to_dict(charset.default_charset)
    ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap)
    _ = dumps(ana, -1)

    ana = analysis.CharsetTokenizer(charmap)
    _ = dumps(ana, -1)
Example #5
    def _write_block(self, last=False):
        # Write the buffered block to the postings file

        # If this is the first block, write a small header first
        if not self._blockcount:
            self._postfile.write(WHOOSH3_HEADER_MAGIC)

        # Add this block's statistics to the terminfo object, which tracks the
        # overall statistics for all term postings
        self._terminfo.add_block(self)

        # Minify the IDs, weights, and values, and put them in a tuple
        data = (self._mini_ids(), self._mini_weights(), self._mini_values())
        # Pickle the tuple
        databytes = dumps(data)
        # Compress the pickle (if self._compression > 0), unless it is
        # shorter than 20 bytes, in which case compression isn't worth it
        comp = self._compression
        if comp and len(databytes) < 20:
            comp = 0
        if comp:
            databytes = zlib.compress(databytes, comp)

        # Make a tuple of block info. The posting reader can check this info
        # and decide whether to skip the block without having to decompress the
        # full block data
        #
        # - Number of postings in block
        # - Last ID in block
        # - Maximum weight in block
        # - Compression level
        # - Minimum length byte
        # - Maximum length byte
        ids = self._ids
        infobytes = dumps((
            len(ids),
            ids[-1],
            self._maxweight,
            comp,
            length_to_byte(self._minlength),
            length_to_byte(self._maxlength),
        ))

        # Write block length
        postfile = self._postfile
        blocklength = len(infobytes) + len(databytes)
        if last:
            # If this is the last block, use a negative number
            blocklength *= -1
        postfile.write_int(blocklength)
        # Write block info
        postfile.write(infobytes)
        # Write block data
        postfile.write(databytes)

        self._blockcount += 1
        # Reset block buffer
        self._new_block()
Example #6
    def _write_block(self, last=False):
        # Write the buffered block to the postings file

        # If this is the first block, write a small header first
        if not self._blockcount:
            self._postfile.write(WHOOSH3_HEADER_MAGIC)

        # Add this block's statistics to the terminfo object, which tracks the
        # overall statistics for all term postings
        self._terminfo.add_block(self)

        # Minify the IDs, weights, and values, and put them in a tuple
        data = (self._mini_ids(), self._mini_weights(), self._mini_values())
        # Pickle the tuple
        databytes = dumps(data)
        # Compress the pickle (if self._compression > 0), unless it is
        # shorter than 20 bytes, in which case compression isn't worth it
        comp = self._compression
        if comp and len(databytes) < 20:
            comp = 0
        if comp:
            databytes = zlib.compress(databytes, comp)

        # Make a tuple of block info. The posting reader can check this info
        # and decide whether to skip the block without having to decompress the
        # full block data
        #
        # - Number of postings in block
        # - Last ID in block
        # - Maximum weight in block
        # - Compression level
        # - Minimum length byte
        # - Maximum length byte
        ids = self._ids
        infobytes = dumps((len(ids), ids[-1], self._maxweight, comp,
                           length_to_byte(self._minlength),
                           length_to_byte(self._maxlength),
                           ))

        # Write block length
        postfile = self._postfile
        blocklength = len(infobytes) + len(databytes)
        if last:
            # If this is the last block, use a negative number
            blocklength *= -1
        postfile.write_int(blocklength)
        # Write block info
        postfile.write(infobytes)
        # Write block data
        postfile.write(databytes)

        self._blockcount += 1
        # Reset block buffer
        self._new_block()
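The sign trick on the block length is what lets a reader stream blocks without an explicit block count: every block is preceded by its byte length, and a negative length flags the final block of the term. A minimal sketch of the corresponding reader loop, assuming a read_int that mirrors the writer's write_int (the loop is illustrative, not Whoosh's actual reader):

def iter_blocks(postfile):
    # Walk the block stream produced by _write_block() above
    while True:
        blocklength = postfile.read_int()
        last = blocklength < 0
        # The length covers the pickled info tuple plus the block data
        yield postfile.read(abs(blocklength)), last
        if last:
            break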
Example #7
 def encode(self, poslist):
     deltas = []
     base = 0
     for pos in poslist:
         deltas.append(pos - base)
         base = pos
     return pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
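A recurring idiom above is dumps(x, -1)[2:-1]: on the Python 2 interpreters this code targeted, protocol -1 selected protocol 2, whose pickles start with a two-byte PROTO header and end with the one-byte STOP opcode ("."), so the slice trims exactly that framing. Decoding only has to re-append the STOP byte, because the unpickler treats the PROTO header as optional. A minimal round-trip sketch with the protocol pinned to 2 and illustrative helper names (not Whoosh's actual pack_uint/decode API):

from pickle import dumps, loads
from struct import pack, unpack

def encode_positions(poslist):
    # Same delta scheme as the encode() above; protocol pinned to 2 so
    # the [2:-1] slice reliably strips the PROTO header and STOP opcode
    deltas = []
    base = 0
    for pos in poslist:
        deltas.append(pos - base)
        base = pos
    return pack("!I", len(deltas)) + dumps(deltas, 2)[2:-1]

def decode_positions(data):
    # Re-append STOP ("."); pickle tolerates the missing PROTO header
    deltas = loads(data[4:] + b".")
    assert len(deltas) == unpack("!I", data[:4])[0]
    positions, base = [], 0
    for delta in deltas:
        base += delta
        positions.append(base)
    return positions

assert decode_positions(encode_positions([1, 5, 23])) == [1, 5, 23]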
Example #8
def test_pickleability():
    # Ignore base classes
    ignore = (columns.Column, columns.WrappedColumn, columns.ListColumn)
    # Required arguments
    init_args = {
        "ClampedNumericColumn": (columns.NumericColumn("B"), ),
        "FixedBytesColumn": (5, ),
        "FixedBytesListColumn": (5, ),
        "NumericColumn": ("i", ),
        "PickleColumn": (columns.VarBytesColumn(), ),
        "StructColumn": ("=if", (0, 0.0)),
    }

    coltypes = [
        c for _, c in inspect.getmembers(columns, inspect.isclass)
        if issubclass(c, columns.Column) and c not in ignore
    ]

    for coltype in coltypes:
        args = init_args.get(coltype.__name__, ())
        try:
            inst = coltype(*args)
        except TypeError:
            e = sys.exc_info()[1]
            raise TypeError("Error instantiating %r: %s" % (coltype, e))
        _ = loads(dumps(inst, -1))
Example #9
 def encode(self, positions):
     codes = []
     base = 0
     for pos in positions:
         codes.append(pos - base)
         base = pos
     return pack_uint(len(codes)) + dumps(codes, -1)[2:-1]
Example #12
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))

        for w, poses in iteritems(seen):
            # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
            codes = []
            posbase = 0
            charbase = 0
            summedboost = 0
            for pos, startchar, endchar, boost in poses:
                codes.append((pos - posbase, startchar - charbase,
                              endchar - startchar, boost))
                posbase = pos
                charbase = endchar
                summedboost += boost

            value = (pack_uint(len(poses)) + pack_float(summedboost * fb)
                     + dumps(codes, -1)[2:-1])

            yield (w, len(poses), summedboost * fb, value)
Example #13
    def to_string(self):
        # Encode the lengths as 0-255 values
        ml = 0 if self._minlength is None else length_to_byte(self._minlength)
        xl = length_to_byte(self._maxlength)
        # Convert None values to the out-of-band NO_ID constant so they can be
        # stored as unsigned ints
        mid = NO_ID if self._minid is None else self._minid
        xid = NO_ID if self._maxid is None else self._maxid

        # Pack the term info into bytes
        st = self.struct.pack(self._weight, self._df, ml, xl, self._maxweight,
                              0, mid, xid)

        if isinstance(self.postings, tuple):
            # Postings are inlined - dump them using the pickle protocol
            isinlined = 1
            st += dumps(self.postings, -1)[2:-1]
        else:
            # Append postings pointer as long to end of term info bytes
            isinlined = 0
            # It's possible for a term info to not have a pointer to postings
            # on disk, in which case postings will be None. Convert a None
            # value to -1 so it can be stored as a long.
            p = -1 if self.postings is None else self.postings
            st += pack_long(p)

        # Prepend byte indicating whether the postings are inlined to the term
        # info bytes
        return pack("B", isinlined) + st
Example #14
 def encode(self, poslist):
     deltas = []
     posbase = 0
     charbase = 0
     for pos, startchar, endchar in poslist:
         deltas.append((pos - posbase, startchar - charbase,
                        endchar - startchar))
         posbase = pos
         charbase = endchar
     return pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
Example #15
 def encode(self, poses):
     codes = []
     base = 0
     summedboost = 0
     for pos, boost in poses:
         summedboost += boost
         codes.append((pos - base, boost))
         base = pos
     return (pack_uint(len(poses)) + pack_float(summedboost)
             + dumps(codes, -1)[2:-1])
Example #16
def minimize_values(postingsize, values, compression=0):
    if postingsize < 0:
        string = dumps(values, -1)[2:]
    elif postingsize == 0:
        string = b('')
    else:
        string = b('').join(values)
    if string and compression:
        string = compress(string, compression)
    return string
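minimize_values has a straightforward inverse. Below is a sketch of what deminimization could look like under the same conventions (negative postingsize means the values were pickled with the two header bytes stripped, zero means no value data, positive means fixed-size byte strings); the function name and signature are illustrative, not Whoosh's actual API:

from pickle import loads
from zlib import decompress

def restore_values(postingsize, count, string, compression=0):
    # Undo the optional compression first
    if string and compression:
        string = decompress(string)
    if postingsize < 0:
        # dumps(values, -1)[2:] kept the STOP opcode, so loads works as-is
        return list(loads(string))
    elif postingsize == 0:
        # Nothing was stored per posting
        return [None] * count
    else:
        # Fixed-size values were concatenated back to back
        return [string[i:i + postingsize]
                for i in range(0, len(string), postingsize)]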
Example #17
 def encode(self, poslist):
     deltas = []
     posbase = 0
     charbase = 0
     for pos, startchar, endchar in poslist:
         deltas.append(
             (pos - posbase, startchar - charbase, endchar - startchar))
         posbase = pos
         charbase = endchar
     return pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
Example #18
 def encode(self, poses):
     codes = []
     base = 0
     summedboost = 0
     for pos, boost in poses:
         summedboost += boost
         codes.append((pos - base, boost))
         base = pos
     return (pack_uint(len(poses)) + pack_float(summedboost) +
             dumps(codes, -1)[2:-1])
Example #19
 def encode(self, posns_chars):
     # posns_chars = [(pos, startchar, endchar), ...]
     codes = []
     posbase = 0
     charbase = 0
     for pos, startchar, endchar in posns_chars:
         codes.append((pos - posbase, startchar - charbase,
                       endchar - startchar))
         posbase = pos
         charbase = endchar
     return pack_uint(len(posns_chars)) + dumps(codes, -1)[2:-1]
Example #20
 def encode(self, posns_chars):
     # posns_chars = [(pos, startchar, endchar), ...]
     codes = []
     posbase = 0
     charbase = 0
     for pos, startchar, endchar in posns_chars:
         codes.append(
             (pos - posbase, startchar - charbase, endchar - startchar))
         posbase = pos
         charbase = endchar
     return pack_uint(len(posns_chars)) + dumps(codes, -1)[2:-1]
Example #21
    def encode(self, posns_chars_boosts):
        # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
        codes = []
        posbase = 0
        charbase = 0
        summedboost = 0
        for pos, startchar, endchar, boost in posns_chars_boosts:
            codes.append((pos - posbase, startchar - charbase,
                          endchar - startchar, boost))
            posbase = pos
            charbase = endchar
            summedboost += boost

        return (pack_uint(len(posns_chars_boosts)) + pack_float(summedboost)
                + dumps(codes, -1)[2:-1])
Example #22
    def encode(self, posns_chars_boosts):
        # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
        codes = []
        posbase = 0
        charbase = 0
        summedboost = 0
        for pos, startchar, endchar, boost in posns_chars_boosts:
            codes.append((pos - posbase, startchar - charbase,
                          endchar - startchar, boost))
            posbase = pos
            charbase = endchar
            summedboost += boost

        return (pack_uint(len(posns_chars_boosts)) + pack_float(summedboost) +
                dumps(codes, -1)[2:-1])
Example #23
    def encode(self, poses):
        fb = self.field_boost
        # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
        codes = []
        posbase = 0
        charbase = 0
        summedboost = 0
        for pos, startchar, endchar, boost in poses:
            codes.append((pos - posbase, startchar - charbase,
                          endchar - startchar, boost))
            posbase = pos
            charbase = endchar
            summedboost += boost

        return ((pack_uint(len(poses)) + pack_float(summedboost * fb)
                 + dumps(codes, 2)), summedboost)
Example #24
    def encode(self, poses):
        fb = self.field_boost
        # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
        codes = []
        posbase = 0
        charbase = 0
        summedboost = 0
        for pos, startchar, endchar, boost in poses:
            codes.append((pos - posbase, startchar - charbase,
                          endchar - startchar, boost))
            posbase = pos
            charbase = endchar
            summedboost += boost

        return ((pack_uint(len(poses)) + pack_float(summedboost * fb) +
                 dumps(codes, 2)), summedboost)
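Unlike the earlier encoders, these last two variants keep the complete pickle: dumps(codes, 2) with no [2:-1] slice, so a decoder can call loads on the tail directly instead of splicing the STOP opcode back in. A quick illustration with plain pickle/struct (the layout is read off the snippet; the byte order is an assumption):

import pickle
import struct

codes = [(0, 0, 4, 1.0), (2, 5, 3, 1.5)]
summedboost = sum(c[3] for c in codes)
blob = (struct.pack("!I", len(codes)) + struct.pack("!f", summedboost)
        + pickle.dumps(codes, 2))
# PROTO header and STOP opcode are intact: no byte surgery needed to decode
assert pickle.loads(blob[8:]) == codes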
Example #25
    def append(self, values):
        f = self.dbfile

        name_map = self.name_map

        vlist = [None] * len(name_map)
        for k, v in iteritems(values):
            if k in name_map:
                vlist[name_map[k]] = v
            else:
                # For dynamic stored fields, put them at the end of the list
                # as a tuple of (fieldname, value)
                vlist.append((k, v))

        v = dumps(vlist, -1)[2:-1]
        self.length += 1
        self.directory.append(pack_stored_pointer(f.tell(), len(v)))
        f.write(v)
Example #26
    def add(self, vdict):
        f = self.dbfile
        names = self.names
        name_map = self.name_map

        vlist = [None] * len(names)
        for k, v in iteritems(vdict):
            if k in name_map:
                vlist[name_map[k]] = v
            else:
                name_map[k] = len(names)
                names.append(k)
                vlist.append(v)

        vstring = dumps(tuple(vlist), -1)[2:-1]
        self.length += 1
        self.directory.append(pack_stored_pointer(f.tell(), len(vstring)))
        f.write(vstring)
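Since add() strips the pickle framing with [2:-1], the matching reader has to restore the STOP opcode before unpickling, then pair the positional slots back up with field names. A rough sketch (the pointer handling and all names here are assumed, not Whoosh's actual reader):

from pickle import loads

def read_stored(dbfile, offset, length, names):
    dbfile.seek(offset)
    vstring = dbfile.read(length)
    vlist = loads(vstring + b".")  # re-append the STOP opcode
    # Skip slots this document never stored (left as None placeholders)
    return dict((name, v) for name, v in zip(names, vlist)
                if v is not None)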
Example #28
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        poses = defaultdict(list)
        weights = defaultdict(float)
        kwargs["positions"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            poses[t.text].append(t.pos)
            weights[t.text] += t.boost

        for w, poslist in iteritems(poses):
            deltas = []
            base = 0
            for pos in poslist:
                deltas.append(pos - base)
                base = pos
            value = pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
            yield (w, len(poslist), weights[w] * fb, value)
Example #29
    def to_file(self, postfile, compression=3):
        ids = self.ids
        idcode, idstring = minimize_ids(ids, self.stringids, compression)
        wtstring = minimize_weights(self.weights, compression)
        vstring = minimize_values(self.postingsize, self.values, compression)

        info = (len(ids), ids[-1], self.maxweight,
                length_to_byte(self.minlength), length_to_byte(self.maxlength),
                idcode, compression, len(idstring), len(wtstring))
        infostring = dumps(info, -1)

        # Offset to next block
        postfile.write_uint(
            len(infostring) + len(idstring) + len(wtstring) + len(vstring))
        # Block contents
        postfile.write(infostring)
        postfile.write(idstring)
        postfile.write(wtstring)
        postfile.write(vstring)
Example #30
    def to_file(self, postfile, compression=3):
        ids = self.ids
        idcode, idstring = minimize_ids(ids, self.stringids, compression)
        wtstring = minimize_weights(self.weights, compression)
        vstring = minimize_values(self.postingsize, self.values, compression)

        info = (len(ids), ids[-1], self.maxweight,
                length_to_byte(self.minlength), length_to_byte(self.maxlength),
                idcode, compression, len(idstring), len(wtstring))
        infostring = dumps(info, -1)

        # Offset to next block
        postfile.write_uint(len(infostring) + len(idstring) + len(wtstring)
                            + len(vstring))
        # Block contents
        postfile.write(infostring)
        postfile.write(idstring)
        postfile.write(wtstring)
        postfile.write(vstring)
Example #31
def minimize_ids(arry, stringids, compression=0):
    amax = arry[-1]

    if stringids:
        typecode = ''
        string = dumps(arry)
    else:
        # Start from the array's own typecode, then downsize if the
        # largest ID fits in a smaller unsigned type
        typecode = arry.typecode
        if amax <= 255:
            typecode = "B"
        elif amax <= 65535:
            typecode = "H"
        if typecode != arry.typecode:
            arry = array(typecode, iter(arry))
        if not IS_LITTLE:
            arry.byteswap()
        string = arry.tostring()
    if compression:
        string = compress(string, compression)
    return (typecode, string)
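The inverse of minimize_ids follows the same branches in reverse. The sketch below is illustrative only (name, signature, and the frombytes spelling are assumptions):

import sys
from array import array
from pickle import loads
from zlib import decompress

def restore_ids(typecode, string, compression=0):
    if compression:
        string = decompress(string)
    if not typecode:
        # String IDs kept their full pickle, header and STOP included
        return loads(string)
    arry = array(typecode)
    arry.frombytes(string)  # modern spelling of the old fromstring()
    if sys.byteorder != "little":
        arry.byteswap()
    return arry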
Example #32
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)

        kwargs["positions"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            pos = t.pos
            boost = t.boost
            seen[t.text].append((pos, boost))

        for w, poses in iteritems(seen):
            codes = []
            base = 0
            summedboost = 0
            for pos, boost in poses:
                summedboost += boost
                codes.append((pos - base, boost))
                base = pos
            value = (pack_uint(len(poses)) + pack_float(summedboost)
                     + dumps(codes, -1)[2:-1])
            yield (w, len(poses), summedboost * fb, value)
Example #33
def test_pickleability():
    # Ignore base classes
    ignore = (columns.Column, columns.WrappedColumn, columns.ListColumn)
    # Required arguments
    init_args = {"ClampedNumericColumn": (columns.NumericColumn("B"),),
                 "FixedBytesColumn": (5,),
                 "FixedBytesListColumn": (5,),
                 "NumericColumn": ("i",),
                 "PickleColumn": (columns.VarBytesColumn(),),
                 "StructColumn": ("=if", (0, 0.0)),
                 }

    coltypes = [c for _, c in inspect.getmembers(columns, inspect.isclass)
                if issubclass(c, columns.Column) and c not in ignore]

    for coltype in coltypes:
        args = init_args.get(coltype.__name__, ())
        try:
            inst = coltype(*args)
        except TypeError:
            e = sys.exc_info()[1]
            raise TypeError("Error instantiating %r: %s" % (coltype, e))
        _ = loads(dumps(inst, -1))
Example #34
    def word_values(self, value, analyzer, **kwargs):
        fb = self.field_boost
        seen = defaultdict(list)
        weights = defaultdict(float)

        kwargs["positions"] = True
        kwargs["chars"] = True
        kwargs["boosts"] = True
        for t in tokens(value, analyzer, kwargs):
            seen[t.text].append((t.pos, t.startchar, t.endchar))
            weights[t.text] += t.boost

        for w, poslist in iteritems(seen):
            deltas = []
            posbase = 0
            charbase = 0
            for pos, startchar, endchar in poslist:
                deltas.append((pos - posbase, startchar - charbase,
                               endchar - startchar))
                posbase = pos
                charbase = endchar
            value = pack_uint(len(deltas)) + dumps(deltas, -1)[2:-1]
            yield (w, len(poslist), weights[w] * fb, value)
Example #35
    def to_bytes(self):
        isinlined = self.is_inlined()

        # Encode the lengths as 0-255 values
        minlength = (0 if self._minlength is None else length_to_byte(
            self._minlength))
        maxlength = length_to_byte(self._maxlength)
        # Convert None values to the out-of-band NO_ID constant so they can be
        # stored as unsigned ints
        minid = 0xffffffff if self._minid is None else self._minid
        maxid = 0xffffffff if self._maxid is None else self._maxid

        # Pack the term info into bytes
        st = self._struct.pack(isinlined, self._weight, self._df, minlength,
                               maxlength, self._maxweight, minid, maxid)

        if isinlined:
            # Postings are inlined - dump them using the pickle protocol
            postbytes = dumps(self._inlined, -1)
        else:
            postbytes = pack_long(self._offset) + pack_int(self._length)
        st += postbytes
        return st
Example #36
    def to_bytes(self):
        isinlined = self.is_inlined()

        # Encode the lengths as 0-255 values
        minlength = (0 if self._minlength is None
                     else length_to_byte(self._minlength))
        maxlength = length_to_byte(self._maxlength)
        # Convert None values to the out-of-band NO_ID constant so they can be
        # stored as unsigned ints
        minid = 0xffffffff if self._minid is None else self._minid
        maxid = 0xffffffff if self._maxid is None else self._maxid

        # Pack the term info into bytes
        st = self._struct.pack(isinlined, self._weight, self._df,
                               minlength, maxlength, self._maxweight,
                               minid, maxid)

        if isinlined:
            # Postings are inlined - dump them using the pickle protocol
            postbytes = dumps(self._inlined, -1)
        else:
            postbytes = pack_long(self._offset) + pack_int(self._length)
        st += postbytes
        return st
Example #37
def test_la_pickleability():
    ana = analysis.LanguageAnalyzer("en")
    _ = dumps(ana, -1)
Example #38
 def add(self, docnum, v):
     if v is None:
         v = emptybytes
     else:
         v = dumps(v, -1)
     self._child.add(docnum, v)
Example #39
 def add_field(self, fieldname, fieldobj, value, length):
     if value is not None:
         value = dumps(value, -1)
     self._print_line(2, "DOCFIELD", fn=fieldname, v=value, len=length)
Example #42
    def write(self, compression=3):
        postfile = self.postfile
        stringids = self.stringids
        ids = self.ids
        weights = self.weights
        values = self.values
        postcount = len(ids)

        if postcount <= 4 or not can_compress:
            compression = 0

        # Max ID
        maxid = ids[-1]
        if stringids:
            maxid_string = dumps(maxid, -1)[2:]
        else:
            maxid_string = pack_uint(maxid)

        # IDs
        typecode = "I"
        if stringids:
            ids_string = dumps(ids, -1)[2:]
            typecode = "s"
        else:
            if maxid <= 255:
                typecode = "B"
            elif maxid <= 65535:
                typecode = "H"
            if typecode != ids.typecode:
                ids = array(typecode, iter(ids))
            if not IS_LITTLE:
                ids.byteswap()
            ids_string = ids.tostring()
        if compression:
            ids_string = compress(ids_string, compression)

        # Weights
        if all(w == 1.0 for w in weights):
            weights_string = b('')
        else:
            if not IS_LITTLE:
                weights.byteswap()
            weights_string = weights.tostring()
        if weights_string and compression:
            weights_string = compress(weights_string, compression)

        # Values
        postingsize = self.postingsize
        if postingsize < 0:
            values_string = dumps(values, -1)[2:]
        elif postingsize == 0:
            values_string = b('')
        else:
            values_string = b("").join(values)
        if values_string and compression:
            values_string = compress(values_string, compression)

        # Header
        flags = 1 if compression else 0
        blocksize = sum((self._struct.size, len(maxid_string), len(ids_string),
                         len(weights_string), len(values_string)))
        header = self._struct.pack(blocksize, flags, postcount,
                                   typecode.encode('latin-1'), 0,
                                   len(ids_string), len(weights_string),
                                   self.max_weight(), self.max_wol(), 0, 0,
                                   self._maxlength, self._minlength or 0)

        postfile.write(header)
        postfile.write(maxid_string)
        postfile.write(ids_string)
        postfile.write(weights_string)
        postfile.write(values_string)
Example #44
 def add(self, docnum, v):
     if v is None:
         v = emptybytes
     else:
         v = dumps(v, 2)
     self._child.add(docnum, v)
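The last two variants of add() differ only in the protocol passed to dumps: -1 tracks the highest protocol of the running interpreter (2 on Python 2.7, 5 on Python 3.8+), while an explicit 2 pins the on-disk byte format, which matters when an index must stay readable across interpreter versions. Both decode the same way:

import pickle

payload = {"docnum": 42, "value": b"\x00\x01"}
latest = pickle.dumps(payload, -1)  # follows pickle.HIGHEST_PROTOCOL
pinned = pickle.dumps(payload, 2)   # byte format fixed regardless of version
assert pickle.loads(latest) == pickle.loads(pinned) == payload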