Example #1
    def to_string(self):
        # Encode the lengths as 0-255 values
        ml = 0 if self._minlength is None else length_to_byte(self._minlength)
        xl = length_to_byte(self._maxlength)
        # Convert None values to the out-of-band NO_ID constant so they can be
        # stored as unsigned ints
        mid = NO_ID if self._minid is None else self._minid
        xid = NO_ID if self._maxid is None else self._maxid

        # Pack the term info into bytes
        st = self.struct.pack(self._weight, self._df, ml, xl, self._maxweight,
                              0, mid, xid)

        if isinstance(self.postings, tuple):
            # Postings are inlined - dump them using the pickle protocol
            isinlined = 1
            st += dumps(self.postings, -1)[2:-1]
        else:
            # Append postings pointer as long to end of term info bytes
            isinlined = 0
            # It's possible for a term info to not have a pointer to postings
            # on disk, in which case postings will be None. Convert a None
            # value to -1 so it can be stored as a long.
            p = -1 if self.postings is None else self.postings
            st += pack_long(p)

        # Prepend byte indicating whether the postings are inlined to the term
        # info bytes
        return pack("B", isinlined) + st
Example #2
    def to_file(self, file, stringids=False):
        flags = 1

        self._blockstart = file.tell()
        self._pointer_pos = self._blockstart + 4
        file.write(
            self._struct.pack(
                flags,
                0,
                0,  # unused B, H
                self.nextoffset,
                self.idslen,
                self.weightslen,
                self.postcount,
                self.maxweight,
                self.maxwol,
                0,
                length_to_byte(self.minlength)))

        # Write the maximum ID after the header. We have to do this
        # separately because it might be a string (in the case of a vector)
        if stringids:
            file.write_string(utf8encode(self.maxid)[0])
        else:
            file.write_uint(self.maxid)
Example #3
def test_block():
    st = RamStorage()
    f = st.create_file("postfile")

    b = current(f, 0)
    b.append(0, 1.0, '', 1)
    b.append(1, 2.0, '', 2)
    b.append(2, 12.0, '', 6)
    b.append(5, 6.5, '', 420)
    assert b

    assert_equal(len(b), 4)
    assert_equal(list(b.ids), [0, 1, 2, 5])
    assert_equal(list(b.weights), [1.0, 2.0, 12.0, 6.5])
    assert_equal(b.values, None)
    assert_equal(b.min_length(), 1)
    assert_equal(b.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(b.max_weight(), 12.0)
    assert_equal(b.max_wol(), 2.0)

    ti = FileTermInfo()
    ti.add_block(b)
    assert_equal(ti.weight(), 21.5)
    assert_equal(ti.doc_frequency(), 4)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(ti.max_weight(), 12.0)
    assert_equal(ti.max_wol(), 2.0)

    b.write(compression=3)
    f.close()
    f = st.open_file("postfile")
    bb = current.from_file(f, 0)

    bb.read_ids()
    assert_equal(list(bb.ids), [0, 1, 2, 5])
    bb.read_weights()
    assert_equal(list(bb.weights), [1.0, 2.0, 12.0, 6.5])
    bb.read_values()
    assert_equal(bb.values, None)
    assert_equal(bb.min_length(), 1)
    assert_equal(bb.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(bb.max_weight(), 12.0)
    assert_equal(bb.max_wol(), 2.0)
Example #4
    def to_file(self, postfile, compression=3):
        ids = self.ids
        idcode, idstring = minimize_ids(ids, self.stringids, compression)
        wtstring = minimize_weights(self.weights, compression)
        vstring = minimize_values(self.postingsize, self.values, compression)

        info = (len(ids), ids[-1], self.maxweight,
                length_to_byte(self.minlength), length_to_byte(self.maxlength),
                idcode, compression, len(idstring), len(wtstring))
        infostring = dumps(info, -1)

        # Offset to next block
        postfile.write_uint(
            len(infostring) + len(idstring) + len(wtstring) + len(vstring))
        # Block contents
        postfile.write(infostring)
        postfile.write(idstring)
        postfile.write(wtstring)
        postfile.write(vstring)
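
Because the block body is preceded by its total size, a reader can skip a block without decoding it. A minimal sketch of that reader-side hop, assuming the layout written above (the function name is illustrative):

def skip_to_next_block(postfile):
    # The uint written first is the combined length of the info,
    # ids, weights and values strings that follow
    blocklen = postfile.read_uint()
    postfile.seek(postfile.tell() + blocklen)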
Example #5
    def add_block(self, block):
        self._weight += sum(block.weights)
        self._df += len(block)

        ml = length_to_byte(block.min_length())
        if self._minlength is None:
            self._minlength = ml
        else:
            self._minlength = min(self._minlength, ml)

        xl = length_to_byte(block.max_length())
        self._maxlength = max(self._maxlength, xl)

        self._maxweight = max(self._maxweight, block.max_weight())
        self._maxwol = max(self._maxwol, block.max_wol())

        if self._minid is None:
            self._minid = block.ids[0]
        self._maxid = block.ids[-1]
Example #6
    def to_file(self, dbfile, doccount):
        self._pad_arrays(doccount)
        fieldnames = list(self.lengths.keys())

        dbfile.write(self.magic)
        dbfile.write_int(1)  # Format version number
        dbfile.write_uint(doccount)  # Number of documents
        dbfile.write_ushort(len(self.lengths))  # Number of fields

        # Write per-field info
        for fieldname in fieldnames:
            dbfile.write_string(fieldname.encode('utf-8'))  # Fieldname
            dbfile.write_long(self.field_length(fieldname))
            dbfile.write_byte(length_to_byte(self.min_field_length(fieldname)))
            dbfile.write_byte(length_to_byte(self.max_field_length(fieldname)))

        # Write byte arrays
        for fieldname in fieldnames:
            dbfile.write_array(self.lengths[fieldname])
        dbfile.close()
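
Reading this file back walks the same layout in the same order. A sketch of the decode side, assuming the read_* methods mirror the write_* calls used above (read_array with an explicit typecode and length is an assumption):

def lengths_from_file(dbfile, magic):
    assert dbfile.read(len(magic)) == magic
    assert dbfile.read_int() == 1            # format version number
    doccount = dbfile.read_uint()            # number of documents
    fieldcount = dbfile.read_ushort()        # number of fields

    # Per-field info, in the order it was written
    fieldinfo = []
    for _ in xrange(fieldcount):
        name = dbfile.read_string().decode('utf-8')
        total = dbfile.read_long()
        minlen = byte_to_length(dbfile.read_byte())
        maxlen = byte_to_length(dbfile.read_byte())
        fieldinfo.append((name, total, minlen, maxlen))

    # One byte array of per-document length codes per field
    arrays = dict((info[0], dbfile.read_array("B", doccount))
                  for info in fieldinfo)
    return fieldinfo, arrays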
Example #7
    def add_field_length(self, docnum, fieldname, length):
        self._fieldlength_totals[fieldname] += length
        if length > self._fieldlength_maxes.get(fieldname, 0):
            self._fieldlength_maxes[fieldname] = length

        if fieldname not in self.length_arrays:
            self.length_arrays[fieldname] = array("B")
        arry = self.length_arrays[fieldname]

        if len(arry) <= docnum:
            for _ in xrange(docnum - len(arry) + 1):
                arry.append(0)
        arry[docnum] = length_to_byte(length)
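
The padding loop grows the array one element at a time; extend() does the same job in a single call and reads more directly (equivalent behavior, shown for the last four lines above):

        if len(arry) <= docnum:
            arry.extend([0] * (docnum + 1 - len(arry)))
        arry[docnum] = length_to_byte(length)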
Example #8
def test_lowlevel_block_writing():
    st = RamStorage()
    f = st.create_file("postfile")
    fpw = FilePostingWriter(f, blocklimit=4)
    fmt = formats.Frequency()
    fpw.start(fmt)
    fpw.write(0, 1.0, fmt.encode(1.0), 1)
    fpw.write(1, 2.0, fmt.encode(2.0), 2)
    fpw.write(2, 12.0, fmt.encode(12.0), 6)
    fpw.write(5, 6.5, fmt.encode(6.5), 420)

    fpw.write(11, 1.5, fmt.encode(1.5), 1)
    fpw.write(12, 2.5, fmt.encode(2.5), 2)
    fpw.write(26, 100.5, fmt.encode(100.5), 21)
    fpw.write(50, 8.0, fmt.encode(8.0), 1020)
    ti = fpw.finish()

    assert_equal(ti.weight(), 134.0)
    assert_equal(ti.doc_frequency(), 8)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(1020)))
    assert_equal(ti.max_weight(), 100.5)
    assert_equal(ti.max_wol(), 100.5 / byte_to_length(length_to_byte(21)))
Example #9
    def add(self, docnum, fieldname, length):
        lengths = self.lengths
        if length:
            if fieldname not in lengths:
                self._create_field(fieldname, docnum)

            arry = self.lengths[fieldname]
            count = docnum + 1
            if len(arry) < count:
                for _ in xrange(count - len(arry)):
                    arry.append(0)
            if count > self._count:
                self._count = count
            byte = length_to_byte(length)
            arry[docnum] = byte
            self.totals[fieldname] += length
Example #10
def test_many_lengths():
    domain = u("alfa bravo charlie delta echo foxtrot golf hotel").split()
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for i, word in enumerate(domain):
        length = (i + 1) ** 6
        w.add_document(text=" ".join(word for _ in xrange(length)))
    w.commit()

    s = ix.searcher()
    for i, word in enumerate(domain):
        target = byte_to_length(length_to_byte((i + 1) ** 6))
        ti = s.term_info("text", word)
        assert_equal(ti.min_length(), target)
        assert_equal(ti.max_length(), target)
Example #11
def test_lengths():
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        tokens = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            w.add_document(f2=u(" ").join(islice(cycle(tokens), length)))
        w.commit()

        with ix.reader() as dr:
            ls1 = [dr.doc_field_length(i, "f1")
                   for i in xrange(0, len(lengths))]
            assert_equal(ls1, [0] * len(lengths))
            ls2 = [dr.doc_field_length(i, "f2")
                   for i in xrange(0, len(lengths))]
            assert_equal(ls2, [byte_to_length(length_to_byte(l))
                               for l in lengths])
Example #12
    def add_field_length(self, docnum, fieldname, length):
        self._fieldlength_totals[fieldname] += length

        bytelength = length_to_byte(length)
        normalized = byte_to_length(bytelength)

        if normalized < self._fieldlength_mins.get(fieldname, 999999999):
            self._fieldlength_mins[fieldname] = normalized

        if normalized > self._fieldlength_maxes.get(fieldname, 0):
            self._fieldlength_maxes[fieldname] = normalized

        if fieldname not in self.length_arrays:
            self.length_arrays[fieldname] = array("B")
        arry = self.length_arrays[fieldname]

        if len(arry) <= docnum:
            for _ in xrange(docnum - len(arry) + 1):
                arry.append(0)
        arry[docnum] = bytelength
Example #13
    def append(self, id, weight, valuestring, dfl):
        self.ids.append(id)
        self.weights.append(weight)
        if weight > self._maxweight:
            self._maxweight = weight

        if valuestring:
            if self.values is None:
                self.values = []
            self.values.append(valuestring)

        if dfl:
            length_byte = length_to_byte(dfl)
            if self._minlength is None or length_byte < self._minlength:
                self._minlength = length_byte
            if length_byte > self._maxlength:
                self._maxlength = length_byte
            wol = weight / byte_to_length(length_byte)
            if wol > self._maxwol:
                self._maxwol = wol
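
For context, "wol" here is weight over length: the weight divided by the quantized field length, so the statistic matches what a reader can reconstruct from the stored byte (compare the max_wol assertion in Example #8). The same bookkeeping in isolation, with hypothetical names:

def max_wol(postings):
    # postings: iterable of (weight, field_length) pairs
    best = 0.0
    for weight, dfl in postings:
        if dfl:
            # Divide by the lossy round-tripped length, as append() does
            wol = weight / byte_to_length(length_to_byte(dfl))
            if wol > best:
                best = wol
    return best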
Example #14
def blen(n):
    return byte_to_length(length_to_byte(n))
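
What helpers like blen() rely on: the byte codec is lossy but idempotent, so re-encoding an already round-tripped length returns the same byte. A toy stand-in illustrating the shape of such a codec (Whoosh's real curve is different; this is purely for illustration):

def toy_length_to_byte(length):
    # Exact for 0..127; above that, quantize into 64-unit buckets,
    # saturating at 255
    if length < 128:
        return length
    return min(128 + (length - 128) // 64, 255)

def toy_byte_to_length(code):
    if code < 128:
        return code
    return 128 + (code - 128) * 64

for n in (5, 127, 128, 420, 100000):
    q = toy_byte_to_length(toy_length_to_byte(n))
    # Round-tripping a round-tripped value changes nothing
    assert toy_byte_to_length(toy_length_to_byte(q)) == q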
Example #15
def test_length_byte():
    source = list(range(11))
    xform = [length_to_byte(n) for n in source]
    result = [byte_to_length(n) for n in xform]
    assert_equal(source, result)
Example #16
    def to_file(self, postfile, posting_size, compression=3):
        stringids = self.stringids
        ids = self.ids
        weights = self.weights
        values = self.values
        postcount = len(ids)
        maxweight, maxwol, minlength = self.stats()

        if postcount <= 4 or not can_compress:
            compression = 0

        # Max ID
        maxid = ids[-1]
        if stringids:
            maxid_string = dumps(maxid, -1)[2:]
        else:
            maxid_string = pack_uint(maxid)

        # IDs
        typecode = "I"
        if stringids:
            ids_string = dumps(ids, -1)[2:]
            typecode = "s"
        else:
            if maxid <= 255:
                typecode = "B"
            elif maxid <= 65535:
                typecode = "H"
            if typecode != ids.typecode:
                ids = array(typecode, ids)
            if not IS_LITTLE:
                ids.byteswap()
            ids_string = ids.tostring()
        if compression:
            ids_string = compress(ids_string, compression)

        # Weights
        if all(w == 1.0 for w in weights):
            weights_string = ''
        else:
            if not IS_LITTLE:
                weights.byteswap()
            weights_string = weights.tostring()
        if weights_string and compression:
            weights_string = compress(weights_string, compression)

        # Values
        if posting_size < 0:
            values_string = dumps(values, -1)[2:]
        elif posting_size == 0:
            values_string = ''
        else:
            values_string = "".join(values)
        if values_string and compression:
            values_string = compress(values_string, compression)

        # Header
        flags = 1 if compression else 0
        minlen_byte = length_to_byte(minlength)
        blocksize = sum((self._struct.size, len(maxid_string), len(ids_string),
                         len(weights_string), len(values_string)))
        header = self._struct.pack(blocksize, flags, postcount, typecode, 0,
                                   len(ids_string), len(weights_string),
                                   maxweight, maxwol, 0, minlen_byte)

        postfile.write(header)
        postfile.write(maxid_string)
        postfile.write(ids_string)
        postfile.write(weights_string)
        postfile.write(values_string)
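
The typecode selection above is the size-saving trick in this writer: store IDs in the narrowest array type that can hold the largest (last) ID before optionally compressing. The same logic isolated as a helper (the name is illustrative):

from array import array

def narrowest_ids(ids):
    # ids is sorted, so the last element is the maximum
    maxid = ids[-1]
    if maxid <= 255:
        typecode = "B"
    elif maxid <= 65535:
        typecode = "H"
    else:
        typecode = "I"
    if typecode != ids.typecode:
        ids = array(typecode, ids)
    return ids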
Example #17
def _discreet(length):
    return byte_to_length(length_to_byte(length))