Пример #1
0
    def _read_header(self, dbfile, doccount):
        """Read the lengths-file header from ``dbfile`` and populate the
        per-field ``totals``/``minlens``/``maxlens``/``starts`` tables.

        :param dbfile: a structured file object positioned at the start of
            the header.
        :param doccount: the expected number of documents, or ``None`` to
            accept whatever count the file records.
        """

        first = dbfile.read(4)  # Magic
        assert first == self.magic
        version = dbfile.read_int()  # Version number
        assert version == 1

        dc = dbfile.read_uint()  # Number of documents saved
        if doccount is None:
            doccount = dc
        assert dc == doccount, "read=%s argument=%s" % (dc, doccount)
        self._count = doccount

        fieldcount = dbfile.read_ushort()  # Number of fields
        # Read per-field info
        for i in xrange(fieldcount):
            fieldname = dbfile.read_string().decode('utf-8')
            self.totals[fieldname] = dbfile.read_long()
            # Min/max lengths are stored as single compressed bytes
            self.minlens[fieldname] = byte_to_length(dbfile.read_byte())
            self.maxlens[fieldname] = byte_to_length(dbfile.read_byte())
            # Offset of this field's length array, relative to end-of-header
            self.starts[fieldname] = i * doccount

        # Add header length to per-field offsets
        eoh = dbfile.tell()  # End of header
        for fieldname in self.starts:
            self.starts[fieldname] += eoh
Пример #2
0
    def _read_header(self, dbfile, doccount):
        """Read the lengths-file header from ``dbfile`` and populate the
        per-field ``totals``/``minlens``/``maxlens``/``starts`` tables.

        :param dbfile: a structured file object positioned at the start of
            the header.
        :param doccount: the expected number of documents, or ``None`` to
            accept whatever count the file records.
        """

        first = dbfile.read(4)  # Magic
        assert first == self.magic
        version = dbfile.read_int()  # Version number
        assert version == 1

        dc = dbfile.read_uint()  # Number of documents saved
        if doccount is None:
            doccount = dc
        assert dc == doccount, "read=%s argument=%s" % (dc, doccount)
        self._count = doccount

        fieldcount = dbfile.read_ushort()  # Number of fields
        # Read per-field info
        for i in xrange(fieldcount):
            fieldname = dbfile.read_string().decode('utf-8')
            self.totals[fieldname] = dbfile.read_long()
            # Min/max lengths are stored as single compressed bytes
            self.minlens[fieldname] = byte_to_length(dbfile.read_byte())
            self.maxlens[fieldname] = byte_to_length(dbfile.read_byte())
            # Offset of this field's length array, relative to end-of-header
            self.starts[fieldname] = i * doccount

        # Add header length to per-field offsets
        eoh = dbfile.tell()  # End of header
        for fieldname in self.starts:
            self.starts[fieldname] += eoh
Пример #3
0
    def from_file(file, stringids=False):
        """Read a block header from ``file`` at the current position and
        return a populated ``BlockInfo``, leaving the file positioned at
        the start of the block's posting data.

        :param file: a structured file object.
        :param stringids: if True, the block's max ID is a UTF-8 string
            rather than an unsigned int.
        """

        here = file.tell()

        encoded_header = file.read(BlockInfo._struct.size)
        header = BlockInfo._struct.unpack(encoded_header)
        (flags, _, _, nextoffset, idslen, weightslen, postcount, maxweight,
         maxwol, _, minlength) = header

        if not flags:
            # Old format (no flags): first 8 bytes are an absolute offset
            # NOTE(review): unpack_long is not indexed with [0] here, unlike
            # other call sites -- confirm it returns a scalar in this path
            nextoffset = unpack_long(encoded_header[:8])
        else:
            # New format: offset is stored relative to the header start
            nextoffset = here + nextoffset

        assert postcount > 0
        minlength = byte_to_length(minlength)  # Decode compressed length byte

        if stringids:
            maxid = utf8decode(file.read_string())[0]
        else:
            maxid = file.read_uint()

        dataoffset = file.tell()  # Posting data begins right after the header
        return BlockInfo(flags=flags,
                         nextoffset=nextoffset,
                         postcount=postcount,
                         maxweight=maxweight,
                         maxwol=maxwol,
                         maxid=maxid,
                         minlength=minlength,
                         dataoffset=dataoffset,
                         idslen=idslen,
                         weightslen=weightslen)
Пример #4
0
 def from_file(file, stringids=False):
     """Read a block header from ``file`` at the current position and
     return a populated ``BlockInfo``, leaving the file positioned at
     the start of the block's posting data.

     :param file: a structured file object.
     :param stringids: if True, the block's max ID is a UTF-8 string
         rather than an unsigned int.
     """

     here = file.tell()

     encoded_header = file.read(BlockInfo._struct.size)
     header = BlockInfo._struct.unpack(encoded_header)
     (flags, _, _, nextoffset, idslen, weightslen, postcount, maxweight,
      maxwol, _, minlength) = header

     if not flags:
         # Old format (no flags): first 8 bytes are an absolute offset
         # NOTE(review): unpack_long is not indexed with [0] here, unlike
         # other call sites -- confirm it returns a scalar in this path
         nextoffset = unpack_long(encoded_header[:8])
     else:
         # New format: offset is stored relative to the header start
         nextoffset = here + nextoffset

     assert postcount > 0
     minlength = byte_to_length(minlength)  # Decode compressed length byte

     if stringids:
         maxid = utf8decode(file.read_string())[0]
     else:
         maxid = file.read_uint()

     dataoffset = file.tell()  # Posting data begins right after the header
     return BlockInfo(flags=flags, nextoffset=nextoffset,
                      postcount=postcount, maxweight=maxweight,
                      maxwol=maxwol, maxid=maxid, minlength=minlength,
                      dataoffset=dataoffset, idslen=idslen,
                      weightslen=weightslen)
Пример #5
0
 def get(self, docnum, fieldname, default=0):
     """Return the decoded length of ``fieldname`` in document ``docnum``,
     or ``default`` when the field has no length array or the document is
     beyond the end of it.
     """

     if fieldname not in self.lengths:
         return default
     codes = self.lengths[fieldname]
     if docnum >= len(codes):
         return default
     return byte_to_length(codes[docnum])
Пример #6
0
 def doc_field_length(self, docnum, fieldname, default=0):
     """Return the decoded length of ``fieldname`` in document ``docnum``,
     or ``default`` when the field has no length array or the document is
     beyond the end of it.
     """

     if fieldname not in self.lengths:
         return default
     codes = self.lengths[fieldname]
     if docnum >= len(codes):
         return default
     return byte_to_length(codes[docnum])
Пример #7
0
def load_old_lengths(obj, dbfile, doccount):
    """Load field lengths in the old on-disk format into ``obj`` and close
    ``dbfile``.

    :param obj: target object with ``lengths`` and ``totals`` dicts.
    :param dbfile: structured file positioned at the field count.
    :param doccount: number of length bytes stored per field.
    """

    fieldcount = dbfile.read_ushort()  # Number of fields
    for _ in xrange(fieldcount):
        fieldname = dbfile.read_string().decode("utf-8")
        obj.lengths[fieldname] = dbfile.read_array("B", doccount)
        # Old format didn't store totals, so fake it by adding up the codes
        obj.totals[fieldname] = sum(
            byte_to_length(b) for b in obj.lengths[fieldname])
    dbfile.close()
Пример #8
0
def load_old_lengths(obj, dbfile, doccount):
    """Load field lengths in the old on-disk format into ``obj`` and close
    ``dbfile``.

    :param obj: target object with ``lengths`` and ``totals`` dicts.
    :param dbfile: structured file positioned at the field count.
    :param doccount: number of length bytes stored per field.
    """

    fieldcount = dbfile.read_ushort()  # Number of fields
    for _ in xrange(fieldcount):
        fieldname = dbfile.read_string().decode("utf-8")
        obj.lengths[fieldname] = dbfile.read_array("B", doccount)
        # Old format didn't store totals, so fake it by adding up the codes
        obj.totals[fieldname] = sum(byte_to_length(b) for b
                                    in obj.lengths[fieldname])
    dbfile.close()
Пример #9
0
    def from_file(cls, postfile, postingsize, stringids=False):
        """Read a block header from ``postfile`` and return a new block
        object, leaving the file positioned at the start of the block data.

        :param postfile: a structured file object.
        :param postingsize: fixed size of posting values (passed to the
            block constructor).
        :param stringids: if True, the max ID is pickled rather than an
            unsigned int.
        """

        start = postfile.tell()
        block = cls(postingsize, stringids=stringids)
        block.postfile = postfile
        header = cls._struct.unpack(postfile.read(cls._struct.size))
        block.nextoffset = start + header[0]  # Offset is relative to header
        block.cmp = header[1]
        block.count = header[2]
        # ID array typecode is stored as a single Latin-1 byte
        block.idcode = header[3].decode("Latin1")
        # header[4] and header[8]-[10] are unused by this reader
        block.idslen = header[5]
        block.wtslen = header[6]
        block.maxweight = header[7]
        # Min/max lengths are stored as compressed single bytes
        block.maxlength = byte_to_length(header[11])
        block.minlength = byte_to_length(header[12])

        block.maxid = load(postfile) if stringids else postfile.read_uint()
        block.dataoffset = postfile.tell()
        return block
Пример #10
0
    def from_file(cls, postfile, postingsize, stringids=False):
        """Read a block header from ``postfile`` and return a new block
        object, leaving the file positioned at the start of the block data.

        :param postfile: a structured file object.
        :param postingsize: fixed size of posting values (passed to the
            block constructor).
        :param stringids: if True, the max ID is pickled rather than an
            unsigned int.
        """

        start = postfile.tell()
        block = cls(postingsize, stringids=stringids)
        block.postfile = postfile
        header = cls._struct.unpack(postfile.read(cls._struct.size))
        block.nextoffset = start + header[0]  # Offset is relative to header
        block.cmp = header[1]
        block.count = header[2]
        # NOTE(review): idcode is kept as the raw unpacked value here (not
        # decoded) -- confirm callers expect bytes rather than a str
        block.idcode = header[3]
        # header[4] and header[8]-[10] are unused by this reader
        block.idslen = header[5]
        block.wtslen = header[6]
        block.maxweight = header[7]
        # Min/max lengths are stored as compressed single bytes
        block.maxlength = byte_to_length(header[11])
        block.minlength = byte_to_length(header[12])

        block.maxid = load(postfile) if stringids else postfile.read_uint()
        block.dataoffset = postfile.tell()
        return block
Пример #11
0
def test_block():
    """Round-trip a posting block: build it in memory, check its stats,
    write it compressed, then read it back and check the same stats."""

    st = RamStorage()
    f = st.create_file("postfile")

    # Append (id, weight, value, field-length) postings to a fresh block
    b = current(f, 0)
    b.append(0, 1.0, '', 1)
    b.append(1, 2.0, '', 2)
    b.append(2, 12.0, '', 6)
    b.append(5, 6.5, '', 420)
    assert b

    assert_equal(len(b), 4)
    assert_equal(list(b.ids), [0, 1, 2, 5])
    assert_equal(list(b.weights), [1.0, 2.0, 12.0, 6.5])
    assert_equal(b.values, None)
    assert_equal(b.min_length(), 1)
    # Lengths are rounded through the byte encoding, so compare to the
    # round-tripped value rather than 420 itself
    assert_equal(b.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(b.max_weight(), 12.0)
    assert_equal(b.max_wol(), 2.0)

    # Accumulating the block into a term info should give the same stats
    ti = FileTermInfo()
    ti.add_block(b)
    assert_equal(ti.weight(), 21.5)
    assert_equal(ti.doc_frequency(), 4)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(ti.max_weight(), 12.0)
    assert_equal(ti.max_wol(), 2.0)

    # Write the block out compressed and read it back
    b.write(compression=3)
    f.close()
    f = st.open_file("postfile")
    bb = current.from_file(f, 0)

    bb.read_ids()
    assert_equal(list(bb.ids), [0, 1, 2, 5])
    bb.read_weights()
    assert_equal(list(bb.weights), [1.0, 2.0, 12.0, 6.5])
    bb.read_values()
    # NOTE(review): this checks b.values (the in-memory block), not
    # bb.values just read from disk -- confirm this is intentional
    assert_equal(b.values, None)
    assert_equal(bb.min_length(), 1)
    assert_equal(bb.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(bb.max_weight(), 12.0)
    assert_equal(bb.max_wol(), 2.0)
Пример #12
0
    def from_string(cls, s):
        """Decode a term-info object from its on-disk byte string ``s``.

        Supports both the current fixed-struct encoding (header byte 0 or
        1) and the legacy pickled-tuple encoding (any other header byte).

        :param s: the encoded term info as bytes.
        :returns: a new instance of ``cls`` with ``postings`` attached.
        """

        assert isinstance(s, bytes_type)

        if isinstance(s, string_type):
            hbyte = ord(s[0])  # Python 2.x - str
        else:
            hbyte = s[0]  # Python 3 - bytes
        if hbyte < 2:
            st = cls.struct
            # Weight, Doc freq, min len, max len, max w, unused, min ID, max ID
            w, df, ml, xl, xw, _, mid, xid = st.unpack(s[1:st.size + 1])
            # NO_ID is a sentinel meaning "no ID stored"
            mid = None if mid == NO_ID else mid
            xid = None if xid == NO_ID else xid
            # Postings
            pstr = s[st.size + 1:]
            if hbyte == 0:
                # Header byte 0: postings are a single packed long (offset)
                p = unpack_long(pstr)[0]
            else:
                # Header byte 1: postings are pickled inline; "." terminates
                # the pickle stream
                p = loads(pstr + b("."))
        else:
            # Old format was encoded as a variable length pickled tuple
            v = loads(s + b("."))
            if len(v) == 1:
                w = df = 1
                p = v[0]
            elif len(v) == 2:
                w = df = v[1]
                p = v[0]
            else:
                w, p, df = v
            # Fake values for stats which weren't stored before
            ml = 1
            xl = 255
            xw = 999999999
            mid = -1
            xid = -1

        # Min/max lengths are stored as compressed single bytes
        ml = byte_to_length(ml)
        xl = byte_to_length(xl)
        obj = cls(w, df, ml, xl, xw, mid, xid)
        obj.postings = p
        return obj
Пример #13
0
 def _minmax(self, fieldname, op, cache):
     if fieldname in cache:
         return cache[fieldname]
     else:
         ls = self.lengths[fieldname]
         if ls:
             result = byte_to_length(op(ls))
         else:
             result = 0
         cache[fieldname] = result
         return result
Пример #14
0
 def _minmax(self, fieldname, op, cache):
     if fieldname in cache:
         return cache[fieldname]
     else:
         ls = self.lengths[fieldname]
         if ls:
             result = byte_to_length(op(ls))
         else:
             result = 0
         cache[fieldname] = result
         return result
Пример #15
0
def test_lowlevel_block_writing():
    """Write two posting blocks through FilePostingWriter (blocklimit=4
    forces a block break after 4 postings) and check the accumulated
    term-info statistics."""

    st = RamStorage()
    f = st.create_file("postfile")
    fpw = FilePostingWriter(f, blocklimit=4)
    fmt = formats.Frequency()
    fpw.start(fmt)
    # First block: 4 postings (id, weight, encoded value, field length)
    fpw.write(0, 1.0, fmt.encode(1.0), 1)
    fpw.write(1, 2.0, fmt.encode(2.0), 2)
    fpw.write(2, 12.0, fmt.encode(12.0), 6)
    fpw.write(5, 6.5, fmt.encode(6.5), 420)

    # Second block: 4 more postings
    fpw.write(11, 1.5, fmt.encode(1.5), 1)
    fpw.write(12, 2.5, fmt.encode(2.5), 2)
    fpw.write(26, 100.5, fmt.encode(100.5), 21)
    fpw.write(50, 8.0, fmt.encode(8.0), 1020)
    ti = fpw.finish()

    assert_equal(ti.weight(), 134.0)
    assert_equal(ti.doc_frequency(), 8)
    assert_equal(ti.min_length(), 1)
    # Lengths are rounded through the byte encoding, so compare against
    # the round-tripped values rather than the raw lengths
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(1020)))
    assert_equal(ti.max_weight(), 100.5)
    assert_equal(ti.max_wol(), 100.5 / byte_to_length(length_to_byte(21)))
Пример #16
0
    def from_file(cls, postfile, postingsize, stringids=False):
        """Read a block whose header is a pickled info tuple and return a
        new block object, leaving the file at the start of the block data.

        :param postfile: a structured file object.
        :param postingsize: fixed size of posting values (passed to the
            block constructor).
        :param stringids: whether document IDs are strings (passed to the
            block constructor).
        """

        block = cls(postingsize, stringids=stringids)
        block.postfile = postfile

        # The next-block offset is stored as a delta from the current point
        delta = postfile.read_uint()
        block.nextoffset = postfile.tell() + delta
        info = postfile.read_pickle()
        block.dataoffset = postfile.tell()

        # Pair each pickled value with its attribute name from cls.infokeys
        for key, value in zip(cls.infokeys, info):
            if key in ("minlength", "maxlength"):
                # Lengths are stored as compressed single bytes
                value = byte_to_length(value)
            setattr(block, key, value)

        return block
Пример #17
0
    def from_file(cls, postfile, postingsize, stringids=False):
        """Read a block whose header is a pickled info tuple and return a
        new block object, leaving the file at the start of the block data.

        :param postfile: a structured file object.
        :param postingsize: fixed size of posting values (passed to the
            block constructor).
        :param stringids: whether document IDs are strings (passed to the
            block constructor).
        """

        block = cls(postingsize, stringids=stringids)
        block.postfile = postfile

        # The next-block offset is stored as a delta from the current point
        delta = postfile.read_uint()
        block.nextoffset = postfile.tell() + delta
        info = postfile.read_pickle()
        block.dataoffset = postfile.tell()

        # Pair each pickled value with its attribute name from cls.infokeys
        for key, value in zip(cls.infokeys, info):
            if key in ("minlength", "maxlength"):
                # Lengths are stored as compressed single bytes
                value = byte_to_length(value)
            setattr(block, key, value)

        return block
Пример #18
0
def test_many_lengths():
    """Index documents whose lengths grow as (i+1)**6 and check that the
    stored per-term min/max lengths equal the byte-encoded rounding of
    the true length."""

    domain = u("alfa bravo charlie delta echo foxtrot golf hotel").split()
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for i, word in enumerate(domain):
        # Each document repeats a single word (i+1)**6 times
        length = (i + 1) ** 6
        w.add_document(text=" ".join(word for _ in xrange(length)))
    w.commit()

    s = ix.searcher()
    for i, word in enumerate(domain):
        # Lengths round-trip through the compressed byte encoding
        target = byte_to_length(length_to_byte((i + 1) ** 6))
        ti = s.term_info("text", word)
        assert_equal(ti.min_length(), target)
        assert_equal(ti.max_length(), target)
Пример #19
0
def test_many_lengths():
    """Index documents whose lengths grow as (i+1)**6 and check that the
    stored per-term min/max lengths equal the byte-encoded rounding of
    the true length."""

    domain = u("alfa bravo charlie delta echo foxtrot golf hotel").split()
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for i, word in enumerate(domain):
        # Each document repeats a single word (i+1)**6 times
        length = (i + 1) ** 6
        w.add_document(text=" ".join(word for _ in xrange(length)))
    w.commit()

    s = ix.searcher()
    for i, word in enumerate(domain):
        # Lengths round-trip through the compressed byte encoding
        target = byte_to_length(length_to_byte((i + 1) ** 6))
        ti = s.term_info("text", word)
        assert_equal(ti.min_length(), target)
        assert_equal(ti.max_length(), target)
Пример #20
0
def test_lengths():
    """Check per-document field lengths: an unused field reads back as 0,
    and a used field reads back as the byte-encoded rounding of its true
    length."""

    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        tokens = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            # Only f2 gets content; f1 is left empty in every document
            w.add_document(f2=u(" ").join(islice(cycle(tokens), length)))
        w.commit()

        with ix.reader() as dr:
            ls1 = [dr.doc_field_length(i, "f1") for i in xrange(0, len(lengths))]
            assert_equal(ls1, [0] * len(lengths))
            # Stored lengths round-trip through the compressed byte encoding
            ls2 = [dr.doc_field_length(i, "f2") for i in xrange(0, len(lengths))]
            assert_equal(ls2, [byte_to_length(length_to_byte(l))for l in lengths])
Пример #21
0
    def from_file(cls, postfile, stringids=False):
        """Read a block header from ``postfile`` and return a new block
        object, leaving the file positioned at the start of the block data.

        :param postfile: a structured file object.
        :param stringids: if True, the max ID is a UTF-8 string rather
            than an unsigned int.
        """

        pos = postfile.tell()
        block = cls(postfile, stringids=stringids)
        block.postfile = postfile
        header = cls._struct.unpack(postfile.read(cls._struct.size))
        # header[0]-[2] are unused by this reader
        block.nextoffset = pos + header[3]  # Offset is relative to header
        block.idslen = header[4]
        block.wtslen = header[5]
        block.count = header[6]
        block.maxweight = header[7]
        # Min length is stored as a compressed single byte
        block.minlength = byte_to_length(header[10])

        if stringids:
            block.maxid = utf8decode(postfile.read_string())[0]
        else:
            block.maxid = postfile.read_uint()
        block.dataoffset = postfile.tell()
        return block
Пример #22
0
    def from_file(cls, postfile, stringids=False):
        """Read a block header from ``postfile`` and return a new block
        object, leaving the file positioned at the start of the block data.

        :param postfile: a structured file object.
        :param stringids: if True, the max ID is a UTF-8 string rather
            than an unsigned int.
        """

        pos = postfile.tell()
        block = cls(postfile, stringids=stringids)
        block.postfile = postfile
        header = cls._struct.unpack(postfile.read(cls._struct.size))
        # header[0]-[2] are unused by this reader
        block.nextoffset = pos + header[3]  # Offset is relative to header
        block.idslen = header[4]
        block.wtslen = header[5]
        block.count = header[6]
        block.maxweight = header[7]
        # Min length is stored as a compressed single byte
        block.minlength = byte_to_length(header[10])

        if stringids:
            block.maxid = utf8decode(postfile.read_string())[0]
        else:
            block.maxid = postfile.read_uint()
        block.dataoffset = postfile.tell()
        return block
Пример #23
0
    def add_field_length(self, docnum, fieldname, length):
        """Record that ``fieldname`` in document ``docnum`` has ``length``
        terms, updating the field's running total, normalized min/max, and
        its per-document byte array of length codes.
        """

        self._fieldlength_totals[fieldname] += length

        # Lengths are stored as compressed bytes; min/max track the value
        # that survives the encode/decode round trip
        code = length_to_byte(length)
        rounded = byte_to_length(code)

        if rounded < self._fieldlength_mins.get(fieldname, 999999999):
            self._fieldlength_mins[fieldname] = rounded
        if rounded > self._fieldlength_maxes.get(fieldname, 0):
            self._fieldlength_maxes[fieldname] = rounded

        if fieldname not in self.length_arrays:
            self.length_arrays[fieldname] = array("B")
        codes = self.length_arrays[fieldname]

        # Zero-pad the array up to docnum, then store this document's code
        if len(codes) <= docnum:
            codes.extend([0] * (docnum - len(codes) + 1))
        codes[docnum] = code
Пример #24
0
def test_lengths():
    """Check per-document field lengths: an unused field reads back as 0,
    and a used field reads back as the byte-encoded rounding of its true
    length."""

    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    with TempIndex(s, "testlengths") as ix:
        w = ix.writer()
        tokens = u("ABCDEFG")
        from itertools import cycle, islice
        lengths = [10, 20, 2, 102, 45, 3, 420, 2]
        for length in lengths:
            # Only f2 gets content; f1 is left empty in every document
            w.add_document(f2=u(" ").join(islice(cycle(tokens), length)))
        w.commit()

        with ix.reader() as dr:
            ls1 = [dr.doc_field_length(i, "f1")
                   for i in xrange(0, len(lengths))]
            assert_equal(ls1, [0] * len(lengths))
            # Stored lengths round-trip through the compressed byte encoding
            ls2 = [dr.doc_field_length(i, "f2")
                   for i in xrange(0, len(lengths))]
            assert_equal(ls2, [byte_to_length(length_to_byte(l))
                               for l in lengths])
Пример #25
0
    def add_field_length(self, docnum, fieldname, length):
        """Record that ``fieldname`` in document ``docnum`` has ``length``
        terms, updating the field's running total, normalized min/max, and
        its per-document byte array of length codes.
        """

        self._fieldlength_totals[fieldname] += length

        # Lengths are stored as compressed bytes; min/max track the value
        # that survives the encode/decode round trip
        code = length_to_byte(length)
        rounded = byte_to_length(code)

        if rounded < self._fieldlength_mins.get(fieldname, 999999999):
            self._fieldlength_mins[fieldname] = rounded
        if rounded > self._fieldlength_maxes.get(fieldname, 0):
            self._fieldlength_maxes[fieldname] = rounded

        if fieldname not in self.length_arrays:
            self.length_arrays[fieldname] = array("B")
        codes = self.length_arrays[fieldname]

        # Zero-pad the array up to docnum, then store this document's code
        if len(codes) <= docnum:
            codes.extend([0] * (docnum - len(codes) + 1))
        codes[docnum] = code
Пример #26
0
    def append(self, id, weight, valuestring, dfl):
        """Add a posting to this block and update the block's running
        statistics.

        :param id: the document ID (or string ID) of the posting.
        :param weight: the term's weight in the document.
        :param valuestring: the encoded posting value, or an empty value
            if the format stores no values.
        :param dfl: the document's field length, or 0/None when the field
            is not scorable.
        """

        self.ids.append(id)
        self.weights.append(weight)
        if weight > self._maxweight:
            self._maxweight = weight

        if valuestring:
            # Only allocate the values list for blocks that actually carry
            # posting values
            if self.values is None:
                self.values = []
            self.values.append(valuestring)

        if dfl:
            # _minlength/_maxlength hold the *encoded* length bytes
            length_byte = length_to_byte(dfl)
            if self._minlength is None or length_byte < self._minlength:
                self._minlength = length_byte
            # BUG FIX: compare the encoded byte against the stored byte.
            # The original compared the raw length (dfl) against the byte
            # code, which could wrongly lower the recorded maximum when a
            # raw length exceeded the stored code but encoded smaller.
            if length_byte > self._maxlength:
                self._maxlength = length_byte
            # Track max weight-over-length for scoring optimizations
            wol = weight / byte_to_length(length_byte)
            if wol > self._maxwol:
                self._maxwol = wol
Пример #27
0
    def append(self, id, weight, valuestring, dfl):
        """Add a posting to this block and update the block's running
        statistics.

        :param id: the document ID (or string ID) of the posting.
        :param weight: the term's weight in the document.
        :param valuestring: the encoded posting value, or an empty value
            if the format stores no values.
        :param dfl: the document's field length, or 0/None when the field
            is not scorable.
        """

        self.ids.append(id)
        self.weights.append(weight)
        if weight > self._maxweight:
            self._maxweight = weight

        if valuestring:
            # Only allocate the values list for blocks that actually carry
            # posting values
            if self.values is None:
                self.values = []
            self.values.append(valuestring)

        if dfl:
            # _minlength/_maxlength hold the *encoded* length bytes
            length_byte = length_to_byte(dfl)
            if self._minlength is None or length_byte < self._minlength:
                self._minlength = length_byte
            # BUG FIX: compare the encoded byte against the stored byte.
            # The original compared the raw length (dfl) against the byte
            # code, which could wrongly lower the recorded maximum when a
            # raw length exceeded the stored code but encoded smaller.
            if length_byte > self._maxlength:
                self._maxlength = length_byte
            # Track max weight-over-length for scoring optimizations
            wol = weight / byte_to_length(length_byte)
            if wol > self._maxwol:
                self._maxwol = wol
Пример #28
0
    def from_file(cls, postfile, stringids=False):
        """Read a block header from ``postfile`` and return a new block
        object, leaving the file positioned at the start of the block data.

        :param postfile: a structured file object.
        :param stringids: if True, the max ID is pickled rather than an
            unsigned int.
        """

        start = postfile.tell()
        block = cls(postfile, stringids=stringids)
        header = cls._struct.unpack(postfile.read(cls._struct.size))

        block.nextoffset = start + header[0]  # Offset is relative to header
        block.compression = header[1]
        block.postcount = header[2]
        block.typecode = header[3]
        # header[4] and header[9] are unused by this reader
        block.idslen = header[5]
        block.weightslen = header[6]
        block.maxweight = header[7]
        block.maxwol = header[8]
        # Min length is stored as a compressed single byte
        block.minlen = byte_to_length(header[10])

        if stringids:
            block.maxid = load(postfile)
        else:
            block.maxid = postfile.read_uint()

        block.dataoffset = postfile.tell()
        return block
Пример #29
0
    def from_file(cls, postfile, stringids=False):
        """Read a block header from ``postfile`` and return a new block
        object, leaving the file positioned at the start of the block data.

        :param postfile: a structured file object.
        :param stringids: if True, the max ID is a UTF-8 string rather
            than an unsigned int.
        """

        pos = postfile.tell()
        block = cls(postfile, stringids=stringids)

        encoded_header = postfile.read(cls._struct.size)
        header = cls._struct.unpack(encoded_header)
        (flags, _, _, nextoffset, block.idslen, block.weightslen,
         block.postcount, block.maxweight, block.maxwol, _, minlength) = header

        block.nextoffset = pos + nextoffset  # Offset is relative to header
        # Min length is stored as a compressed single byte
        block.minlength = byte_to_length(minlength)

        assert block.postcount > 0, "postcount=%r" % block.postcount

        if stringids:
            block.maxid = utf8decode(postfile.read_string())[0]
        else:
            block.maxid = postfile.read_uint()

        block.dataoffset = postfile.tell()

        return block
Пример #30
0
 def from_file(cls, postfile, stringids=False):
     """Read a block header from ``postfile`` and return a new block
     object, leaving the file positioned at the start of the block data.

     :param postfile: a structured file object.
     :param stringids: if True, the max ID is a UTF-8 string rather
         than an unsigned int.
     """

     pos = postfile.tell()
     block = cls(postfile, stringids=stringids)

     encoded_header = postfile.read(cls._struct.size)
     header = cls._struct.unpack(encoded_header)
     (flags, _, _, nextoffset, block.idslen, block.weightslen,
      block.postcount, block.maxweight, block.maxwol, _, minlength) = header

     block.nextoffset = pos + nextoffset  # Offset is relative to header
     # Min length is stored as a compressed single byte
     block.minlength = byte_to_length(minlength)

     assert block.postcount > 0, "postcount=%r" % block.postcount

     if stringids:
         block.maxid = utf8decode(postfile.read_string())[0]
     else:
         block.maxid = postfile.read_uint()

     block.dataoffset = postfile.tell()

     return block
Пример #31
0
 def from_file(cls, postfile, stringids=False):
     """Read a block header from ``postfile`` and return a new block
     object, leaving the file positioned at the start of the block data.

     :param postfile: a structured file object.
     :param stringids: if True, the max ID is pickled rather than an
         unsigned int.
     """

     start = postfile.tell()
     block = cls(postfile, stringids=stringids)
     header = cls._struct.unpack(postfile.read(cls._struct.size))

     block.nextoffset = start + header[0]  # Offset is relative to header
     block.compression = header[1]
     block.postcount = header[2]
     block.typecode = header[3]
     # header[4] and header[9] are unused by this reader
     block.idslen = header[5]
     block.weightslen = header[6]
     block.maxweight = header[7]
     block.maxwol = header[8]
     # Min length is stored as a compressed single byte
     block.minlen = byte_to_length(header[10])

     if stringids:
         block.maxid = load(postfile)
     else:
         block.maxid = postfile.read_uint()

     block.dataoffset = postfile.tell()
     return block
Пример #32
0
 def blen(n):
     """Return ``n`` rounded to the value that survives the compressed
     byte encoding of field lengths."""
     code = length_to_byte(n)
     return byte_to_length(code)
Пример #33
0
def test_length_byte():
    """Small lengths (0-10) must round-trip exactly through the
    length-to-byte encoding."""
    source = list(range(11))
    roundtripped = [byte_to_length(code)
                    for code in (length_to_byte(n) for n in source)]
    assert_equal(source, roundtripped)
Пример #34
0
 def get(self, docnum, fieldname, default=0):
     """Return the decoded length of ``fieldname`` in document ``docnum``,
     substituting ``default`` when the field is unknown or the stored
     code is zero."""

     try:
         codes = self.lengths[fieldname]
     except KeyError:
         return default
     code = codes[docnum] or default
     return byte_to_length(code)
Пример #35
0
 def read_min_and_max_length(cls, dbfile, datapos):
     """Read the two length bytes stored in a term-info record starting at
     ``datapos`` and return them decoded as ``(minlength, maxlength)``."""

     offset = datapos + 1 + _FLOAT_SIZE + _INT_SIZE
     minlen = byte_to_length(dbfile.get_byte(offset))
     maxlen = byte_to_length(dbfile.get_byte(offset + 1))
     return minlen, maxlen
Пример #36
0
 def get(self, docnum, fieldname, default=0):
     """Return the decoded length of ``fieldname`` in document ``docnum``,
     substituting ``default`` when the field is unknown or the stored
     code is zero."""

     try:
         codes = self.lengths[fieldname]
     except KeyError:
         return default
     code = codes[docnum] or default
     return byte_to_length(code)
Пример #37
0
 def min_length(self):
     """Decode and return the stored minimum field-length byte."""
     code = self._minlength
     return byte_to_length(code)
Пример #38
0
 def max_length(self):
     """Decode and return the stored maximum field-length byte."""
     code = self._maxlength
     return byte_to_length(code)
Пример #39
0
 def doc_field_length(self, docnum, fieldname, default=0):
     """Return the decoded length of ``fieldname`` in document ``docnum``
     read from the backing file, or ``default`` if the field has no
     stored lengths."""

     if fieldname not in self.starts:
         return default
     offset = self.starts[fieldname] + docnum
     return byte_to_length(self.dbfile.get_byte(offset))
Пример #40
0
 def get(self, docnum, fieldname, default=0):
     """Return the decoded length of ``fieldname`` in document ``docnum``
     read from the backing file, or ``default`` if the field has no
     stored lengths."""

     if fieldname not in self.starts:
         return default
     offset = self.starts[fieldname] + docnum
     return byte_to_length(self.dbfile.get_byte(offset))
Пример #41
0
def test_length_byte():
    """Small lengths (0-10) must round-trip exactly through the
    length-to-byte encoding."""
    source = list(range(11))
    roundtripped = [byte_to_length(code)
                    for code in (length_to_byte(n) for n in source)]
    assert_equal(source, roundtripped)
Пример #42
0
def _discreet(length):
    """Return ``length`` rounded to the value that survives the compressed
    byte encoding used for stored field lengths."""
    code = length_to_byte(length)
    return byte_to_length(code)
Пример #43
0
def _discreet(length):
    """Return ``length`` rounded to the value that survives the compressed
    byte encoding used for stored field lengths."""
    code = length_to_byte(length)
    return byte_to_length(code)