Exemplo n.º 1
0
    def __init__(self, dbfile, offset, expand=True):
        """Reads the node stored at ``offset`` in ``dbfile``.

        The first byte holds flags: ``flags & 1`` marks the node as final,
        ``flags & 2`` means an edge table follows, ``flags & 4`` means every
        edge label is a single character code, and ``flags & 8`` means those
        codes are read with ``read_byte`` instead of ``read_ushort``.

        :param dbfile: file-like object the node is read from.
        :param offset: position of the node in ``dbfile``; also stored as
            ``self.id``.
        :param expand: if true, a string label longer than one character is
            stored under its first character as a ``PatNode`` built from the
            rest of the label and the edge's pointer.
        """
        self.id = offset
        self.dbfile = dbfile

        dbfile.seek(offset)
        flags = dbfile.read_byte()
        self.final = bool(flags & 1)
        self._edges = edges = {}
        if not flags & 2:
            # The edge-table flag is not set, so there is nothing more to read
            return

        single_chars = flags & 4
        byte_codes = flags & 8

        edge_count = dbfile.read_varint()

        # Every pointer is read up front; the labels are then read one per
        # iteration, in the same order.
        targets = dbfile.read_array("I", edge_count)
        for index in xrange(edge_count):
            target = targets[index]
            if not single_chars:
                label = utf8decode(dbfile.read_string())[0]
                if len(label) > 1 and expand:
                    edges[label[0]] = PatNode(dbfile, label[1:], target)
                else:
                    edges[label] = target
                continue

            if byte_codes:
                code = dbfile.read_byte()
            else:
                code = dbfile.read_ushort()
            edges[unichr(code)] = target
Exemplo n.º 2
0
    def from_file(file, stringids=False):
        """Reads one block header from ``file``, starting at its current
        position, and returns the result as a ``BlockInfo``.

        :param file: file-like object positioned at the start of a header.
        :param stringids: if true, the maximum ID that follows the
            fixed-size header is read with ``read_string`` and decoded with
            ``utf8decode``; otherwise it is read with ``read_uint``.
        :returns: a ``BlockInfo`` whose ``dataoffset`` is the file position
            just past the maximum ID.
        """
        start = file.tell()

        header_struct = BlockInfo._struct
        raw_header = file.read(header_struct.size)
        (flags, _, _, relative_next, idslen, weightslen, postcount, maxweight,
         maxwol, _, lengthbyte) = header_struct.unpack(raw_header)

        # With no flags set the next offset is unpacked from the first eight
        # header bytes; otherwise it is relative to where the header started.
        if flags:
            nextoffset = start + relative_next
        else:
            nextoffset = unpack_long(raw_header[:8])

        assert postcount > 0
        minlength = byte_to_length(lengthbyte)

        maxid = (utf8decode(file.read_string())[0] if stringids
                 else file.read_uint())

        dataoffset = file.tell()
        return BlockInfo(flags=flags, nextoffset=nextoffset,
                         postcount=postcount, maxweight=maxweight,
                         maxwol=maxwol, maxid=maxid, minlength=minlength,
                         dataoffset=dataoffset, idslen=idslen,
                         weightslen=weightslen)
Exemplo n.º 3
0
 def from_file(file, stringids=False):
     """Parses the block header found at the current position of ``file``
     and wraps the decoded values in a ``BlockInfo``.

     :param file: file-like object positioned at the start of a header.
     :param stringids: selects how the maximum ID after the fixed-size
         header is read: ``read_string`` plus ``utf8decode`` when true,
         ``read_uint`` when false.
     :returns: a ``BlockInfo``; its ``dataoffset`` is the file position
         reached once the maximum ID has been read.
     """
     here = file.tell()

     packed = file.read(BlockInfo._struct.size)
     unpacked = BlockInfo._struct.unpack(packed)
     (flags, _, _, delta, idslen, weightslen, postcount, maxweight,
      maxwol, _, minlength_byte) = unpacked

     # A header with flags set stores the next offset relative to ``here``;
     # without flags it is unpacked from the first eight header bytes.
     nextoffset = here + delta if flags else unpack_long(packed[:8])

     assert postcount > 0
     minlength = byte_to_length(minlength_byte)

     if stringids:
         encoded_maxid = file.read_string()
         maxid = utf8decode(encoded_maxid)[0]
     else:
         maxid = file.read_uint()

     dataoffset = file.tell()
     return BlockInfo(
         flags=flags,
         nextoffset=nextoffset,
         postcount=postcount,
         maxweight=maxweight,
         maxwol=maxwol,
         maxid=maxid,
         minlength=minlength,
         dataoffset=dataoffset,
         idslen=idslen,
         weightslen=weightslen)
Exemplo n.º 4
0
    def __init__(self, dbfile, offset, expand=True):
        """Builds the node by reading its record at ``offset`` in ``dbfile``.

        The record begins with a flags byte. ``& 1`` sets ``self.final``,
        ``& 2`` signals that edges follow, ``& 4`` selects single character
        code labels over string labels, and ``& 8`` selects ``read_byte``
        over ``read_ushort`` for reading those codes.

        :param dbfile: file-like object holding the node.
        :param offset: where the node starts in ``dbfile``; kept as
            ``self.id``.
        :param expand: when true, string labels longer than one character
            are keyed by their first character and the edge value becomes a
            ``PatNode`` created from the remaining characters and the
            pointer.
        """
        self.id = offset
        self.dbfile = dbfile

        dbfile.seek(offset)
        flagbyte = dbfile.read_byte()
        self.final = bool(flagbyte & 1)
        self._edges = {}
        has_edges = flagbyte & 2
        if has_edges:
            one_char_labels = flagbyte & 4
            narrow_codes = flagbyte & 8

            count = dbfile.read_varint()

            # The pointers are read as one array before any label is read
            pointers = dbfile.read_array("I", count)
            for n in xrange(count):
                pointer = pointers[n]
                if one_char_labels:
                    charnum = (dbfile.read_byte() if narrow_codes
                               else dbfile.read_ushort())
                    self._edges[unichr(charnum)] = pointer
                    continue

                label = utf8decode(dbfile.read_string())[0]
                if len(label) > 1 and expand:
                    self._edges[label[0]] = PatNode(dbfile, label[1:], pointer)
                else:
                    self._edges[label] = pointer
Exemplo n.º 5
0
    def _read_ids(self, offset, postcount):
        """Reads ``postcount`` IDs starting at ``offset`` in the posting file.

        String IDs (``self.stringids``) are read one at a time with
        ``read_string`` and decoded with ``utf8decode``; otherwise the IDs
        are fetched as an ``"I"`` array with ``get_array``.

        :returns: an ``(ids, offset)`` tuple, where ``offset`` is the
            position just past the IDs.
        """
        postfile = self.postfile
        if not self.stringids:
            numeric_ids = postfile.get_array(offset, "I", postcount)
            return (numeric_ids, offset + _INT_SIZE * postcount)

        postfile.seek(offset)
        read_string = postfile.read_string
        string_ids = [utf8decode(read_string())[0]
                      for _ in xrange(postcount)]
        return (string_ids, postfile.tell())
Exemplo n.º 6
0
    def _read_ids(self, offset, postcount):
        """Loads the IDs of a block from the posting file.

        :param offset: position in the posting file where the IDs start.
        :param postcount: how many IDs to read.
        :returns: ``(ids, end)``: the IDs, and the position following them.
            For string IDs the end is taken from the file's ``tell()``; for
            array IDs it is computed as ``offset + _INT_SIZE * postcount``.
        """
        source = self.postfile
        if self.stringids:
            source.seek(offset)
            next_string = source.read_string
            ids = []
            for _ in xrange(postcount):
                ids.append(utf8decode(next_string())[0])
            end = source.tell()
        else:
            ids = source.get_array(offset, "I", postcount)
            end = offset + _INT_SIZE * postcount

        return (ids, end)
Exemplo n.º 7
0
    def _read_ids(self, offset, postcount, idslen):
        """Reads a block's IDs starting at ``offset`` in the posting file.

        Three layouts are handled: string IDs (when ``self.stringids`` is
        set), an ``"I"`` array built from ``idslen`` bytes passed through
        ``decompress`` (when ``idslen`` is non-zero), or a plain ``"I"``
        array of ``postcount`` items.

        :returns: an ``(ids, newoffset)`` tuple, where ``newoffset`` is the
            position just past the IDs.
        """
        postfile = self.postfile
        postfile.seek(offset)

        if self.stringids:
            read_string = postfile.read_string
            string_ids = [utf8decode(read_string())[0]
                          for _ in xrange(postcount)]
            return (string_ids, postfile.tell())

        if idslen:
            packed_ids = array("I")
            packed_ids.fromstring(decompress(postfile.read(idslen)))
            return (packed_ids, offset + idslen)

        plain_ids = postfile.read_array("I", postcount)
        return (plain_ids, offset + _INT_SIZE * postcount)
Exemplo n.º 8
0
def decode_posting(posting):
    """Decodes an encoded posting string into a
    (field_number, text, document_number, frequency, datastring) tuple.

    The posting is read as: a field number taken from the first
    ``_USHORT_SIZE`` items, the term text up to the first ``chr(0)``, two
    ints (document number and frequency) right after that terminator, and
    everything left over as the data string.

    :param posting: the encoded posting string.
    :raises ValueError: if ``posting`` has no ``chr(0)`` terminator after
        the field number.
    """

    fieldnum = unpack_ushort(posting[:_USHORT_SIZE])[0]

    zero = posting.find(chr(0), _USHORT_SIZE)
    if zero < 0:
        # find() returns -1 when the terminator is missing; carrying on
        # would slice with a negative index and decode the wrong bytes.
        raise ValueError("posting has no text terminator: %r" % (posting,))
    text = utf8decode(posting[_USHORT_SIZE:zero])[0]

    # The two ints sit immediately after the terminator
    metastart = zero + 1
    metaend = metastart + _INT_SIZE * 2
    doc, freq = unpack2ints(posting[metastart:metaend])

    datastring = posting[metaend:]

    return fieldnum, text, doc, freq, datastring
Exemplo n.º 9
0
def decode_posting(posting):
    """Decodes an encoded posting string into a
    (field_number, text, document_number, frequency, datastring) tuple.

    Layout, in order: the field number (first ``_USHORT_SIZE`` items), the
    term text, a ``chr(0)`` terminator, two ints holding the document number
    and the frequency, then the data string, which is whatever remains.

    :param posting: the encoded posting string.
    :raises ValueError: if no ``chr(0)`` terminator is found after the field
        number.
    """

    fieldnum = unpack_ushort(posting[:_USHORT_SIZE])[0]

    zero = posting.find(chr(0), _USHORT_SIZE)
    if zero == -1:
        # Without this check the -1 from find() would be used as a slice
        # bound, silently producing garbage instead of reporting the
        # malformed posting.
        raise ValueError("posting has no text terminator: %r" % (posting,))
    text = utf8decode(posting[_USHORT_SIZE:zero])[0]

    # Document number and frequency follow the terminator directly
    metastart = zero + 1
    metaend = metastart + _INT_SIZE * 2
    doc, freq = unpack2ints(posting[metastart:metaend])

    datastring = posting[metaend:]

    return fieldnum, text, doc, freq, datastring
Exemplo n.º 10
0
    def _read_block_header(self, offset):
        """Reads the header of the block at ``offset`` in the posting file.

        The header is read as: the block's maximum ID (a decoded string when
        ``self.stringids`` is set, otherwise a uint), a uint next-block
        offset, and a one-byte posting count.

        :returns: a ``(maxid, nextoffset, postcount, offset)`` tuple, where
            the final item is the position just past the header.
        """
        postfile = self.postfile
        if self.stringids:
            postfile.seek(offset)
            maxid = utf8decode(postfile.read_string())[0]
            cursor = postfile.tell()
        else:
            maxid = postfile.get_uint(offset)
            cursor = offset + _INT_SIZE

        nextoffset = postfile.get_uint(cursor)
        cursor += _INT_SIZE

        postcount = postfile.get_byte(cursor)
        assert postcount > 0

        return (maxid, nextoffset, postcount, cursor + 1)
Exemplo n.º 11
0
    def from_file(cls, postfile, stringids=False):
        """Creates a block from the header at the current position of
        ``postfile``.

        :param postfile: file-like object positioned at a block header.
        :param stringids: passed on to the constructor; also selects whether
            the maximum ID is read as a decoded string or with
            ``read_uint``.
        :returns: the new block, with ``dataoffset`` set to the file
            position just past the maximum ID.
        """
        start = postfile.tell()
        block = cls(postfile, stringids=stringids)
        block.postfile = postfile

        header_struct = cls._struct
        fields = header_struct.unpack(postfile.read(header_struct.size))
        # The stored next offset is added to where the header started
        block.nextoffset = start + fields[3]
        block.idslen = fields[4]
        block.wtslen = fields[5]
        block.count = fields[6]
        block.maxweight = fields[7]
        block.minlength = byte_to_length(fields[10])

        block.maxid = (utf8decode(postfile.read_string())[0] if stringids
                       else postfile.read_uint())
        block.dataoffset = postfile.tell()
        return block
Exemplo n.º 12
0
    def from_file(cls, postfile, stringids=False):
        """Reads a block header from ``postfile`` and returns a block
        instance populated from it.

        The header is unpacked with ``cls._struct``; items 3 to 7 and item
        10 of the unpacked tuple supply the next offset (relative to the
        header start), ``idslen``, ``wtslen``, ``count``, ``maxweight`` and
        the byte that ``byte_to_length`` turns into ``minlength``.

        :param postfile: file-like object positioned at a block header.
        :param stringids: passed on to the constructor; when true the
            maximum ID is read with ``read_string`` and decoded, otherwise
            with ``read_uint``.
        """
        header_start = postfile.tell()
        newblock = cls(postfile, stringids=stringids)
        newblock.postfile = postfile

        raw_header = postfile.read(cls._struct.size)
        values = cls._struct.unpack(raw_header)
        newblock.nextoffset = header_start + values[3]
        newblock.idslen = values[4]
        newblock.wtslen = values[5]
        newblock.count = values[6]
        newblock.maxweight = values[7]
        newblock.minlength = byte_to_length(values[10])

        if not stringids:
            newblock.maxid = postfile.read_uint()
        else:
            newblock.maxid = utf8decode(postfile.read_string())[0]
        # Whatever follows the maximum ID is the block's data
        newblock.dataoffset = postfile.tell()
        return newblock
Exemplo n.º 13
0
    def _read_block_header(self, offset):
        """Decodes the block header stored at ``offset`` in the posting
        file.

        :param offset: position of the header in the posting file.
        :returns: ``(maxid, nextoffset, postcount, offset)``. ``maxid`` is a
            decoded string when ``self.stringids`` is set and a uint
            otherwise; the returned ``offset`` is the position right after
            the one-byte posting count.
        """
        source = self.postfile
        if not self.stringids:
            maxid = source.get_uint(offset)
            next_pos = offset + _INT_SIZE
        else:
            source.seek(offset)
            maxid = utf8decode(source.read_string())[0]
            next_pos = source.tell()

        nextoffset = source.get_uint(next_pos)
        count_pos = next_pos + _INT_SIZE

        postcount = source.get_byte(count_pos)
        assert postcount > 0
        data_pos = count_pos + 1

        return (maxid, nextoffset, postcount, data_pos)
Exemplo n.º 14
0
    def from_file(cls, postfile, stringids=False):
        """Creates a block from the header at the current position of
        ``postfile``.

        :param postfile: file-like object positioned at a block header.
        :param stringids: passed on to the constructor; also selects whether
            the maximum ID is read as a decoded string or with
            ``read_uint``.
        :returns: the new block, with ``dataoffset`` set to the file
            position just past the maximum ID.
        """
        start = postfile.tell()
        block = cls(postfile, stringids=stringids)

        raw_header = postfile.read(cls._struct.size)
        (flags, _, _, relative_next, idslen, weightslen, postcount, maxweight,
         maxwol, _, lengthbyte) = cls._struct.unpack(raw_header)

        block.idslen = idslen
        block.weightslen = weightslen
        block.postcount = postcount
        block.maxweight = maxweight
        block.maxwol = maxwol

        # The stored next offset is added to where the header started
        block.nextoffset = start + relative_next
        block.minlength = byte_to_length(lengthbyte)

        assert block.postcount > 0, "postcount=%r" % block.postcount

        block.maxid = (utf8decode(postfile.read_string())[0] if stringids
                       else postfile.read_uint())

        block.dataoffset = postfile.tell()

        return block
Exemplo n.º 15
0
 def from_file(cls, postfile, stringids=False):
     """Reads a block header from ``postfile`` and returns a block instance
     populated from it.

     The fixed-size part is unpacked with ``cls._struct``; the maximum ID
     follows it and is read with ``read_string`` plus ``utf8decode`` when
     ``stringids`` is true, or with ``read_uint`` otherwise.

     :param postfile: file-like object positioned at a block header.
     :param stringids: passed on to the constructor and used to pick how
         the maximum ID is read.
     """
     header_start = postfile.tell()
     newblock = cls(postfile, stringids=stringids)

     packed = postfile.read(cls._struct.size)
     values = cls._struct.unpack(packed)
     (flags, _, _, delta, ids_size, weights_size, count, top_weight,
      top_wol, _, minlength_byte) = values

     newblock.idslen = ids_size
     newblock.weightslen = weights_size
     newblock.postcount = count
     newblock.maxweight = top_weight
     newblock.maxwol = top_wol

     newblock.nextoffset = header_start + delta
     newblock.minlength = byte_to_length(minlength_byte)

     assert newblock.postcount > 0, "postcount=%r" % newblock.postcount

     if not stringids:
         newblock.maxid = postfile.read_uint()
     else:
         newblock.maxid = utf8decode(postfile.read_string())[0]

     # Whatever follows the maximum ID is the block's data
     newblock.dataoffset = postfile.tell()

     return newblock
Exemplo n.º 16
0
    def read_ids(self):
        """Reads this block's IDs from the posting file and stores them.

        Reading starts at ``self.dataoffset``. The IDs are stored on
        ``self.ids`` and the position just past them on
        ``self.weights_offset``.

        :returns: the IDs that were read.
        """
        postfile = self.postfile
        start = self.dataoffset
        count = self.count
        postfile.seek(start)

        if self.stringids:
            read_string = postfile.read_string
            ids = [utf8decode(read_string())[0] for _ in xrange(count)]
            end = postfile.tell()
        else:
            idslen = self.idslen
            if idslen:
                # The IDs occupy idslen bytes and go through decompress()
                ids = array("I")
                array_frombytes(ids, decompress(postfile.read(idslen)))
                if IS_LITTLE:
                    ids.byteswap()
                end = start + idslen
            else:
                ids = postfile.read_array("I", count)
                end = start + _INT_SIZE * count

        self.ids = ids
        self.weights_offset = end
        return ids
Exemplo n.º 17
0
    def read_ids(self):
        """Loads the block's IDs, remembers them on ``self.ids`` and returns
        them.

        The IDs start at ``self.dataoffset`` in the posting file. Depending
        on ``self.stringids`` and ``self.idslen`` they are read as decoded
        strings, as an ``"I"`` array built from ``self.idslen`` bytes passed
        through ``decompress`` (byte-swapped when ``IS_LITTLE`` is true), or
        as a plain ``"I"`` array of ``self.postcount`` items.
        ``self.weights_offset`` is set to the position following the IDs.
        """
        source = self.postfile
        base = self.dataoffset
        howmany = self.postcount
        source.seek(base)

        if self.stringids:
            next_string = source.read_string
            loaded = []
            for _ in xrange(howmany):
                loaded.append(utf8decode(next_string())[0])
            after = source.tell()
        elif self.idslen:
            loaded = array("I")
            loaded.fromstring(decompress(source.read(self.idslen)))
            if IS_LITTLE:
                loaded.byteswap()
            after = base + self.idslen
        else:
            loaded = source.read_array("I", howmany)
            after = base + _INT_SIZE * howmany

        self.ids = loaded
        self.weights_offset = after
        return loaded
Exemplo n.º 18
0
 def keydecoder(self, v):
     """Decodes a stored key into a ``(field name, text)`` tuple.

     The first two items of ``v`` are unpacked with ``unpack_ushort`` and
     the result is looked up in ``self.names``; the rest of ``v`` is decoded
     with ``utf8decode``. ``v`` must be an instance of ``bytes_type``.
     """
     assert isinstance(v, bytes_type)
     names = self.names
     fieldnum = unpack_ushort(v[:2])[0]
     fieldname = names[fieldnum]
     text = utf8decode(v[2:])[0]
     return (fieldname, text)
Exemplo n.º 19
0
def within(graph, text, k=1, prefix=0, address=None):
    """Yields a series of keys in the given graph within ``k`` edit distance of
    ``text``. If ``prefix`` is greater than 0, all keys must match the first
    ``prefix`` characters of ``text``.

    The search is a depth-first walk over ``(address, k, i, sofar, accept)``
    states kept on an explicit stack. The edits tried from each state are
    insertion, deletion, replacement and transposition of adjacent labels;
    each one uses up one unit of ``k``.

    :param graph: the graph to search; this function uses its ``_root``
        attribute and its ``find_path``, ``find_arc`` and ``arc_dict``
        methods.
    :param text: the text to match; it is converted with ``to_labels``
        before searching.
    :param k: the maximum number of edits allowed.
    :param prefix: the number of leading labels of ``text`` that must match
        exactly.
    :param address: the address to start from; defaults to ``graph._root``.

    NOTE(review): only states are de-duplicated (via ``seen``), not yielded
    keys, so the same key can presumably be yielded more than once --
    confirm whether callers rely on this.
    """

    text = to_labels(text)
    if address is None:
        address = graph._root

    sofar = emptybytes
    accept = False
    if prefix:
        # Follow the required prefix exactly; if the graph has no such path
        # there is nothing to yield
        prefixchars = text[:prefix]
        arc = graph.find_path(prefixchars, address=address)
        if arc is None:
            return
        sofar = emptybytes.join(prefixchars)
        address = arc.target
        accept = arc.accept

    # Each state is (address, edits left, index into text, labels collected
    # so far, accept flag of the arc that led here)
    stack = [(address, k, prefix, sofar, accept)]
    seen = set()
    while stack:
        state = stack.pop()
        # Have we already tried this state?
        if state in seen:
            continue
        seen.add(state)

        # Note that this rebinds the ``k`` and ``address`` parameters
        address, k, i, sofar, accept = state
        # If we're at the end of the text (or deleting enough chars would get
        # us to the end and still within K), and we're in the accept state,
        # yield the current result
        if (len(text) - i <= k) and accept:
            yield utf8decode(sofar)[0]

        # If we're in the stop state, give up
        if address is None:
            continue

        # Exact match
        if i < len(text):
            arc = graph.find_arc(address, text[i])
            if arc:
                stack.append((arc.target, k, i + 1, sofar + text[i],
                              arc.accept))
        # If K is already 0, can't do any more edits
        if k < 1:
            continue
        # Every state pushed below this line carries one fewer edit
        k -= 1

        arcs = graph.arc_dict(address)
        # Insertions
        stack.extend((arc.target, k, i, sofar + char, arc.accept)
                     for char, arc in iteritems(arcs))

        # Deletion, replacement, and transpo only work before the end
        if i >= len(text):
            continue
        char = text[i]

        # Deletion (stays at the same address; the accept flag is pushed as
        # False)
        stack.append((address, k, i + 1, sofar, False))
        # Replacement
        for char2, arc in iteritems(arcs):
            if char2 != char:
                stack.append((arc.target, k, i + 1, sofar + char2, arc.accept))
        # Transposition
        if i < len(text) - 1:
            char2 = text[i + 1]
            if char != char2 and char2 in arcs:
                # Find arc from next char to this char
                target = arcs[char2].target
                if target:
                    arc = graph.find_arc(target, char)
                    if arc:
                        stack.append((arc.target, k, i + 2,
                                      sofar + char2 + char, arc.accept))
Exemplo n.º 20
0
    def prefix_string(self):
        """Returns the labels of the path from the root to the current arc,
        decoded to a unicode string.

        The labels come from ``prefix_bytes()`` and are decoded with
        ``utf8decode``.
        """

        raw_prefix = self.prefix_bytes()
        return utf8decode(raw_prefix)[0]
Exemplo n.º 21
0
    def peek_key_string(self):
        """Returns the next closest key in the graph, decoded to a unicode
        string.

        The key comes from ``peek_key_bytes()`` and is decoded with
        ``utf8decode``.
        """

        key_bytes = self.peek_key_bytes()
        return utf8decode(key_bytes)[0]
Exemplo n.º 22
0
 def keydecoder(self, v):
     """Decodes a stored key into a ``(field name, text)`` tuple.

     The first two items of ``v`` are unpacked with ``unpack_ushort`` and
     the result is looked up in ``self.names``; everything after them is
     decoded with ``utf8decode``.
     """
     names = self.names
     fieldnum = unpack_ushort(v[:2])[0]
     fieldname = names[fieldnum]
     text = utf8decode(v[2:])[0]
     return (fieldname, text)
Exemplo n.º 23
0
 def flatten_strings(self):
     """Returns a generator over the keys produced by ``self.flatten()``,
     each decoded with ``utf8decode``.
     """
     raw_keys = self.flatten()
     return (utf8decode(raw)[0] for raw in raw_keys)
Exemplo n.º 24
0
 def keydecoder(self, v):
     """Decodes a stored key into a ``(field name, text)`` tuple.

     A ``text_type`` key is first encoded to latin-1. The first two items
     are then unpacked with ``unpack_ushort`` and looked up in
     ``self.names``, and the rest is decoded with ``utf8decode``.
     """
     raw = v.encode('latin-1') if isinstance(v, text_type) else v
     names = self.names
     fieldnum = unpack_ushort(raw[:2])[0]
     fieldname = names[fieldnum]
     text = utf8decode(raw[2:])[0]
     return (fieldname, text)
Exemplo n.º 25
0
 def keydecoder(self, v):
     """Turns a stored key back into a ``(field name, text)`` pair.

     ``v[:2]`` goes through ``unpack_ushort`` to get the value used to look
     up the name in ``self.names``; ``v[2:]`` goes through ``utf8decode`` to
     get the text.
     """
     name_table = self.names
     number = unpack_ushort(v[:2])[0]
     name = name_table[number]
     decoded_text = utf8decode(v[2:])[0]
     return (name, decoded_text)
Exemplo n.º 26
0
    def peek_key_string(self):
        """Returns the next closest key in the graph as a unicode string,
        obtained by decoding the result of ``peek_key_bytes()`` with
        ``utf8decode``.
        """

        decoded = utf8decode(self.peek_key_bytes())
        # Only the first item of utf8decode's result is returned
        return decoded[0]
Exemplo n.º 27
0
def decode_termkey(key):
    """Splits an encoded term key into its field part and its text.

    The first ``_USHORT_SIZE`` items of ``key`` are passed to
    ``unpackushort``; the rest is decoded with ``utf8decode``.

    NOTE(review): the ``unpackushort`` result is returned as-is, while only
    item 0 of the ``utf8decode`` result is used -- confirm that the first
    element of the returned pair has the shape callers expect.
    """
    fieldpart = unpackushort(key[:_USHORT_SIZE])
    text = utf8decode(key[_USHORT_SIZE:])[0]
    return fieldpart, text
Exemplo n.º 28
0
 def flatten_strings(self):
     """Returns a generator that decodes each key from ``self.flatten()``
     with ``utf8decode`` and yields the decoded string.
     """
     flattened = self.flatten()
     decoded_keys = (utf8decode(key)[0] for key in flattened)
     return decoded_keys
Exemplo n.º 29
0
def within(graph, text, k=1, prefix=0, address=None):
    """Yields a series of keys in the given graph within ``k`` edit distance of
    ``text``. If ``prefix`` is greater than 0, all keys must match the first
    ``prefix`` characters of ``text``.

    States of the form ``(address, k, i, sofar, accept)`` are explored from
    an explicit stack (last pushed, first tried). From each state the
    function tries an exact match of the next label and, while edits
    remain, insertion, deletion, replacement and transposition of adjacent
    labels.

    :param graph: the graph to search; its ``_root`` attribute and its
        ``find_path``, ``find_arc`` and ``arc_dict`` methods are used.
    :param text: the text to match, converted with ``to_labels`` first.
    :param k: the maximum number of edits allowed.
    :param prefix: how many leading labels of ``text`` must match exactly.
    :param address: where to start in the graph; ``graph._root`` when None.

    NOTE(review): ``seen`` de-duplicates states, not results, so a key can
    presumably be yielded more than once -- confirm against callers.
    """

    text = to_labels(text)
    if address is None:
        address = graph._root

    sofar = emptybytes
    accept = False
    if prefix:
        # The prefix has to be matched exactly before any edits are tried;
        # with no such path in the graph the generator ends immediately
        prefixchars = text[:prefix]
        arc = graph.find_path(prefixchars, address=address)
        if arc is None:
            return
        sofar = emptybytes.join(prefixchars)
        address = arc.target
        accept = arc.accept

    # State layout: (address, edits left, index into text, labels collected
    # so far, accept flag of the arc that led here)
    stack = [(address, k, prefix, sofar, accept)]
    seen = set()
    while stack:
        state = stack.pop()
        # Have we already tried this state?
        if state in seen:
            continue
        seen.add(state)

        # The ``k`` and ``address`` parameters are rebound from the state
        address, k, i, sofar, accept = state
        # If we're at the end of the text (or deleting enough chars would get
        # us to the end and still within K), and we're in the accept state,
        # yield the current result
        if (len(text) - i <= k) and accept:
            yield utf8decode(sofar)[0]

        # If we're in the stop state, give up
        if address is None:
            continue

        # Exact match
        if i < len(text):
            arc = graph.find_arc(address, text[i])
            if arc:
                stack.append(
                    (arc.target, k, i + 1, sofar + text[i], arc.accept))
        # If K is already 0, can't do any more edits
        if k < 1:
            continue
        # All states pushed from here on have one fewer edit available
        k -= 1

        arcs = graph.arc_dict(address)
        # Insertions
        stack.extend((arc.target, k, i, sofar + char, arc.accept)
                     for char, arc in iteritems(arcs))

        # Deletion, replacement, and transpo only work before the end
        if i >= len(text):
            continue
        char = text[i]

        # Deletion (same address, next text position; the accept flag is
        # pushed as False)
        stack.append((address, k, i + 1, sofar, False))
        # Replacement
        for char2, arc in iteritems(arcs):
            if char2 != char:
                stack.append((arc.target, k, i + 1, sofar + char2, arc.accept))
        # Transposition
        if i < len(text) - 1:
            char2 = text[i + 1]
            if char != char2 and char2 in arcs:
                # Find arc from next char to this char
                target = arcs[char2].target
                if target:
                    arc = graph.find_arc(target, char)
                    if arc:
                        stack.append((arc.target, k, i + 2,
                                      sofar + char2 + char, arc.accept))
Exemplo n.º 30
0
 def keydecoder(self, v):
     """Turns a stored key back into a ``(field name, text)`` pair.

     ``v`` must be an instance of ``bytes_type``. ``v[:2]`` goes through
     ``unpack_ushort`` to get the value used to look up the name in
     ``self.names``; ``v[2:]`` goes through ``utf8decode`` to get the text.
     """
     assert isinstance(v, bytes_type)
     name_table = self.names
     number = unpack_ushort(v[:2])[0]
     name = name_table[number]
     decoded_text = utf8decode(v[2:])[0]
     return (name, decoded_text)
Exemplo n.º 31
0
def decode_termkey(key):
    """Decodes an encoded term key into a two-item tuple.

    The first item is the result of ``unpackushort`` applied to the leading
    ``_USHORT_SIZE`` items of ``key``; the second is the remainder of
    ``key`` decoded with ``utf8decode``.

    NOTE(review): unlike the text, the ``unpackushort`` result is not
    indexed before being returned -- verify this is intentional.
    """
    head = key[:_USHORT_SIZE]
    unpacked_head = unpackushort(head)
    tail = key[_USHORT_SIZE:]
    decoded_tail = utf8decode(tail)[0]
    return unpacked_head, decoded_tail
Exemplo n.º 32
0
    def prefix_string(self):
        """Returns, as a unicode string, the labels of the path from the
        root to the current arc, obtained by decoding the result of
        ``prefix_bytes()`` with ``utf8decode``.
        """

        decoded = utf8decode(self.prefix_bytes())
        # Only the first item of utf8decode's result is returned
        return decoded[0]