Exemplo n.º 1
0
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), the writer opened by this function
        is committed with ``merge=False`` after the word graph files have
        been written and the schema updated. If False, the writer is left
        uncommitted.
    """

    from whoosh.filedb.filereading import SegmentReader

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema

    # Build one word graph file per existing segment, containing the terms
    # of every requested field.
    for segment in writer.segments:
        reader = SegmentReader(storage, schema, segment)
        try:
            dawgfile = segment.create_file(storage, ".dag")
            dawg = DawgBuilder(dawgfile, field_root=True)
            for fieldname in fieldnames:
                # Each word is stored as a key sequence rooted at its field
                # name: (fieldname, char, char, ...)
                prefix = (fieldname,)
                for word in reader.lexicon(fieldname):
                    dawg.insert(prefix + tuple(word))
            dawg.close()
        finally:
            # Release the segment reader even if building the graph failed
            reader.close()

    # Mark the fields as spelled so the schema agrees with the new files
    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
Exemplo n.º 2
0
class StdFieldWriter(base.FieldWriter):
    """Writes the terms and postings of a segment's fields to storage.

    On construction this creates a term index file and a postings file for
    the segment. A DAWG (word graph) file is only created the first time a
    spelling word is inserted.

    The methods rely on being called in the order ``start_field`` ->
    (``start_term`` -> ``add`` ... -> ``finish_term``) ... -> ``close``:
    ``start_term`` needs the format set by ``start_field``, and ``add`` needs
    the block created by ``start_term``.
    """

    def __init__(self,
                 storage,
                 segment,
                 blocklimit=128,
                 compression=3,
                 inlinelimit=1):
        """
        :param storage: the storage object passed to ``segment.create_file``
            to create the output files.
        :param segment: the segment object whose ``create_file`` method is
            used to create the output files.
        :param blocklimit: a block of postings is written to the postings
            file once it contains more than this many postings.
        :param compression: passed as the ``compression`` argument when a
            block is written to the postings file.
        :param inlinelimit: a term's postings are only considered for
            inlining into its term info when the pending block holds fewer
            than this many postings (see ``finish_term``).
        """
        self.storage = storage
        self.segment = segment
        self.fieldname = None
        self.text = None
        self.field = None
        self.format = None
        self.spelling = False

        tifile = segment.create_file(storage, StdCodec.TERMS_EXT)
        self.termsindex = TermIndexWriter(tifile)
        self.postfile = segment.create_file(storage, StdCodec.POSTS_EXT)

        # We'll wait to create the DAWG builder until someone actually adds
        # a spelled field
        self.dawg = None

        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit
        self.block = None
        self.terminfo = None

        # Per-term bookkeeping; (re)set by _start_blocklist for every term
        self.blockcount = 0
        self.startoffset = None

    def _make_dawg_files(self):
        """Creates the segment's DAWG file and the builder that writes to it.
        """
        dawgfile = self.segment.create_file(self.storage, StdCodec.DAWG_EXT)
        self.dawg = DawgBuilder(dawgfile, field_root=True)

    def _reset_block(self):
        """Replaces the current block with a new empty one sized for the
        current field's format.
        """
        self.block = StdBlock(self.format.posting_size)

    def _write_block(self):
        """Records the current block in the term info, writes it to the
        postings file, and starts a new empty block.
        """
        self.terminfo.add_block(self.block)
        self.block.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        """Starts a new list of posting blocks at the current position of the
        postings file.
        """
        postfile = self.postfile
        self._reset_block()

        # Magic number
        self.startoffset = postfile.tell()
        postfile.write(StdBlock.magic)
        # Placeholder for block count
        self.blockcount = 0
        postfile.write_uint(0)

    def _finish_blocklist(self):
        """Writes any pending postings and fills in the block count that
        ``_start_blocklist`` left as a placeholder.
        """
        if self.block:
            self._write_block()

        # Seek back to the start of this list of posting blocks and write the
        # number of blocks
        postfile = self.postfile
        postfile.flush()
        here = postfile.tell()
        # Skip over the magic number (assumes StdBlock.magic is 4 bytes long
        # -- TODO confirm)
        postfile.seek(self.startoffset + 4)
        postfile.write_uint(self.blockcount)
        postfile.seek(here)

        self.block = None

    def start_field(self, fieldname, fieldobj):
        """Sets the field that subsequent terms belong to.

        :param fieldname: the name of the field.
        :param fieldobj: the field object; its ``format``, ``spelling`` and
            ``separate_spelling()`` are read here.
        """
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format
        # Terms are only added to the word graph automatically when the field
        # is spelled and does not report separate spelling
        self.spelling = fieldobj.spelling and not fieldobj.separate_spelling()

    def start_term(self, text):
        """Begins a new term in the current field.

        :param text: the text of the term.
        :raises Exception: if the previous term has not been finished.
        """
        if self.block is not None:
            raise Exception("Called start_term in a block")
        self.text = text
        self.terminfo = base.FileTermInfo()
        if self.spelling:
            if self.dawg is None:
                self._make_dawg_files()
            self.dawg.insert((self.fieldname, ) + tuple(text))
        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        """Adds a posting to the current term, writing the current block to
        the postings file once it holds more than ``blocklimit`` postings.
        """
        self.block.add(docnum, weight, valuestring, length)
        if len(self.block) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        """Inserts a word into the word graph under the given field name,
        creating the DAWG file first if necessary.
        """
        if self.dawg is None:
            self._make_dawg_files()
        self.dawg.insert((fieldname, ) + tuple(text))

    def finish_term(self):
        """Finishes the current term and adds it to the term index.

        :raises Exception: if no term is in progress.
        """
        if self.block is None:
            raise Exception("Called finish_term when not in a block")
        block = self.block
        terminfo = self.terminfo
        # Inline only when nothing has been written for this term yet and the
        # pending block is truthy and shorter than inlinelimit.
        # NOTE(review): if a StdBlock's truthiness follows its length, the
        # default inlinelimit=1 can never satisfy this test -- confirm that
        # is intended.
        if self.blockcount < 1 and block and len(block) < self.inlinelimit:
            # Inline the single block
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            self._finish_blocklist()
            postings = self.startoffset

        self.block = None
        terminfo.postings = postings
        self.termsindex.add((self.fieldname, self.text), terminfo)

    def close(self):
        """Closes the term index, the postings file and, if one was created,
        the DAWG builder.

        Each resource is closed even if closing an earlier one raises, so a
        failure does not leave the remaining files open.
        """
        try:
            self.termsindex.close()
        finally:
            try:
                self.postfile.close()
            finally:
                if self.dawg is not None:
                    self.dawg.close()
Exemplo n.º 3
0
class StdFieldWriter(base.FieldWriter):
    """Writes the terms and postings of a segment's fields to storage.

    On construction this creates a term index file and a postings file for
    the segment. A DAWG (word graph) file is only created the first time a
    spelling word is inserted.

    The methods rely on being called in the order ``start_field`` ->
    (``start_term`` -> ``add`` ... -> ``finish_term``) ... -> ``close``:
    ``start_term`` needs the format set by ``start_field``, and ``add`` needs
    the block created by ``start_term``.
    """

    def __init__(self, storage, segment, blocklimit=128, compression=3,
                 inlinelimit=1):
        """
        :param storage: the storage object passed to ``segment.create_file``
            to create the output files.
        :param segment: the segment object whose ``create_file`` method is
            used to create the output files.
        :param blocklimit: a block of postings is written to the postings
            file once it contains more than this many postings.
        :param compression: passed as the ``compression`` argument when a
            block is written to the postings file.
        :param inlinelimit: a term's postings are only considered for
            inlining into its term info when the pending block holds fewer
            than this many postings (see ``finish_term``).
        """
        self.storage = storage
        self.segment = segment
        self.fieldname = None
        self.text = None
        self.field = None
        self.format = None
        self.spelling = False

        tifile = segment.create_file(storage, StdCodec.TERMS_EXT)
        self.termsindex = TermIndexWriter(tifile)
        self.postfile = segment.create_file(storage, StdCodec.POSTS_EXT)

        # We'll wait to create the DAWG builder until someone actually adds
        # a spelled field
        self.dawg = None

        self.blocklimit = blocklimit
        self.compression = compression
        self.inlinelimit = inlinelimit
        self.block = None
        self.terminfo = None

        # Per-term bookkeeping; (re)set by _start_blocklist for every term
        self.blockcount = 0
        self.startoffset = None

    def _make_dawg_files(self):
        """Creates the segment's DAWG file and the builder that writes to it.
        """
        dawgfile = self.segment.create_file(self.storage, StdCodec.DAWG_EXT)
        self.dawg = DawgBuilder(dawgfile, field_root=True)

    def _reset_block(self):
        """Replaces the current block with a new empty one sized for the
        current field's format.
        """
        self.block = StdBlock(self.format.posting_size)

    def _write_block(self):
        """Records the current block in the term info, writes it to the
        postings file, and starts a new empty block.
        """
        self.terminfo.add_block(self.block)
        self.block.to_file(self.postfile, compression=self.compression)
        self._reset_block()
        self.blockcount += 1

    def _start_blocklist(self):
        """Starts a new list of posting blocks at the current position of the
        postings file.
        """
        postfile = self.postfile
        self._reset_block()

        # Magic number
        self.startoffset = postfile.tell()
        postfile.write(StdBlock.magic)
        # Placeholder for block count
        self.blockcount = 0
        postfile.write_uint(0)

    def _finish_blocklist(self):
        """Writes any pending postings and fills in the block count that
        ``_start_blocklist`` left as a placeholder.
        """
        if self.block:
            self._write_block()

        # Seek back to the start of this list of posting blocks and write the
        # number of blocks
        postfile = self.postfile
        postfile.flush()
        here = postfile.tell()
        # Skip over the magic number (assumes StdBlock.magic is 4 bytes long
        # -- TODO confirm)
        postfile.seek(self.startoffset + 4)
        postfile.write_uint(self.blockcount)
        postfile.seek(here)

        self.block = None

    def start_field(self, fieldname, fieldobj):
        """Sets the field that subsequent terms belong to.

        :param fieldname: the name of the field.
        :param fieldobj: the field object; its ``format``, ``spelling`` and
            ``separate_spelling()`` are read here.
        """
        self.fieldname = fieldname
        self.field = fieldobj
        self.format = fieldobj.format
        # Terms are only added to the word graph automatically when the field
        # is spelled and does not report separate spelling
        self.spelling = fieldobj.spelling and not fieldobj.separate_spelling()

    def start_term(self, text):
        """Begins a new term in the current field.

        :param text: the text of the term.
        :raises Exception: if the previous term has not been finished.
        """
        if self.block is not None:
            raise Exception("Called start_term in a block")
        self.text = text
        self.terminfo = base.FileTermInfo()
        if self.spelling:
            if self.dawg is None:
                self._make_dawg_files()
            self.dawg.insert((self.fieldname,) + tuple(text))
        self._start_blocklist()

    def add(self, docnum, weight, valuestring, length):
        """Adds a posting to the current term, writing the current block to
        the postings file once it holds more than ``blocklimit`` postings.
        """
        self.block.add(docnum, weight, valuestring, length)
        if len(self.block) > self.blocklimit:
            self._write_block()

    def add_spell_word(self, fieldname, text):
        """Inserts a word into the word graph under the given field name,
        creating the DAWG file first if necessary.
        """
        if self.dawg is None:
            self._make_dawg_files()
        self.dawg.insert((fieldname,) + tuple(text))

    def finish_term(self):
        """Finishes the current term and adds it to the term index.

        :raises Exception: if no term is in progress.
        """
        if self.block is None:
            raise Exception("Called finish_term when not in a block")
        block = self.block
        terminfo = self.terminfo
        # Inline only when nothing has been written for this term yet and the
        # pending block is truthy and shorter than inlinelimit.
        # NOTE(review): if a StdBlock's truthiness follows its length, the
        # default inlinelimit=1 can never satisfy this test -- confirm that
        # is intended.
        if self.blockcount < 1 and block and len(block) < self.inlinelimit:
            # Inline the single block
            terminfo.add_block(block)
            vals = None if not block.values else tuple(block.values)
            postings = (tuple(block.ids), tuple(block.weights), vals)
        else:
            self._finish_blocklist()
            postings = self.startoffset

        self.block = None
        terminfo.postings = postings
        self.termsindex.add((self.fieldname, self.text), terminfo)

    def close(self):
        """Closes the term index, the postings file and, if one was created,
        the DAWG builder.

        Each resource is closed even if closing an earlier one raises, so a
        failure does not leave the remaining files open.
        """
        try:
            self.termsindex.close()
        finally:
            try:
                self.postfile.close()
            finally:
                if self.dawg is not None:
                    self.dawg.close()