예제 #1
0
    def __init__(self, ix, postlimit, blocklimit, name=None):
        """
        :param ix: the Index object in which to write the new segment.
        :param postlimit: the maximum size for a run in the posting pool.
        :param blocklimit: the maximum number of postings in a posting block.
        :param name: the name of the segment.
        """

        self.index = ix
        self.schema = ix.schema
        self.storage = storage = ix.storage
        self.name = name or ix._next_segment_name()

        self.max_doc = 0

        self.pool = postpool.PostingPool(postlimit)

        # Create mappings of field numbers to the position of that field in the
        # lists of scorable and stored fields. For example, consider a schema
        # with fields (A, B, C, D, E, F). If B, D, and E are scorable, then the
        # list of scorable fields is (B, D, E). The _scorable_to_pos dictionary
        # would then map B -> 0, D -> 1, and E -> 2.
        self._scorable_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.scorable_fields()))
        self._stored_to_pos = dict(
            (fnum, i) for i, fnum in enumerate(self.schema.stored_fields()))

        # Create a temporary segment object just so we can access its
        # *_filename attributes (so if we want to change the naming convention,
        # we only have to do it in one place).
        tempseg = Segment(self.name, 0, 0, None)
        self.termtable = create_terms(storage, tempseg)
        self.docslist = create_storedfields(storage, tempseg)
        self.doclengths = None
        if self.schema.scorable_fields():
            self.doclengths = create_doclengths(storage, tempseg,
                                                len(self._scorable_to_pos))

        postfile = storage.create_file(tempseg.posts_filename)
        self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

        self.vectortable = None
        if self.schema.has_vectored_fields():
            # Table associating document fields with (postoffset, postcount)
            self.vectortable = create_vectors(storage, tempseg)
            vpostfile = storage.create_file(tempseg.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

        # Keep track of the total number of tokens (across all docs)
        # in each field
        self.field_length_totals = defaultdict(int)
def test_huge_postfile():
    with TempStorage("hugeindex") as st:
        pf = st.create_file("test.pst")

        gb5 = 5 * 1024 * 1024 * 1024
        pf.seek(gb5)
        pf.write("\x00\x00\x00\x00")
        assert_equal(pf.tell(), gb5 + 4)

        fpw = FilePostingWriter(pf)
        format = formats.Frequency(None)
        offset = fpw.start(format)
        for i in xrange(10):
            fpw.write(i, float(i), struct.pack("!I", i), 10)
        posttotal = fpw.finish()
        assert_equal(posttotal, 10)
        fpw.close()

        pf = st.open_file("test.pst")
        pfr = FilePostingReader(pf, offset, format)
        i = 0
        while pfr.is_active():
            assert_equal(pfr.id(), i)
            assert_equal(pfr.weight(), float(i))
            assert_equal(pfr.value(), struct.pack("!I", i))
            pfr.next()
            i += 1
        pf.close()
예제 #3
0
    def roundtrip(self, postings, format, astype):
        postfile = self.make_file(astype)
        readback = None
        try:
            fpw = FilePostingWriter(postfile, blocklimit=8)
            fpw.start(format)
            for id, value in postings:
                fpw.write(id, format.encode(value))
            fpw.close()

            postfile = self.open_file(astype)
            fpr = FilePostingReader(postfile, 0, format)
            readback = list(fpr.all_as(astype))
            fpr.close()
        finally:
            self.delete_file(astype)
        return readback
예제 #4
0
def roundtrip(postings, format, astype):
    with TempStorage("roundtrip") as st:
        postfile = st.create_file(astype)
        getweight = format.decoder("weight")
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for id, value in postings:
            v = format.encode(value)
            fpw.write(id, getweight(v), v, 0)
        fpw.finish()
        fpw.close()

        postfile = st.open_file(astype)
        fpr = FilePostingReader(postfile, 0, format)
        readback = list(fpr.items_as(astype))
        postfile.close()
        return readback
예제 #5
0
def test_readwrite():
    with TempStorage("readwrite") as st:
        format = Frequency()
        postings = make_postings()

        postfile = st.create_file("readwrite")
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for id, freq in postings:
            fpw.write(id, float(freq), format.encode(freq), 0)
        fpw.finish()
        fpw.close()

        postfile = st.open_file("readwrite")
        fpr = FilePostingReader(postfile, 0, format)
        assert_equal(postings, list(fpr.items_as("frequency")))
        postfile.close()
예제 #6
0
    def test_readwrite(self):
        format = Frequency(None)
        postings = self.make_postings()

        postfile = self.make_file("readwrite")
        try:
            fpw = FilePostingWriter(postfile, blocklimit=8)
            fpw.start(format)
            for id, freq in postings:
                fpw.write(id, format.encode(freq))
            fpw.close()

            postfile = self.open_file("readwrite")
            fpr = FilePostingReader(postfile, 0, format)
            #self.assertEqual(postings, list(fpr.items_as("frequency")))
            fpr.close()
        finally:
            self.delete_file("readwrite")
예제 #7
0
def test_lowlevel_block_writing():
    st = RamStorage()
    f = st.create_file("postfile")
    fpw = FilePostingWriter(f, blocklimit=4)
    fmt = formats.Frequency()
    fpw.start(fmt)
    fpw.write(0, 1.0, fmt.encode(1.0), 1)
    fpw.write(1, 2.0, fmt.encode(2.0), 2)
    fpw.write(2, 12.0, fmt.encode(12.0), 6)
    fpw.write(5, 6.5, fmt.encode(6.5), 420)

    fpw.write(11, 1.5, fmt.encode(1.5), 1)
    fpw.write(12, 2.5, fmt.encode(2.5), 2)
    fpw.write(26, 100.5, fmt.encode(100.5), 21)
    fpw.write(50, 8.0, fmt.encode(8.0), 1020)
    ti = fpw.finish()

    assert_equal(ti.weight(), 134.0)
    assert_equal(ti.doc_frequency(), 8)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(1020)))
    assert_equal(ti.max_weight(), 100.5)
    assert_equal(ti.max_wol(), 100.5 / byte_to_length(length_to_byte(21)))
예제 #8
0
    def __init__(self, ix, poolclass=None, procs=0, blocklimit=128,
                 timeout=0.0, delay=0.1, name=None, _l=True, **poolargs):

        self.writelock = None
        if _l:
            self.writelock = ix.lock("WRITELOCK")
            if not try_for(self.writelock.acquire, timeout=timeout, delay=delay):
                raise LockError
        self.readlock = ix.lock("READLOCK")

        info = ix._read_toc()
        self.schema = info.schema
        self.segments = info.segments
        self.storage = ix.storage
        self.indexname = ix.indexname
        self.is_closed = False

        self.blocklimit = blocklimit
        self.segment_number = info.segment_counter + 1
        self.generation = info.generation + 1

        self._doc_offsets = []
        base = 0
        for s in self.segments:
            self._doc_offsets.append(base)
            base += s.doc_count_all()

        self.name = name or Segment.basename(self.indexname, self.segment_number)
        self.docnum = 0
        self.fieldlength_totals = defaultdict(int)
        self._added = False
        self._unique_cache = {}

        # Create a temporary segment to use its .*_filename attributes
        segment = Segment(self.name, self.generation, 0, None, None)

        # Terms index
        tf = self.storage.create_file(segment.termsindex_filename)
        ti = TermIndexWriter(tf)
        # Term postings file
        pf = self.storage.create_file(segment.termposts_filename)
        pw = FilePostingWriter(pf, blocklimit=blocklimit)
        # Terms writer
        self.termswriter = TermsWriter(self.schema, ti, pw)

        if self.schema.has_vectored_fields():
            # Vector index
            vf = self.storage.create_file(segment.vectorindex_filename)
            self.vectorindex = TermVectorWriter(vf)

            # Vector posting file
            vpf = self.storage.create_file(segment.vectorposts_filename)
            self.vpostwriter = FilePostingWriter(vpf, stringids=True)
        else:
            self.vectorindex = None
            self.vpostwriter = None

        # Stored fields file
        sf = self.storage.create_file(segment.storedfields_filename)
        self.storedfields = StoredFieldWriter(sf, self.schema.stored_names())

        # Field lengths file
        self.lengthfile = self.storage.create_file(segment.fieldlengths_filename)

        # Create the pool
        if poolclass is None:
            if procs > 1:
                from whoosh.filedb.multiproc import MultiPool
                poolclass = MultiPool
            else:
                poolclass = TempfilePool
        self.pool = poolclass(self.schema, procs=procs, **poolargs)