Пример #1
0
 def first_id(self, fieldname, text):
     """Return the ID of the first posting for the term ``fieldname:text``.

     The second element of the term's ``termsindex`` entry is either an
     integer, which is handed to ``FilePostingReader`` as the offset into
     ``self.postfile``, or a non-integer value whose ``[0][0]`` element is
     returned directly.
     """
     self._test_field(fieldname)
     posting_format = self.format(fieldname)

     location = self.termsindex[(fieldname, text)][1]
     if not isinstance(location, (int, long)):
         # Not a file offset: the ID is taken straight from the entry.
         return location[0][0]

     reader = FilePostingReader(self.postfile, location, posting_format)
     return reader.id()
def test_huge_postfile():
    """Check that postings written far into a very large file read back.

    The posting file is first extended to 5 GB + 4 bytes by seeking and
    writing four bytes (presumably to exercise offsets that do not fit in
    32 bits). Ten postings are then written and read back one at a time,
    and the number of postings read is verified at the end.
    """
    expected_count = 10
    with TempStorage("hugeindex") as st:
        pf = st.create_file("test.pst")

        # Push the write position past the 5 GB mark before writing postings.
        gb5 = 5 * 1024 * 1024 * 1024
        pf.seek(gb5)
        pf.write("\x00\x00\x00\x00")
        assert_equal(pf.tell(), gb5 + 4)

        fpw = FilePostingWriter(pf)
        format = formats.Frequency(None)
        offset = fpw.start(format)
        for i in xrange(expected_count):
            fpw.write(i, float(i), struct.pack("!I", i), 10)
        posttotal = fpw.finish()
        assert_equal(posttotal, expected_count)
        fpw.close()

        pf = st.open_file("test.pst")
        pfr = FilePostingReader(pf, offset, format)
        i = 0
        while pfr.is_active():
            assert_equal(pfr.id(), i)
            assert_equal(pfr.weight(), float(i))
            assert_equal(pfr.value(), struct.pack("!I", i))
            pfr.next()
            i += 1
        pf.close()

        # Without this check the loop above passes vacuously when the reader
        # is inactive from the start or stops early.
        assert_equal(i, expected_count)
Пример #3
0
 def first_ids(self, fieldname):
     """Yield ``(text, first_id)`` pairs for the terms of ``fieldname``.

     Entries are read from ``self.termsindex.items_from((fieldname, ''))``
     and iteration stops at the first entry whose field name differs from
     ``fieldname``. For each entry the first ID comes from a
     ``FilePostingReader`` when the stored offset is an integer, otherwise
     from the ``[0][0]`` element of the stored value.
     """
     self._test_field(fieldname)
     posting_format = self.format(fieldname)

     entries = self.termsindex.items_from((fieldname, ''))
     for (entry_field, term_text), (totalfreq, location, postcount) in entries:
         if entry_field != fieldname:
             # Reached the entries of another field; nothing more to yield.
             break

         if isinstance(location, (int, long)):
             reader = FilePostingReader(self.postfile, location,
                                        posting_format)
             first = reader.id()
         else:
             first = location[0][0]

         yield (term_text, first)
Пример #4
0
    def postings(self, fieldname, text, scorer=None):
        """Return a matcher over the postings of the term ``fieldname:text``.

        The second element of the term's ``termsindex`` entry is either an
        integer offset, read through a ``FilePostingReader`` on
        ``self.postfile``, or a 5-tuple
        ``(docids, weights, values, maxwol, minlength)`` that is wrapped in a
        ``ListMatcher``. When ``self.segment.deleted`` is truthy the matcher
        is wrapped in a ``FilterMatcher`` with ``exclude=True``.

        Raises ``TermNotFound`` if the term is not in ``termsindex``.
        """
        try:
            location = self.termsindex[fieldname, text][1]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        posting_format = self.schema[fieldname].format
        if not isinstance(location, (int, long)):
            # Non-integer entry: the posting data is held in the entry itself.
            docids, weights, values, maxwol, minlength = location
            matcher = ListMatcher(docids, weights, values, posting_format,
                                  scorer, maxwol=maxwol, minlength=minlength)
        else:
            matcher = FilePostingReader(self.postfile, location,
                                        posting_format, scorer=scorer,
                                        fieldname=fieldname, text=text)

        removed = self.segment.deleted
        if not removed:
            return matcher
        return FilterMatcher(matcher, removed, exclude=True)
Пример #5
0
    def postings(self, fieldname, text, exclude_docs=frozenset(), scorer=None):
        """Return a matcher over the postings of the term ``fieldname:text``.

        ``exclude_docs`` is combined with ``self.segment.deleted`` (when that
        is truthy); if the resulting set is non-empty the matcher is wrapped
        in an ``ExcludeMatcher``. The underlying matcher is a
        ``FilePostingReader`` when the stored offset is an integer, otherwise
        a ``ListMatcher`` built from the stored
        ``(docids, weights, values)`` triple.

        Raises ``TermNotFound`` if the term is not in ``termsindex``.
        """
        self._test_field(fieldname)
        posting_format = self.format(fieldname)
        try:
            location = self.termsindex[(fieldname, text)][1]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        deleted = self.segment.deleted
        if deleted:
            # Deleted documents are always excluded, alongside any the
            # caller asked for.
            exclude_docs = (deleted | exclude_docs) if exclude_docs else deleted

        if not isinstance(location, (int, long)):
            docids, weights, values = location
            matcher = ListMatcher(docids, weights, values, posting_format,
                                  scorer)
        else:
            matcher = FilePostingReader(self.postfile, location,
                                        posting_format, scorer=scorer,
                                        fieldname=fieldname, text=text)

        if not exclude_docs:
            return matcher
        return ExcludeMatcher(matcher, exclude_docs)
Пример #6
0
    def postings(self, fieldname, text, scorer=None):
        """Return a matcher over the postings of the term ``fieldname:text``.

        The term's ``termsindex`` entry exposes a ``postings`` attribute. An
        integer value is used as the offset for a ``FilePostingReader`` on
        ``self.postfile``; anything else is unpacked as
        ``(docids, weights, values)`` and wrapped in a ``ListMatcher``. When
        ``self.segment.deleted`` is truthy the matcher is wrapped in a
        ``FilterMatcher`` with ``exclude=True``.

        Raises ``TermNotFound`` if the term is not in ``termsindex``.
        """
        term = (fieldname, text)
        try:
            terminfo = self.termsindex[fieldname, text]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        posting_format = self.schema[fieldname].format
        stored = terminfo.postings
        if not isinstance(stored, integer_types):
            docids, weights, values = stored
            matcher = ListMatcher(docids, weights, values, posting_format,
                                  scorer=scorer, term=term, terminfo=terminfo)
        else:
            matcher = FilePostingReader(self.postfile, stored, posting_format,
                                        scorer=scorer, term=term)

        removed = self.segment.deleted
        if not removed:
            return matcher
        return FilterMatcher(matcher, removed, exclude=True)
Пример #7
0
def roundtrip(postings, format, astype):
    """Write ``postings`` to a temporary posting file and read them back.

    Each ``(id, value)`` pair is encoded with ``format.encode`` and written
    together with the weight decoded from the encoded value. The file, named
    after ``astype``, is then reopened and the items produced by
    ``FilePostingReader.items_as(astype)`` are returned as a list.
    """
    with TempStorage("roundtrip") as storage:
        outfile = storage.create_file(astype)
        weight_of = format.decoder("weight")
        writer = FilePostingWriter(outfile, blocklimit=8)
        writer.start(format)
        for docid, value in postings:
            encoded = format.encode(value)
            writer.write(docid, weight_of(encoded), encoded, 0)
        writer.finish()
        writer.close()

        infile = storage.open_file(astype)
        reader = FilePostingReader(infile, 0, format)
        items = list(reader.items_as(astype))
        infile.close()
        return items
Пример #8
0
def test_readwrite():
    """Write frequency postings to a file and check they read back equal.

    The postings from ``make_postings()`` are written with a
    ``FilePostingWriter`` (block limit 8) and compared against the list
    produced by ``FilePostingReader.items_as("frequency")``.
    """
    with TempStorage("readwrite") as storage:
        freq_format = Frequency()
        expected = make_postings()

        outfile = storage.create_file("readwrite")
        writer = FilePostingWriter(outfile, blocklimit=8)
        writer.start(freq_format)
        for docid, freq in expected:
            writer.write(docid, float(freq), freq_format.encode(freq), 0)
        writer.finish()
        writer.close()

        infile = storage.open_file("readwrite")
        reader = FilePostingReader(infile, 0, freq_format)
        actual = list(reader.items_as("frequency"))
        assert_equal(expected, actual)
        infile.close()
Пример #9
0
    def roundtrip(self, postings, format, astype):
        """Write ``postings`` to a file named ``astype`` and read them back.

        Each ``(id, value)`` pair is written with its value encoded by
        ``format.encode``. The file is then reopened and the result of
        ``FilePostingReader.all_as(astype)`` is returned as a list. The file
        is deleted afterwards whether or not an error occurred.
        """
        outfile = self.make_file(astype)
        try:
            writer = FilePostingWriter(outfile, blocklimit=8)
            writer.start(format)
            for docid, value in postings:
                writer.write(docid, format.encode(value))
            writer.close()

            infile = self.open_file(astype)
            reader = FilePostingReader(infile, 0, format)
            items = list(reader.all_as(astype))
            reader.close()
            # The finally clause still runs before this value is returned.
            return items
        finally:
            self.delete_file(astype)
Пример #10
0
def roundtrip(postings, format, astype):
    """Return ``postings`` after a write/read cycle through a posting file.

    The pairs in ``postings`` are encoded with ``format.encode`` and written,
    with the weight decoded from each encoded value, to a file named after
    ``astype`` inside a temporary storage. The file is then reopened and read
    with ``FilePostingReader.items_as(astype)``.
    """
    with TempStorage("roundtrip") as storage:
        target = storage.create_file(astype)
        decode_weight = format.decoder("weight")
        writer = FilePostingWriter(target, blocklimit=8)
        writer.start(format)
        for docid, value in postings:
            encoded = format.encode(value)
            writer.write(docid, decode_weight(encoded), encoded, 0)
        writer.finish()
        writer.close()

        source = storage.open_file(astype)
        result = list(FilePostingReader(source, 0, format).items_as(astype))
        source.close()
        return result
Пример #11
0
 def roundtrip(self, postings, format, astype):
     """Write ``postings`` to a file named ``astype`` and read them back.

     Each ``(id, value)`` pair is written with its value encoded by
     ``format.encode``. The file is then reopened and the result of
     ``FilePostingReader.all_as(astype)`` is returned as a list. The file
     is deleted afterwards whether or not an error occurred.
     """
     outfile = self.make_file(astype)
     try:
         writer = FilePostingWriter(outfile, blocklimit=8)
         writer.start(format)
         for docid, value in postings:
             writer.write(docid, format.encode(value))
         writer.close()

         infile = self.open_file(astype)
         reader = FilePostingReader(infile, 0, format)
         items = list(reader.all_as(astype))
         reader.close()
         # The finally clause still runs before this value is returned.
         return items
     finally:
         self.delete_file(astype)
Пример #12
0
def test_readwrite():
    """Check that frequency postings survive a write/read cycle.

    Postings from ``make_postings()`` are written to a file in a temporary
    storage with a block limit of 8, then read back through
    ``FilePostingReader.items_as("frequency")`` and compared for equality.
    """
    with TempStorage("readwrite") as storage:
        freq_format = Frequency()
        written = make_postings()

        target = storage.create_file("readwrite")
        writer = FilePostingWriter(target, blocklimit=8)
        writer.start(freq_format)
        for docid, freq in written:
            encoded = freq_format.encode(freq)
            writer.write(docid, float(freq), encoded, 0)
        writer.finish()
        writer.close()

        source = storage.open_file("readwrite")
        reader = FilePostingReader(source, 0, freq_format)
        assert_equal(written, list(reader.items_as("frequency")))
        source.close()
Пример #13
0
 def test_readwrite(self):
     """Write frequency postings to a file and open a reader on it.

     The postings from ``self.make_postings()`` are written with a block
     limit of 8 and the file is reopened with a ``FilePostingReader``. The
     comparison of the read-back items is currently commented out, so this
     only checks that writing and opening do not raise. The file is deleted
     afterwards in all cases.
     """
     freq_format = Frequency(None)
     written = self.make_postings()

     outfile = self.make_file("readwrite")
     try:
         writer = FilePostingWriter(outfile, blocklimit=8)
         writer.start(freq_format)
         for docid, freq in written:
             writer.write(docid, freq_format.encode(freq))
         writer.close()

         infile = self.open_file("readwrite")
         reader = FilePostingReader(infile, 0, freq_format)
         # NOTE(review): equality check disabled in the original; reason
         # not visible here.
         #self.assertEqual(written, list(reader.items_as("frequency")))
         reader.close()
     finally:
         self.delete_file("readwrite")
Пример #14
0
    def test_readwrite(self):
        """Write frequency postings to a file and open a reader on it.

        The postings from ``self.make_postings()`` are written with a block
        limit of 8 and the file is reopened with a ``FilePostingReader``. The
        comparison of the read-back items is currently commented out, so this
        only checks that writing and opening do not raise. The file is
        deleted afterwards in all cases.
        """
        freq_format = Frequency(None)
        written = self.make_postings()

        outfile = self.make_file("readwrite")
        try:
            writer = FilePostingWriter(outfile, blocklimit=8)
            writer.start(freq_format)
            for docid, freq in written:
                writer.write(docid, freq_format.encode(freq))
            writer.close()

            infile = self.open_file("readwrite")
            reader = FilePostingReader(infile, 0, freq_format)
            # NOTE(review): equality check disabled in the original; reason
            # not visible here.
            #self.assertEqual(written, list(reader.items_as("frequency")))
            reader.close()
        finally:
            self.delete_file("readwrite")
Пример #15
0
    def vector(self, docnum, fieldid):
        """Return a posting reader over the stored vector of a document field.

        ``fieldid`` is converted to a field number with
        ``self.schema.to_number``. The offset is looked up in
        ``self.vectortable`` under ``(docnum, fieldnum)``; a missing key
        propagates as ``KeyError``. The reader is created on
        ``self.vpostfile`` with ``stringids=True``.
        """
        self._open_vectors()
        fields = self.schema
        fieldnum = fields.to_number(fieldid)
        vector_format = fields[fieldnum].vector

        key = (docnum, fieldnum)
        start = self.vectortable[key]
        reader = FilePostingReader(self.vpostfile, start, vector_format,
                                   stringids=True)
        return reader
Пример #16
0
    def vector(self, docnum, fieldname):
        """Return a posting reader over the stored vector of a document field.

        Raises ``TermNotFound`` when ``fieldname`` is not in the schema, and
        a plain ``Exception`` when the field has no vector format or when
        ``self.vectorindex`` has no entry for ``(docnum, fieldname)``. The
        vector files are only opened once the field is known to store
        vectors. The reader is created on ``self.vpostfile`` with
        ``stringids=True``.
        """
        fields = self.schema
        if fieldname not in fields:
            raise TermNotFound("No  field %r" % fieldname)

        vector_format = fields[fieldname].vector
        if not vector_format:
            raise Exception("No vectors are stored for field %r" % fieldname)

        self._open_vectors()
        key = (docnum, fieldname)
        start = self.vectorindex.get(key)
        if start is None:
            message = "No vector found for document %s field %r" % key
            raise Exception(message)

        reader = FilePostingReader(self.vpostfile, start, vector_format,
                                   stringids=True)
        return reader
Пример #17
0
    def postings(self, fieldid, text, exclude_docs=None):
        """Return a posting reader for the term ``fieldid:text``.

        ``fieldid`` is converted to a field number with
        ``self.schema.to_number`` and the term's offset is read from
        ``self.termtable``. Documents in ``self.segment.deleted`` are added
        to ``exclude_docs``; when the resulting set is non-empty the reader
        is wrapped in ``Exclude``.

        Raises ``TermNotFound`` if the term is not in ``termtable``.
        """
        fields = self.schema
        fieldnum = fields.to_number(fieldid)
        posting_format = fields[fieldnum].format

        try:
            # Only the offset is needed from the (totalfreq, offset,
            # postcount) entry.
            _totalfreq, location, _postcount = self.termtable[(fieldnum, text)]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldid, text))

        deleted = self.segment.deleted
        if deleted:
            exclude_docs = (deleted | exclude_docs) if exclude_docs else deleted

        reader = FilePostingReader(self.postfile, location, posting_format)
        if not exclude_docs:
            return reader
        return Exclude(reader, exclude_docs)
Пример #18
0
    def postings(self, fieldid, text, exclude_docs=frozenset()):
        """Return a posting reader for the term ``fieldid:text``.

        ``fieldid`` is converted to a field number with
        ``self.schema.to_number`` and the term's offset is read from
        ``self.termtable``. Documents in ``self.segment.deleted`` are added
        to ``exclude_docs``; when the resulting set is non-empty the reader
        is wrapped in ``Exclude``. The posting file is opened on first use
        and kept on ``self.postfile``.

        Raises ``TermNotFound`` if the term is not in ``termtable``.
        """
        fields = self.schema
        fieldnum = fields.to_number(fieldid)
        posting_format = fields[fieldnum].format

        try:
            # Only the offset is needed from the (totalfreq, offset,
            # postcount) entry.
            _totalfreq, location, _postcount = self.termtable[(fieldnum, text)]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldid, text))

        deleted = self.segment.deleted
        if deleted:
            exclude_docs = (deleted | exclude_docs) if exclude_docs else deleted

        if not self.postfile:
            # Open the segment's posting file lazily, without memory mapping.
            filename = self.segment.posts_filename
            self.postfile = self.storage.open_file(filename, mapped=False)

        reader = FilePostingReader(self.postfile, location, posting_format)
        if not exclude_docs:
            return reader
        return Exclude(reader, exclude_docs)
Пример #19
0
    def postings(self, fieldname, text, exclude_docs=frozenset(), scorer=None):
        """Return a posting reader for the term ``fieldname:text``.

        The second element of the term's ``termsindex`` entry is used as the
        offset into ``self.postfile``. Documents in ``self.segment.deleted``
        are added to ``exclude_docs``; when the resulting set is non-empty
        the reader is wrapped in an ``ExcludeMatcher``.

        Raises ``TermNotFound`` if the term is not in ``termsindex``.
        """
        self._test_field(fieldname)
        posting_format = self.format(fieldname)
        try:
            location = self.termsindex[(fieldname, text)][1]
        except KeyError:
            raise TermNotFound("%s:%r" % (fieldname, text))

        deleted = self.segment.deleted
        if deleted:
            exclude_docs = (deleted | exclude_docs) if exclude_docs else deleted

        reader = FilePostingReader(self.postfile, location, posting_format,
                                   scorer=scorer, fieldname=fieldname,
                                   text=text)
        if not exclude_docs:
            return reader
        return ExcludeMatcher(reader, exclude_docs)