def first_id(self, fieldname, text):
    self._test_field(fieldname)
    format = self.format(fieldname)
    # The second item of the term info is either an offset into the posting
    # file or an inlined list of postings.
    offset = self.termsindex[(fieldname, text)][1]
    if isinstance(offset, (int, long)):
        postreader = FilePostingReader(self.postfile, offset, format)
        return postreader.id()
    else:
        return offset[0][0]
def test_huge_postfile():
    # Write a dummy value past the 5 GB mark to check that posting offsets
    # larger than 32 bits survive a write/read round trip.
    with TempStorage("hugeindex") as st:
        pf = st.create_file("test.pst")
        gb5 = 5 * 1024 * 1024 * 1024
        pf.seek(gb5)
        pf.write("\x00\x00\x00\x00")
        assert_equal(pf.tell(), gb5 + 4)

        fpw = FilePostingWriter(pf)
        format = formats.Frequency(None)
        offset = fpw.start(format)
        for i in xrange(10):
            fpw.write(i, float(i), struct.pack("!I", i), 10)
        posttotal = fpw.finish()
        assert_equal(posttotal, 10)
        fpw.close()

        pf = st.open_file("test.pst")
        pfr = FilePostingReader(pf, offset, format)
        i = 0
        while pfr.is_active():
            assert_equal(pfr.id(), i)
            assert_equal(pfr.weight(), float(i))
            assert_equal(pfr.value(), struct.pack("!I", i))
            pfr.next()
            i += 1
        pf.close()
def first_ids(self, fieldname):
    self._test_field(fieldname)
    format = self.format(fieldname)
    for (fn, t), (totalfreq, offset, postcount) in self.termsindex.items_from((fieldname, '')):
        if fn != fieldname:
            break
        if isinstance(offset, (int, long)):
            postreader = FilePostingReader(self.postfile, offset, format)
            id = postreader.id()
        else:
            id = offset[0][0]
        yield (t, id)
def postings(self, fieldname, text, scorer=None):
    try:
        offset = self.termsindex[fieldname, text][1]
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldname, text))

    format = self.schema[fieldname].format
    if isinstance(offset, (int, long)):
        # An integer is an offset into the posting file.
        postreader = FilePostingReader(self.postfile, offset, format,
                                       scorer=scorer, fieldname=fieldname,
                                       text=text)
    else:
        # Otherwise the postings are inlined in the term index.
        docids, weights, values, maxwol, minlength = offset
        postreader = ListMatcher(docids, weights, values, format, scorer,
                                 maxwol=maxwol, minlength=minlength)

    deleted = self.segment.deleted
    if deleted:
        postreader = FilterMatcher(postreader, deleted, exclude=True)

    return postreader
def postings(self, fieldname, text, exclude_docs=frozenset(), scorer=None):
    self._test_field(fieldname)
    format = self.format(fieldname)
    try:
        offset = self.termsindex[(fieldname, text)][1]
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldname, text))

    if self.segment.deleted and exclude_docs:
        exclude_docs = self.segment.deleted | exclude_docs
    elif self.segment.deleted:
        exclude_docs = self.segment.deleted

    if isinstance(offset, (int, long)):
        postreader = FilePostingReader(self.postfile, offset, format,
                                       scorer=scorer, fieldname=fieldname,
                                       text=text)
    else:
        docids, weights, values = offset
        postreader = ListMatcher(docids, weights, values, format, scorer)

    if exclude_docs:
        postreader = ExcludeMatcher(postreader, exclude_docs)

    return postreader
def postings(self, fieldname, text, scorer=None):
    try:
        terminfo = self.termsindex[fieldname, text]
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldname, text))

    format = self.schema[fieldname].format
    postings = terminfo.postings
    if isinstance(postings, integer_types):
        # An integer is an offset into the posting file.
        postreader = FilePostingReader(self.postfile, postings, format,
                                       scorer=scorer, term=(fieldname, text))
    else:
        # Otherwise the postings are inlined in the term info.
        docids, weights, values = postings
        postreader = ListMatcher(docids, weights, values, format,
                                 scorer=scorer, term=(fieldname, text),
                                 terminfo=terminfo)

    deleted = self.segment.deleted
    if deleted:
        postreader = FilterMatcher(postreader, deleted, exclude=True)

    return postreader
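# A hedged usage sketch, not part of the source: a matcher returned by one of
# the postings() methods above can be walked with the same
# is_active()/id()/weight()/next() protocol used in test_huge_postfile above.
# The `reader` argument and the generator itself are hypothetical.
def iter_postings(reader, fieldname, text):
    m = reader.postings(fieldname, text)
    while m.is_active():
        yield m.id(), m.weight()
        m.next()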
def roundtrip(postings, format, astype):
    with TempStorage("roundtrip") as st:
        postfile = st.create_file(astype)
        getweight = format.decoder("weight")
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for id, value in postings:
            v = format.encode(value)
            fpw.write(id, getweight(v), v, 0)
        fpw.finish()
        fpw.close()

        postfile = st.open_file(astype)
        fpr = FilePostingReader(postfile, 0, format)
        readback = list(fpr.items_as(astype))
        postfile.close()
        return readback
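# A minimal sketch, not from the source: assuming the Frequency format used in
# test_readwrite below, a caller could use roundtrip() above to check that
# postings survive a write/read cycle. The literal postings list and the test
# name are hypothetical.
def test_frequency_roundtrip():
    postings = [(1, 2), (5, 1), (7, 3), (12, 6)]
    assert_equal(postings, roundtrip(postings, Frequency(), "frequency"))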
def test_readwrite():
    with TempStorage("readwrite") as st:
        format = Frequency()
        postings = make_postings()

        postfile = st.create_file("readwrite")
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for id, freq in postings:
            fpw.write(id, float(freq), format.encode(freq), 0)
        fpw.finish()
        fpw.close()

        postfile = st.open_file("readwrite")
        fpr = FilePostingReader(postfile, 0, format)
        assert_equal(postings, list(fpr.items_as("frequency")))
        postfile.close()
def roundtrip(self, postings, format, astype):
    postfile = self.make_file(astype)
    readback = None
    try:
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for id, value in postings:
            fpw.write(id, format.encode(value))
        fpw.close()

        postfile = self.open_file(astype)
        fpr = FilePostingReader(postfile, 0, format)
        readback = list(fpr.all_as(astype))
        fpr.close()
    finally:
        self.delete_file(astype)
    return readback
def test_readwrite(self):
    format = Frequency(None)
    postings = self.make_postings()

    postfile = self.make_file("readwrite")
    try:
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for id, freq in postings:
            fpw.write(id, format.encode(freq))
        fpw.close()

        postfile = self.open_file("readwrite")
        fpr = FilePostingReader(postfile, 0, format)
        #self.assertEqual(postings, list(fpr.items_as("frequency")))
        fpr.close()
    finally:
        self.delete_file("readwrite")
def vector(self, docnum, fieldid):
    self._open_vectors()
    schema = self.schema
    fieldnum = schema.to_number(fieldid)
    vformat = schema[fieldnum].vector
    offset = self.vectortable[(docnum, fieldnum)]
    return FilePostingReader(self.vpostfile, offset, vformat, stringids=True)
def vector(self, docnum, fieldname):
    if fieldname not in self.schema:
        raise TermNotFound("No field %r" % fieldname)
    vformat = self.schema[fieldname].vector
    if not vformat:
        raise Exception("No vectors are stored for field %r" % fieldname)

    self._open_vectors()
    offset = self.vectorindex.get((docnum, fieldname))
    if offset is None:
        raise Exception("No vector found for document"
                        " %s field %r" % (docnum, fieldname))

    return FilePostingReader(self.vpostfile, offset, vformat, stringids=True)
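# A hedged sketch, not part of the source: because the vector reader above is
# opened with stringids=True, its ids are term texts rather than document
# numbers, so a stored vector can be collapsed into a {term: weight} dict. The
# helper name and `reader` argument are hypothetical.
def vector_as_dict(reader, docnum, fieldname):
    vreader = reader.vector(docnum, fieldname)
    d = {}
    while vreader.is_active():
        d[vreader.id()] = vreader.weight()
        vreader.next()
    return d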
def postings(self, fieldid, text, exclude_docs=None):
    schema = self.schema
    fieldnum = schema.to_number(fieldid)
    format = schema[fieldnum].format

    try:
        totalfreq, offset, postcount = self.termtable[(fieldnum, text)]  #@UnusedVariable
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldid, text))

    if self.segment.deleted and exclude_docs:
        exclude_docs = self.segment.deleted | exclude_docs
    elif self.segment.deleted:
        exclude_docs = self.segment.deleted

    postreader = FilePostingReader(self.postfile, offset, format)
    if exclude_docs:
        postreader = Exclude(postreader, exclude_docs)
    return postreader
def postings(self, fieldid, text, exclude_docs=frozenset()):
    schema = self.schema
    fieldnum = schema.to_number(fieldid)
    format = schema[fieldnum].format

    try:
        totalfreq, offset, postcount = self.termtable[(fieldnum, text)]  #@UnusedVariable
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldid, text))

    if self.segment.deleted and exclude_docs:
        exclude_docs = self.segment.deleted | exclude_docs
    elif self.segment.deleted:
        exclude_docs = self.segment.deleted

    if not self.postfile:
        self.postfile = self.storage.open_file(self.segment.posts_filename,
                                               mapped=False)

    postreader = FilePostingReader(self.postfile, offset, format)
    if exclude_docs:
        postreader = Exclude(postreader, exclude_docs)
    return postreader
def postings(self, fieldname, text, exclude_docs=frozenset(), scorer=None):
    self._test_field(fieldname)
    format = self.format(fieldname)
    try:
        offset = self.termsindex[(fieldname, text)][1]
    except KeyError:
        raise TermNotFound("%s:%r" % (fieldname, text))

    if self.segment.deleted and exclude_docs:
        exclude_docs = self.segment.deleted | exclude_docs
    elif self.segment.deleted:
        exclude_docs = self.segment.deleted

    postreader = FilePostingReader(self.postfile, offset, format,
                                   scorer=scorer, fieldname=fieldname,
                                   text=text)
    if exclude_docs:
        postreader = ExcludeMatcher(postreader, exclude_docs)
    return postreader