from whoosh.compat import b, izip, xrange
from whoosh.filedb.filestore import RamStorage


def _rt(c, values, default):
    # Continuous: write the values at consecutive document numbers
    st = RamStorage()
    f = st.create_file("test1")
    f.write(b("hello"))
    w = c.writer(f)
    for docnum, v in enumerate(values):
        w.add(docnum, v)
    w.finish(len(values))
    length = f.tell() - 5
    f.close()

    f = st.open_file("test1")
    r = c.reader(f, 5, length, len(values))
    assert values == list(r)
    for x in range(len(values)):
        assert values[x] == r[x]
    f.close()

    # Sparse: write the values at every seventh document number and expect
    # the default value everywhere else
    doccount = len(values) * 7 + 15
    target = [default] * doccount

    f = st.create_file("test2")
    f.write(b("hello"))
    w = c.writer(f)
    for docnum, v in izip(xrange(10, doccount, 7), values):
        target[docnum] = v
        w.add(docnum, v)
    w.finish(doccount)
    length = f.tell() - 5
    f.close()

    f = st.open_file("test2")
    r = c.reader(f, 5, length, doccount)
    assert target == list(r)
    for x in range(doccount):
        assert target[x] == r[x]

    lr = r.load()
    assert target == list(lr)
    f.close()
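
# Example (a minimal sketch, not from the original suite): how _rt might be
# driven for a single column type. Assumes whoosh.columns is importable and
# that VarBytesColumn is the column implementation under test; the sample
# values are illustrative.
def test_roundtrip_varbytes():
    from whoosh import columns
    _rt(columns.VarBytesColumn(),
        [b("a"), b("ccc"), b("bbb"), b("e"), b("dd")], b(""))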

def key_to_name(self, key):
    return tuple(catter.key_to_name(keypart)
                 for catter, keypart in izip(self.catters, key))
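
# Example (a minimal sketch with hypothetical names): key_to_name converts a
# composite sort key part-by-part through the matching sub-categorizers. The
# TitleCatter class below is invented for illustration; the builtin zip
# stands in for the compat izip.
class TitleCatter(object):
    def key_to_name(self, keypart):
        return keypart.title()

# With catters = (TitleCatter(), TitleCatter()), the composite key
# ("foo", "bar") maps to ("Foo", "Bar"):
assert tuple(c.key_to_name(k)
             for c, k in zip((TitleCatter(), TitleCatter()),
                             ("foo", "bar"))) == ("Foo", "Bar")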

def __eq__(self, other):
    # Note that izip stops at the end of the shorter operand, so operands
    # that differ only in length compare equal on their common prefix
    for a, b in izip(self, other):
        if a != b:
            return False
    return True
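
# A hypothetical length-checked variant (sketch, assuming both operands
# support len(); the original above only compares the common prefix):
def _eq_strict(a, b):
    return len(a) == len(b) and all(x == y for x, y in zip(a, b))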

def all_doc_ids(self):
    for r, offset in izip(self._readers, self._doc_offsets):
        for docnum in r.all_doc_ids():
            yield docnum + offset

def deleted_docs(self):
    for r, offset in izip(self._readers, self._doc_offsets):
        for docnum in r.deleted_docs():
            yield docnum + offset

def __iter__(self):
    for idset, offset in izip(self.idsets, self.offsets):
        for docnum in idset:
            yield docnum + offset
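
# Example (a minimal sketch with made-up data): the three generators above
# all use the same pattern, adding a per-segment offset to translate local
# document numbers into one global numbering. zip stands in for izip here.
idsets = [[0, 2], [1, 3]]
offsets = [0, 100]
merged = [docnum + offset
          for idset, offset in zip(idsets, offsets)
          for docnum in idset]
assert merged == [0, 2, 101, 103]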

def all_items(self):
    values = self._values
    # If no values were stored, pair every id with an empty string
    if values is None:
        values = repeat('')
    return izip(self._ids, values)
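
# Example (sketch): itertools.repeat('') yields '' indefinitely, so the
# pairing above works for any number of ids without building a list:
from itertools import repeat
assert list(zip([1, 2, 3], repeat(''))) == [(1, ''), (2, ''), (3, '')]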

import random
from collections import deque
from itertools import permutations

from whoosh import fields
from whoosh.compat import izip, u, xrange
from whoosh.util.testing import TempIndex


def _do_basic(writerclass):
    # Create the domain data

    # List of individual words added to the index
    words = []
    # List of string values added to the index
    docs = []
    # A ring buffer for creating string values
    buf = deque()
    for ls in permutations(u("abcd")):
        word = "".join(ls)
        # Remember this word is in the index (to check lexicon)
        words.append(word)

        # Add this word on to the end, pop the first word off to create N
        # word documents where N <= 10
        buf.append(word)
        if len(buf) > 10:
            buf.popleft()

        # Create a copy of the buffer and shuffle it to create a document
        # value and add it to the list of document values
        doc = list(buf)
        random.shuffle(doc)
        docs.append(" ".join(doc))
    # Shuffle the list of document values
    random.shuffle(docs)

    schema = fields.Schema(text=fields.TEXT(stored=True, spelling=True,
                                            vector=True),
                           row=fields.NUMERIC(stored=True))

    with TempIndex(schema, storage_debug=True) as ix:
        # Add the domain data to the index
        with writerclass(ix, procs=3) as w:
            for i, value in enumerate(docs):
                w.add_document(text=value, row=i)

        with ix.searcher() as s:
            r = s.reader()

            # Check the lexicon
            for word, term in izip(words, r.field_terms("text")):
                assert word == term
            # Check the doc count
            assert r.doc_count_all() == len(docs)

            # Check the word graph
            assert r.has_word_graph("text")
            flat = [w.decode("latin1")
                    for w in r.word_graph("text").flatten()]
            assert flat == words

            # Check there are lengths
            total = sum(r.doc_field_length(docnum, "text", 0)
                        for docnum in xrange(r.doc_count_all()))
            assert total > 0

            # Check per-doc info
            for i, value in enumerate(docs):
                pieces = value.split()
                docnum = s.document_number(row=i)

                # Check stored value
                sv = r.stored_fields(docnum)
                assert sv["text"] == value

                # Check vectors
                vr = r.vector(docnum, "text")
                # Get the terms and positions from the vector matcher
                iv = list(vr.items_as("positions"))
                # What the vector should look like
                ov = sorted((text, [i]) for i, text in enumerate(pieces))
                assert iv == ov

                # Check field length
                assert r.doc_field_length(docnum, "text") == len(pieces)
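
# Example (a minimal sketch, not from the original suite): _do_basic is
# parameterized on the writer class so the same assertions can run against
# different writers. Assuming whoosh.multiproc.MpWriter is available as the
# multiprocessing writer:
def test_basic_multi():
    from whoosh.multiproc import MpWriter
    _do_basic(MpWriter)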