def test_compatibility():
    from whoosh.scoring import Weighting

    # This is the old way of doing a custom weighting model; check that
    # subclassing Weighting with score()/use_final/final() still works.
    class LegacyWeighting(Weighting):
        use_final = True

        def score(self, searcher, fieldname, text, docnum, weight):
            return weight + 0.5

        def final(self, searcher, docnum, score):
            return score * 1.5

    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    vocab = "alfa bravo charlie delta".split()
    for combo in permutations(vocab, 3):
        writer.add_document(text=u(" ").join(combo))
    writer.commit()

    searcher = ix.searcher(weighting=LegacyWeighting())
    hits = searcher.search(query.Term("text", u("bravo")))
    # score() gives 1.0 + 0.5, final() multiplies by 1.5 -> 2.25
    assert hits.score(0) == 2.25
def test_boost_phrase():
    schema = fields.Schema(title=fields.TEXT(field_boost=5.0, stored=True),
                           text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    vocab = u("alfa bravo charlie delta").split()

    writer = ix.writer()
    for combo in permutations(vocab):
        value = u(" ").join(combo)
        writer.add_document(title=value, text=value)
    writer.commit()

    q = query.Or([query.Term("title", u("alfa")),
                  query.Term("title", u("bravo")),
                  query.Phrase("text", [u("bravo"), u("charlie"),
                                        u("delta")])])

    # Recursively multiply the boost of every Phrase sub-query by 1000.
    def boost_phrases(q):
        if isinstance(q, query.Phrase):
            q.boost *= 1000.0
            return q
        return q.apply(boost_phrases)

    q = boost_phrases(q)

    with ix.searcher() as s:
        results = s.search(q, limit=None)
        for hit in results:
            # Phrase matches should now dominate the scoring.
            if "bravo charlie delta" in hit["title"]:
                assert hit.score > 100.0
def test_excludematcher():
    schema = fields.Schema(content=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    vocab = ("alfa", "bravo", "charlie", "delta")

    # Write three identical, unmerged segments.
    for _ in xrange(3):
        writer = ix.writer()
        for combo in permutations(vocab):
            writer.add_document(content=u(" ").join(combo))
        writer.commit(merge=False)

    # Delete a few documents without merging so the matcher has to skip
    # deleted docnums.
    writer = ix.writer()
    writer.delete_document(5)
    writer.delete_document(10)
    writer.delete_document(28)
    writer.commit(merge=False)

    q = Term("content", "bravo")
    with ix.searcher() as s:
        m = q.matcher(s)
        while m.is_active():
            tokens = s.stored_fields(m.id())["content"].split()
            # Every span reported by the matcher must point at "bravo".
            for span in m.spans():
                assert tokens[span.start] == "bravo"
            m.next()
def test_deleteall():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "deleteall") as ix:
        w = ix.writer()
        vocab = u("alfa bravo charlie delta echo").split()
        for i, combo in enumerate(permutations(vocab)):
            w.add_document(text=u(" ").join(combo))
            # Commit every tenth document so the index has many segments.
            if not i % 10:
                w.commit()
                w = ix.writer()
        w.commit()

        # This is just a test, don't use this method to delete all docs IRL!
        doccount = ix.doc_count_all()
        w = ix.writer()
        for docnum in xrange(doccount):
            w.delete_document(docnum)
        w.commit()

        with ix.searcher() as s:
            r = s.search(query.Or([query.Term("text", u("alfa")),
                                   query.Term("text", u("bravo"))]))
            assert len(r) == 0

        ix.optimize()
        assert ix.doc_count_all() == 0

        with ix.reader() as r:
            assert list(r) == []
def test_deleteall():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "deleteall") as ix:
        writer = ix.writer()
        vocab = u("alfa bravo charlie delta echo").split()
        for docno, combo in enumerate(permutations(vocab)):
            writer.add_document(text=u(" ").join(combo))
            # Periodic commits create a multi-segment index.
            if not docno % 10:
                writer.commit()
                writer = ix.writer()
        writer.commit()

        # This is just a test, don't use this method to delete all docs IRL!
        doccount = ix.doc_count_all()
        writer = ix.writer()
        for docnum in xrange(doccount):
            writer.delete_document(docnum)
        writer.commit()

        with ix.searcher() as s:
            hits = s.search(
                query.Or([
                    query.Term("text", u("alfa")),
                    query.Term("text", u("bravo"))
                ]))
            assert len(hits) == 0

        ix.optimize()
        assert ix.doc_count_all() == 0

        with ix.reader() as reader:
            assert list(reader) == []
def test_spelling_field_order():
    ana = analysis.StemmingAnalyzer()
    # Interleave stemmed and plain fields to exercise per-field spelling
    # handling regardless of field order.
    schema = fields.Schema(a=fields.TEXT, b=fields.TEXT(analyzer=ana),
                           c=fields.TEXT, d=fields.TEXT(analyzer=ana),
                           e=fields.TEXT(analyzer=ana), f=fields.TEXT)
    ix = RamStorage().create_index(schema)
    vocab = u("alfa bravo charlie delta").split()

    w = ix.writer()
    for combo in permutations(vocab):
        value = " ".join(combo)
        w.add_document(a=value, b=value, c=value, d=value, e=value, f=value)
    w.commit()
def test_strings_dfa():
    strings = "able alfa alpha apple bar bear beat boom boot".split()
    dfa = fsa.strings_dfa(strings)
    # generate_all() must reproduce exactly the words the DFA was built from.
    output = list(dfa.generate_all())
    assert output == strings

    alphabet = "abcd"
    words = set()
    # All permutations of every prefix of the alphabet.
    for size in xrange(1, len(alphabet) + 1):
        words.update("".join(p) for p in permutations(alphabet[:size]))
    words = sorted(words)
    dfa = fsa.strings_dfa(words)
    assert list(dfa.generate_all()) == words
def test_filtered_grouped():
    schema = fields.Schema(tag=fields.ID, text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    vocab = u("alfa bravo charlie delta echo foxtrot").split()

    with ix.writer() as w:
        for i, combo in enumerate(permutations(vocab, 3)):
            # Cycle the tag through "0", "1", "2".
            w.add_document(tag=u(str(i % 3)), text=u(" ").join(combo))

    with ix.searcher() as s:
        f = query.And([query.Term("text", "charlie"),
                       query.Term("text", "delta")])
        r = s.search(query.Every(), filter=f, groupedby="tag", limit=None)
        assert len(r) == 24
def test_current_terms():
    vocab = u("alfa bravo charlie delta").split()
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    for combo in permutations(vocab, 3):
        w.add_document(text=" ".join(combo), _stored_text=combo)
    w.commit()

    with ix.searcher() as s:
        q = query.And([query.Term("text", "alfa"),
                       query.Term("text", "charlie")])
        m = q.matcher(s)
        while m.is_active():
            # Both query terms must be reported as matching on every doc
            # the And matcher lands on.
            assert sorted(m.matching_terms()) == [("text", b("alfa")),
                                                  ("text", b("charlie"))]
            m.next()
def test_stability():
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    vocab = u("alfa bravo charlie delta").split()

    w = ix.writer()
    for combo in permutations(vocab, 3):
        w.add_document(text=u(" ").join(combo))
    w.commit()

    with ix.searcher() as s:
        q = query.Term("text", u("bravo"))
        last = []
        for i in xrange(s.doc_frequency("text", u("bravo"))):
            # Only un-optimized results are stable
            r = s.search(q, limit=i + 1, optimize=False)
            docnums = [hit.docnum for hit in r]
            # Growing the limit by one may only append to the ranking.
            assert docnums[:-1] == last
            last = docnums
def test_phrase_order():
    tfield = fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer())
    schema = fields.Schema(text=tfield)
    storage = RamStorage()
    ix = storage.create_index(schema)

    writer = ix.writer()
    for combo in permutations(["ape", "bay", "can", "day"], 4):
        writer.add_document(text=u(" ").join(combo))
    writer.commit()

    with ix.searcher() as s:
        def result(q):
            # Collect the stored text of every hit, sorted for comparison.
            r = s.search(q, limit=None, sortedby=None)
            return sorted([d['text'] for d in r])

        q = query.Phrase("text", ["bay", "can", "day"])
        # Only documents where the three words are adjacent and in order.
        assert_equal(result(q), [u('ape bay can day'), u('bay can day ape')])
def test_lengths2():
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    expected = 0
    # Three identical, unmerged segments.
    for _ in xrange(3):
        w = ix.writer()
        for combo in permutations(u("alfa bravo charlie").split()):
            if "bravo" in combo and "charlie" in combo:
                expected += 1
            w.add_document(text=u(" ").join(combo))
        w.commit(merge=False)

    with ix.searcher() as s:
        q = query.Or([query.Term("text", u("bravo")),
                      query.Term("text", u("charlie"))])
        r = s.search(q, limit=None)
        assert len(r) == expected
        # len() reports the total found count even when limit is smaller.
        r = s.search(q, limit=3)
        assert len(r) == expected
def get_index():
    global _ix

    # Build the shared test index lazily and cache it at module level.
    if _ix is not None:
        return _ix

    charfield = fields.FieldType(formats.Characters(),
                                 analysis.SimpleAnalyzer(),
                                 scorable=True, stored=True)
    schema = fields.Schema(text=charfield)
    st = RamStorage()
    _ix = st.create_index(schema)

    w = _ix.writer()
    for combo in permutations(domain, 4):
        w.add_document(text=u(" ").join(combo), _stored_text=combo)
    w.commit()

    return _ix
def test_phrase_multi():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    vocab = u("alfa bravo charlie delta echo").split()

    w = None
    for i, combo in enumerate(permutations(vocab)):
        if w is None:
            w = ix.writer()
        w.add_document(id=i, text=u(" ").join(combo))
        # Commit every thirtieth document, leaving the writer closed.
        if not i % 30:
            w.commit()
            w = None
    if w is not None:
        w.commit()

    with ix.searcher() as s:
        q = query.Phrase("text", ["alfa", "bravo"])
        # Just check that a phrase search over many segments runs cleanly.
        _ = s.search(q)
def test_sorted_extend():
    from whoosh import sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           keywords=fields.TEXT,
                           num=fields.NUMERIC(stored=True, sortable=True))
    vocab = u"alfa bravo charlie delta echo foxtrot golf hotel india".split()
    keys = u"juliet kilo lima november oskar papa quebec romeo".split()

    combined = 0
    tcount = 0
    kcount = 0
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for i, words in enumerate(permutations(vocab, 3)):
                key = keys[i % (len(vocab) - 1)]
                # Track how many documents each query should match.
                if "bravo" in words:
                    tcount += 1
                if key == "kilo":
                    kcount += 1
                if "bravo" in words or key == "kilo":
                    combined += 1
                w.add_document(title=u" ".join(words), keywords=key, num=i)

        with ix.searcher() as s:
            facet = sorting.MultiFacet([
                sorting.FieldFacet("num", reverse=True),
                sorting.ScoreFacet()
            ])
            r1 = s.search(query.Term("title", "bravo"), limit=None,
                          sortedby=facet)
            r2 = s.search(query.Term("keywords", "kilo"), limit=None,
                          sortedby=facet)
            assert len(r1) == tcount
            assert len(r2) == kcount
            # Extending one sorted result set with another keeps every hit.
            r1.extend(r2)
            assert len(r1) == combined
def test_multisegment():
    from whoosh.filedb.multiproc import MpWriter

    schema = fields.Schema(a=fields.TEXT(stored=True, spelling=True,
                                         vector=True))
    vocab = u("alfa bravo charlie delta echo").split()
    with TempIndex(schema) as ix:
        with ix.writer(procs=3, multisegment=True, batchsize=10) as w:
            # procs/multisegment should select the multiprocessing writer.
            assert_equal(w.__class__, MpWriter)
            assert w.multisegment
            for combo in permutations(vocab, 3):
                w.add_document(a=" ".join(combo))

        # One segment per worker process.
        assert_equal(len(ix._segments()), 3)

        with ix.searcher() as s:
            for word in vocab:
                r = s.search(query.Term("a", word))
                for hit in r:
                    assert word in hit["a"].split()
def test_multisegment():
    check_multi()
    from whoosh.multiproc import MpWriter

    schema = fields.Schema(a=fields.TEXT(stored=True, spelling=True,
                                         vector=True))
    vocab = u("alfa bravo charlie delta echo").split()
    with TempIndex(schema) as ix:
        with ix.writer(procs=3, multisegment=True, batchsize=10) as w:
            # procs/multisegment should select the multiprocessing writer.
            assert w.__class__ == MpWriter
            assert w.multisegment
            for combo in permutations(vocab, 3):
                w.add_document(a=u(" ").join(combo))

        # One segment per worker process.
        assert len(ix._segments()) == 3

        with ix.searcher() as s:
            for word in vocab:
                r = s.search(query.Term("a", word))
                for hit in r:
                    assert word in hit["a"].split()
def test_sorted_extend():
    from whoosh import sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           keywords=fields.TEXT,
                           num=fields.NUMERIC(stored=True, sortable=True))
    title_words = u"alfa bravo charlie delta echo foxtrot golf hotel india".split()
    keys = u"juliet kilo lima november oskar papa quebec romeo".split()

    combined = 0
    tcount = 0
    kcount = 0
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for i, words in enumerate(permutations(title_words, 3)):
                key = keys[i % (len(title_words) - 1)]
                # Count the expected hits for each of the two queries and
                # for their union.
                if "bravo" in words:
                    tcount += 1
                if key == "kilo":
                    kcount += 1
                if "bravo" in words or key == "kilo":
                    combined += 1
                w.add_document(title=u" ".join(words), keywords=key, num=i)

        with ix.searcher() as s:
            facet = sorting.MultiFacet([sorting.FieldFacet("num",
                                                           reverse=True),
                                        sorting.ScoreFacet()])
            r1 = s.search(query.Term("title", "bravo"), limit=None,
                          sortedby=facet)
            r2 = s.search(query.Term("keywords", "kilo"), limit=None,
                          sortedby=facet)
            assert len(r1) == tcount
            assert len(r2) == kcount
            # extend() should append r2's hits onto r1.
            r1.extend(r2)
            assert len(r1) == combined
def test_resultspage():
    schema = fields.Schema(id=fields.STORED, content=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    vocab = ("alfa", "bravo", "bravo", "charlie", "delta")

    w = ix.writer()
    for i, combo in enumerate(permutations(vocab, 3)):
        w.add_document(id=text_type(i), content=u(" ").join(combo))
    w.commit()

    with ix.searcher() as s:
        q = query.Term("content", u("bravo"))
        tops = list(s.search(q, limit=10))

        page = s.search_page(q, 1, pagelen=5)
        assert page.scored_length() == 5
        assert list(page) == tops[0:5]
        assert page[10:] == []

        page = s.search_page(q, 2, pagelen=5)
        assert list(page) == tops[5:10]

        page = s.search_page(q, 1, pagelen=10)
        assert len(page) == 54
        assert page.pagecount == 6

        page = s.search_page(q, 6, pagelen=10)
        assert len(list(page)) == 4
        assert page.is_last_page()

        # Page numbers are 1-based...
        with pytest.raises(ValueError):
            s.search_page(q, 0)
        # ...and past-the-end page numbers clamp to the last page.
        assert s.search_page(q, 10).pagenum == 6

        page = s.search_page(query.Term("content", "glonk"), 1)
        assert len(page) == 0
        assert page.is_last_page()
def test_resultspage():
    schema = fields.Schema(id=fields.STORED, content=fields.TEXT)
    ix = RamStorage().create_index(schema)
    vocab = ("alfa", "bravo", "bravo", "charlie", "delta")

    w = ix.writer()
    for i, combo in enumerate(permutations(vocab, 3)):
        w.add_document(id=text_type(i), content=u(" ").join(combo))
    w.commit()

    with ix.searcher() as s:
        q = query.Term("content", u("bravo"))
        tops = list(s.search(q, limit=10))

        page = s.search_page(q, 1, pagelen=5)
        assert_equal(page.scored_length(), 5)
        assert_equal(list(page), tops[0:5])
        assert_equal(page[10:], [])

        page = s.search_page(q, 2, pagelen=5)
        assert_equal(list(page), tops[5:10])

        page = s.search_page(q, 1, pagelen=10)
        assert_equal(len(page), 54)
        assert_equal(page.pagecount, 6)

        page = s.search_page(q, 6, pagelen=10)
        assert_equal(len(list(page)), 4)
        assert page.is_last_page()

        # Page numbers outside 1..pagecount raise ValueError in this version.
        assert_raises(ValueError, s.search_page, q, 0)
        assert_raises(ValueError, s.search_page, q, 7)

        page = s.search_page(query.Term("content", "glonk"), 1)
        assert_equal(len(page), 0)
        assert page.is_last_page()
def test_ordered():
    vocab = u("alfa bravo charlie delta echo foxtrot").split(" ")
    schema = fields.Schema(f=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for combo in permutations(vocab):
        writer.add_document(f=u(" ").join(combo))
    writer.commit()

    with ix.searcher() as s:
        q = query.Ordered([query.Term("f", u("alfa")),
                           query.Term("f", u("charlie")),
                           query.Term("f", u("echo"))])
        for hit in s.search(q):
            words = hit["f"].split()
            assert "alfa" in words
            assert "charlie" in words
            assert "echo" in words
            a = words.index("alfa")
            c = words.index("charlie")
            e = words.index("echo")
            # Ordered queries require the terms to appear in query order.
            assert a < c and c < e, repr(words)
def _do_basic(writerclass):
    # Create the domain data

    # List of individual words added to the index
    words = []
    # List of string values added to the index
    docs = []
    # A ring buffer for creating string values
    buf = deque()
    for combo in permutations(u("abcd")):
        word = "".join(combo)
        # Remember this word is in the index (to check lexicon)
        words.append(word)

        # Add this word on to the end, pop the first word off to create N
        # word documents where N <= 10
        buf.append(word)
        if len(buf) > 10:
            buf.popleft()

        # Create a copy of the buffer and shuffle it to create a document
        # value and add it to the list of document values
        doc = list(buf)
        random.shuffle(doc)
        docs.append(" ".join(doc))
    # Shuffle the list of document values
    random.shuffle(docs)

    schema = fields.Schema(text=fields.TEXT(stored=True, spelling=True,
                                            vector=True),
                           row=fields.NUMERIC(stored=True))

    with TempIndex(schema, storage_debug=True) as ix:
        # Add the domain data to the index
        with writerclass(ix, procs=3) as w:
            for i, value in enumerate(docs):
                w.add_document(text=value, row=i)

        with ix.searcher() as s:
            r = s.reader()

            # Check the lexicon
            for word, term in izip(words, r.field_terms("text")):
                assert word == term
            # Check the doc count
            assert r.doc_count_all() == len(docs)

            # Check the word graph
            assert r.has_word_graph("text")
            flat = [w.decode("latin1")
                    for w in r.word_graph("text").flatten()]
            assert flat == words

            # Check there are lengths
            total = sum(r.doc_field_length(docnum, "text", 0)
                        for docnum in xrange(r.doc_count_all()))
            assert total > 0

            # Check per-doc info
            for i, value in enumerate(docs):
                pieces = value.split()
                docnum = s.document_number(row=i)

                # Check stored value
                sv = r.stored_fields(docnum)
                assert sv["text"] == value

                # Check vectors
                vr = r.vector(docnum, "text")
                # Get the terms and positions from the vector matcher
                iv = list(vr.items_as("positions"))
                # What the vector should look like
                ov = sorted((text, [i]) for i, text in enumerate(pieces))
                assert iv == ov

                # Check field length
                assert r.doc_field_length(docnum, "text") == len(pieces)