def test_fieldboost():
    """Boosting one field of an Or query changes the hit ranking."""
    schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=0, a=u("alfa bravo charlie"), b=u("echo foxtrot india"))
    w.add_document(id=1, a=u("delta bravo charlie"), b=u("alfa alfa alfa"))
    w.add_document(id=2, a=u("alfa alfa alfa"), b=u("echo foxtrot india"))
    w.add_document(id=3, a=u("alfa sierra romeo"), b=u("alfa tango echo"))
    w.add_document(id=4, a=u("bravo charlie delta"), b=u("alfa foxtrot india"))
    w.add_document(id=5, a=u("alfa alfa echo"), b=u("tango tango tango"))
    w.add_document(id=6, a=u("alfa bravo echo"), b=u("alfa alfa tango"))
    w.commit()

    def field_booster(fieldname, factor=2.0):
        """Return a visitor that multiplies the boost of leaf queries on
        ``fieldname`` by ``factor``, leaving other nodes untouched."""
        def visit(node):
            if node.is_leaf() and node.field() == fieldname:
                # Deep-copy so the original query tree is not mutated.
                node = copy.deepcopy(node)
                node.boost *= factor
                return node
            else:
                return node
        return visit

    with ix.searcher() as s:
        q = Or([Term("a", u("alfa")), Term("b", u("alfa"))])
        q = q.accept(field_booster("a", 100.0))
        assert_equal(text_type(q), text_type("(a:alfa^100.0 OR b:alfa)"))
        r = s.search(q)
        # Documents matching in the boosted "a" field should rank first.
        assert_equal([hit["id"] for hit in r], [2, 5, 6, 3, 0, 1, 4])
def __unicode__(self):
    """Render this DisMax query as ``DisMax(subquery ...)``, with the
    subqueries sorted, plus a ``~tiebreak`` suffix when tiebreak is set."""
    subs = sorted(text_type(s) for s in self.subqueries)
    rep = u("DisMax(") + " ".join(subs) + u(")")
    if self.tiebreak:
        rep += u("~") + text_type(self.tiebreak)
    return rep
def test_gtlt():
    """GtLtPlugin turns >/>=/</<= prefixes into open-ended range queries."""
    schema = fields.Schema(a=fields.KEYWORD, b=fields.NUMERIC,
                           c=fields.KEYWORD, d=fields.NUMERIC(float),
                           e=fields.DATETIME)
    qp = qparser.QueryParser("a", schema)
    qp.add_plugin(plugins.GtLtPlugin())
    qp.add_plugin(dateparse.DateParserPlugin())

    q = qp.parse(u("a:hello b:>100 c:<=z there"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 4)
    assert_equal(q[0], query.Term("a", "hello"))
    assert_equal(q[1], query.NumericRange("b", 100, None, startexcl=True))
    assert_equal(q[2], query.TermRange("c", None, 'z'))
    assert_equal(q[3], query.Term("a", "there"))

    q = qp.parse(u("hello e:>'29 mar 2001' there"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 3)
    assert_equal(q[0], query.Term("a", "hello"))
    # As of this writing, date ranges don't support startexcl/endexcl
    assert_equal(q[1], query.DateRange("e", datetime(2001, 3, 29, 0, 0), None))
    assert_equal(q[2], query.Term("a", "there"))

    # A dangling operator with no value degrades to plain terms.
    q = qp.parse(u("a:> alfa c:<= bravo"))
    assert_equal(text_type(q), "(a:a: AND a:alfa AND a:c: AND a:bravo)")

    # Without the fields/range plugins everything parses as terms.
    qp.remove_plugin_class(plugins.FieldsPlugin)
    qp.remove_plugin_class(plugins.RangePlugin)
    q = qp.parse(u("hello a:>500 there"))
    assert_equal(text_type(q), "(a:hello AND a:a: AND a:500 AND a:there)")
def test_dismax():
    """DisMaxParser expands each word across the weighted field map."""
    parser = default.DisMaxParser({"body": 0.8, "title": 2.5}, None)

    q = parser.parse(u("alfa bravo charlie"))
    assert_equal(text_type(q),
                 "(DisMax(body:alfa^0.8 title:alfa^2.5) OR " +
                 "DisMax(body:bravo^0.8 title:bravo^2.5) OR " +
                 "DisMax(body:charlie^0.8 title:charlie^2.5))")

    # "+" marks a required word (ANDMAYBE the rest).
    q = parser.parse(u("alfa +bravo charlie"))
    assert_equal(text_type(q),
                 "(DisMax(body:bravo^0.8 title:bravo^2.5) ANDMAYBE " +
                 "(DisMax(body:alfa^0.8 title:alfa^2.5) OR " +
                 "DisMax(body:charlie^0.8 title:charlie^2.5)))")

    # "-" marks an excluded word (ANDNOT).
    q = parser.parse(u("alfa -bravo charlie"))
    assert_equal(text_type(q),
                 "((DisMax(body:alfa^0.8 title:alfa^2.5) OR " +
                 "DisMax(body:charlie^0.8 title:charlie^2.5)) ANDNOT " +
                 "DisMax(body:bravo^0.8 title:bravo^2.5))")

    q = parser.parse(u("alfa -bravo +charlie"))
    assert_equal(text_type(q),
                 "((DisMax(body:charlie^0.8 title:charlie^2.5) ANDMAYBE " +
                 "DisMax(body:alfa^0.8 title:alfa^2.5)) ANDNOT " +
                 "DisMax(body:bravo^0.8 title:bravo^2.5))")
def test_fieldname_chars():
    """Field aliases work with digits and non-ASCII (Arabic) field names."""
    s = fields.Schema(abc123=fields.TEXT, nisbah=fields.KEYWORD)
    qp = default.QueryParser("content", s)
    fieldmap = {'nisbah': [u('\u0646\u0633\u0628\u0629')],
                'abc123': ['xyz']}
    qp.add_plugin(plugins.FieldAliasPlugin(fieldmap))

    q = qp.parse(u("abc123:456"))
    assert_equal(q.__class__, query.Term)
    assert_equal(q.fieldname, u('abc123'))
    assert_equal(q.text, u('456'))

    q = qp.parse(u("abc123:456 def"))
    assert_equal(text_type(q), u("(abc123:456 AND content:def)"))

    # The Arabic alias resolves to the "nisbah" field.
    q = qp.parse(u('\u0646\u0633\u0628\u0629:\u0627\u0644\u0641\u0644\u0633'
                   '\u0637\u064a\u0646\u064a'))
    assert_equal(q.__class__, query.Term)
    assert_equal(q.fieldname, u('nisbah'))
    assert_equal(q.text,
                 u('\u0627\u0644\u0641\u0644\u0633\u0637\u064a\u0646\u064a'))

    q = qp.parse(u("abc123 (xyz:123 OR qrs)"))
    assert_equal(text_type(q),
                 "(content:abc123 AND (abc123:123 OR content:qrs))")
def test_not_assoc():
    """NOT binds tighter than OR; parens override the default grouping."""
    qp = default.QueryParser("text", None)
    q = qp.parse(u("a AND NOT b OR c"))
    assert_equal(text_type(q), "((text:a AND NOT text:b) OR text:c)")

    qp = default.QueryParser("text", None)
    q = qp.parse(u("a NOT (b OR c)"))
    assert_equal(text_type(q), "(text:a AND NOT (text:b OR text:c))")
def test_not_assoc():
    """NOT binds tighter than OR; parens override the default grouping.

    (Plain-assert duplicate of the ``assert_equal`` version of this test.)
    """
    qp = default.QueryParser("text", None)
    q = qp.parse(u("a AND NOT b OR c"))
    assert text_type(q) == "((text:a AND NOT text:b) OR text:c)"

    qp = default.QueryParser("text", None)
    q = qp.parse(u("a NOT (b OR c)"))
    assert text_type(q) == "(text:a AND NOT (text:b OR text:c))"
def test_paren_fieldname():
    """A field prefix may apply to a whole parenthesized group."""
    schema = fields.Schema(kind=fields.ID, content=fields.TEXT)
    qp = default.QueryParser("content", schema)

    q = qp.parse(u("(kind:1d565 OR kind:7c584) AND (stuff)"))
    assert text_type(q) == "((kind:1d565 OR kind:7c584) AND content:stuff)"

    # "kind:(...)" distributes the field over the group.
    q = qp.parse(u("kind:(1d565 OR 7c584) AND (stuff)"))
    assert text_type(q) == "((kind:1d565 OR kind:7c584) AND content:stuff)"
def test_paren_fieldname():
    """A field prefix may apply to a whole parenthesized group.

    (``assert_equal`` duplicate of the plain-assert version of this test.)
    """
    schema = fields.Schema(kind=fields.ID, content=fields.TEXT)
    qp = default.QueryParser("content", schema)

    q = qp.parse(u("(kind:1d565 OR kind:7c584) AND (stuff)"))
    assert_equal(text_type(q),
                 "((kind:1d565 OR kind:7c584) AND content:stuff)")

    q = qp.parse(u("kind:(1d565 OR 7c584) AND (stuff)"))
    assert_equal(text_type(q),
                 "((kind:1d565 OR kind:7c584) AND content:stuff)")
def test_operator_queries():
    """Infix operators group with their documented precedence."""
    qp = default.QueryParser("f", None)

    q = qp.parse("a AND b OR c AND d")
    assert_equal(text_type(q), "((f:a AND f:b) OR (f:c AND f:d))")

    q = qp.parse("a OR b OR c OR d")
    assert_equal(text_type(q), "(f:a OR f:b OR f:c OR f:d)")

    q = qp.parse("a ANDMAYBE b ANDNOT c REQUIRE d")
    assert_equal(text_type(q),
                 "((f:a ANDMAYBE (f:b ANDNOT f:c)) REQUIRE f:d)")
def test_operator_queries():
    """Infix operators group with their documented precedence.

    (Plain-assert duplicate of the ``assert_equal`` version of this test.)
    """
    qp = default.QueryParser("f", None)

    q = qp.parse("a AND b OR c AND d")
    assert text_type(q) == "((f:a AND f:b) OR (f:c AND f:d))"

    q = qp.parse("a OR b OR c OR d")
    assert text_type(q) == "(f:a OR f:b OR f:c OR f:d)"

    q = qp.parse("a ANDMAYBE b ANDNOT c REQUIRE d")
    assert text_type(q) == "((f:a ANDMAYBE (f:b ANDNOT f:c)) REQUIRE f:d)"
def test_simple():
    """SimpleParser: bare words OR together; +/- mark required/excluded."""
    parser = default.SimpleParser("x", None)

    q = parser.parse(u("alfa bravo charlie delta"))
    assert_equal(text_type(q),
                 "(x:alfa OR x:bravo OR x:charlie OR x:delta)")

    q = parser.parse(u("alfa +bravo charlie delta"))
    assert_equal(text_type(q),
                 "(x:bravo ANDMAYBE (x:alfa OR x:charlie OR x:delta))")

    q = parser.parse(u("alfa +bravo -charlie delta"))
    assert_equal(text_type(q),
                 "((x:bravo ANDMAYBE (x:alfa OR x:delta)) ANDNOT x:charlie)")

    # Whitespace after +/- still attaches the sign to the next word.
    q = parser.parse(u("- alfa +bravo + delta"))
    assert_equal(text_type(q),
                 "((x:bravo AND x:delta) ANDNOT x:alfa)")
def test_dismax():
    """DisMaxParser expands each word across the weighted field map.

    (Duplicate of the other ``test_dismax`` with unwrapped expectations.)
    """
    parser = default.DisMaxParser({"body": 0.8, "title": 2.5}, None)

    q = parser.parse(u("alfa bravo charlie"))
    assert_equal(text_type(q),
                 "(DisMax(body:alfa^0.8 title:alfa^2.5) OR "
                 "DisMax(body:bravo^0.8 title:bravo^2.5) OR "
                 "DisMax(body:charlie^0.8 title:charlie^2.5))")

    q = parser.parse(u("alfa +bravo charlie"))
    assert_equal(text_type(q),
                 "(DisMax(body:bravo^0.8 title:bravo^2.5) ANDMAYBE "
                 "(DisMax(body:alfa^0.8 title:alfa^2.5) OR "
                 "DisMax(body:charlie^0.8 title:charlie^2.5)))")

    q = parser.parse(u("alfa -bravo charlie"))
    assert_equal(text_type(q),
                 "((DisMax(body:alfa^0.8 title:alfa^2.5) OR "
                 "DisMax(body:charlie^0.8 title:charlie^2.5)) ANDNOT "
                 "DisMax(body:bravo^0.8 title:bravo^2.5))")

    q = parser.parse(u("alfa -bravo +charlie"))
    assert_equal(text_type(q),
                 "((DisMax(body:charlie^0.8 title:charlie^2.5) ANDMAYBE "
                 "DisMax(body:alfa^0.8 title:alfa^2.5)) ANDNOT "
                 "DisMax(body:bravo^0.8 title:bravo^2.5))")
def test_update2():
    """update_document on a unique key leaves one doc per distinct key."""
    schema = fields.Schema(key=fields.ID(unique=True, stored=True),
                           p=fields.ID(stored=True))
    with TempIndex(schema, "update2") as ix:
        nums = list(range(21))
        random.shuffle(nums)
        # 21 updates over 10 distinct keys (n % 10), one commit each.
        for i, n in enumerate(nums):
            w = ix.writer()
            w.update_document(key=text_type(n % 10), p=text_type(i))
            w.commit()

        with ix.searcher() as s:
            keys = sorted(d["key"] for _, d in s.iter_docs())
            assert " ".join(keys) == "0 1 2 3 4 5 6 7 8 9"
def test_update2():
    """update_document on a unique key leaves one doc per distinct key.

    (``assert_equal`` duplicate of the iter_docs version, with 100 updates.)
    """
    schema = fields.Schema(key=fields.ID(unique=True, stored=True),
                           p=fields.ID(stored=True))
    with TempIndex(schema, "update2") as ix:
        nums = list(range(100))
        random.shuffle(nums)
        # 100 updates over 10 distinct keys (n % 10), one commit each.
        for i, n in enumerate(nums):
            w = ix.writer()
            w.update_document(key=text_type(n % 10), p=text_type(i))
            w.commit()

        with ix.searcher() as s:
            results = sorted(d["key"] for d in s.all_stored_fields())
            assert_equal(results,
                         ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])
def test_andnot():
    """ANDNOT binds to its immediate neighbors and nests inside AND."""
    qp = default.QueryParser("content", None)

    q = qp.parse(u("this ANDNOT that"))
    assert q.__class__ == query.AndNot
    assert q.a.__class__ == query.Term
    assert q.b.__class__ == query.Term
    assert q.a.text == "this"
    assert q.b.text == "that"

    # Trailing word stays outside the ANDNOT pair.
    q = qp.parse(u("foo ANDNOT bar baz"))
    assert q.__class__ == query.And
    assert len(q) == 2
    assert q[0].__class__ == query.AndNot
    assert q[1].__class__ == query.Term

    q = qp.parse(u("foo fie ANDNOT bar baz"))
    assert q.__class__ == query.And
    assert len(q) == 3
    assert q[0].__class__ == query.Term
    assert q[1].__class__ == query.AndNot
    assert q[2].__class__ == query.Term

    q = qp.parse(u("a AND b ANDNOT c"))
    assert q.__class__ == query.AndNot
    assert text_type(q) == "((content:a AND content:b) ANDNOT content:c)"
def test_asyncwriter_no_stored():
    """Twenty near-simultaneous AsyncWriter commits all land in the index."""
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "asyncnostored") as ix:
        domain = (u"alfa", u"bravo", u"charlie", u"delta", u"echo",
                  u"foxtrot", u"golf", u"hotel", u"india")

        # Simulate doing 20 (near-)simultaneous commits. If we weren't using
        # AsyncWriter, at least some of these would fail because the first
        # writer wouldn't be finished yet.
        writers = []
        for i in xrange(20):
            w = writing.AsyncWriter(ix)
            writers.append(w)
            w.add_document(id=text_type(i),
                           text=u" ".join(random.sample(domain, 5)))
            w.commit()

        # Wait for all writers to finish before checking the results
        for w in writers:
            if w.running:
                w.join()

        # Check whether all documents made it into the index.
        with ix.reader() as r:
            assert sorted(int(id) for id in r.lexicon("id")) == list(range(20))
def test_buffered_update():
    """BufferedWriter.update_document dedupes by unique id across flushes."""
    schema = fields.Schema(id=fields.ID(stored=True, unique=True),
                           payload=fields.STORED)
    with TempIndex(schema, "bufferedupdate") as ix:
        w = writing.BufferedWriter(ix, period=None, limit=5)
        # Re-update the same three ids ten times each; only the final
        # payload for each id should survive.
        for i in xrange(10):
            for char in u"abc":
                fs = dict(id=char, payload=text_type(i) + char)
                w.update_document(**fs)

        with w.reader() as r:
            stored = sorted((sf for _, sf in r.iter_docs()),
                            key=lambda d: d["id"])
            assert stored == [{'id': u('a'), 'payload': u('9a')},
                              {'id': u('b'), 'payload': u('9b')},
                              {'id': u('c'), 'payload': u('9c')}]
            assert r.doc_count() == 3

        w.close()
def test_andnot():
    """ANDNOT binds to its immediate neighbors and nests inside AND.

    (``assert_equal`` duplicate of the plain-assert version of this test.)
    """
    qp = default.QueryParser("content", None)

    q = qp.parse(u("this ANDNOT that"))
    assert_equal(q.__class__, query.AndNot)
    assert_equal(q.a.__class__, query.Term)
    assert_equal(q.b.__class__, query.Term)
    assert_equal(q.a.text, "this")
    assert_equal(q.b.text, "that")

    q = qp.parse(u("foo ANDNOT bar baz"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 2)
    assert_equal(q[0].__class__, query.AndNot)
    assert_equal(q[1].__class__, query.Term)

    q = qp.parse(u("foo fie ANDNOT bar baz"))
    assert_equal(q.__class__, query.And)
    assert_equal(len(q), 3)
    assert_equal(q[0].__class__, query.Term)
    assert_equal(q[1].__class__, query.AndNot)
    assert_equal(q[2].__class__, query.Term)

    q = qp.parse(u("a AND b ANDNOT c"))
    assert_equal(q.__class__, query.AndNot)
    assert_equal(text_type(q), "((content:a AND content:b) ANDNOT content:c)")
def __unicode__(self):
    """Render this compound query as ``(sub JOINT sub ...)``, with a
    ``>minmatch`` suffix when a minimum-match count is set."""
    joined = (self.JOINT).join([text_type(s) for s in self.subqueries])
    rep = u("(") + joined + u(")")
    if self.minmatch:
        rep += u(">%s") % self.minmatch
    return rep
def test_asyncwriter_no_stored():
    """Twenty near-simultaneous AsyncWriter commits all land in the index.

    (``assert_equal`` duplicate of the plain-assert version of this test.)
    """
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "asyncnostored") as ix:
        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
                  u("foxtrot"), u("golf"), u("hotel"), u("india"))

        # Simulate doing 20 (near-)simultaneous commits. If we weren't using
        # AsyncWriter, at least some of these would fail because the first
        # writer wouldn't be finished yet.
        writers = []
        for i in xrange(20):
            w = writing.AsyncWriter(ix)
            writers.append(w)
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
            w.commit()

        # Wait for all writers to finish before checking the results
        for w in writers:
            if w.running:
                w.join()

        # Check whether all documents made it into the index.
        with ix.reader() as r:
            assert_equal(sorted(int(id) for id in r.lexicon("id")),
                         list(range(20)))
def test_buffered_update():
    """BufferedWriter.update_document dedupes by unique id across flushes.

    (``assert_equal`` duplicate using ``all_stored_fields``.)
    """
    schema = fields.Schema(id=fields.ID(stored=True, unique=True),
                           payload=fields.STORED)
    with TempIndex(schema, "bufferedupdate") as ix:
        w = writing.BufferedWriter(ix, period=None, limit=5)
        # Re-update the same three ids ten times each; only the final
        # payload for each id should survive.
        for i in xrange(10):
            for char in u("abc"):
                fs = dict(id=char, payload=text_type(i) + char)
                w.update_document(**fs)

        with w.reader() as r:
            assert_equal(sorted(r.all_stored_fields(),
                                key=lambda x: x["id"]),
                         [{'id': u('a'), 'payload': u('9a')},
                          {'id': u('b'), 'payload': u('9b')},
                          {'id': u('c'), 'payload': u('9c')}])
            assert_equal(r.doc_count(), 3)

        w.close()
def test_field_alias():
    """FieldAliasPlugin maps alias field names onto their canonical field."""
    qp = qparser.QueryParser("content", None)
    qp.add_plugin(plugins.FieldAliasPlugin({"title": ("article", "caption")}))
    q = qp.parse("alfa title:bravo article:charlie caption:delta")
    # Both "article" and "caption" collapse to "title".
    assert_equal(text_type(q),
                 u("(content:alfa AND title:bravo AND title:charlie AND " +
                   "title:delta)"))
def test_copyfield():
    """CopyFieldPlugin duplicates terms into a second field, with the
    grouping class (AND/ANDMAYBE/REQUIRE/OR) and mirroring configurable."""
    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, None))
    assert_equal(text_type(qp.parse("hello b:matt")),
                 "(a:hello AND b:matt AND c:matt)")

    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, syntax.AndMaybeGroup))
    assert_equal(text_type(qp.parse("hello b:matt")),
                 "(a:hello AND (b:matt ANDMAYBE c:matt))")

    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, syntax.RequireGroup))
    assert_equal(text_type(qp.parse("hello (there OR b:matt)")),
                 "(a:hello AND (a:there OR (b:matt REQUIRE c:matt)))")

    # Copying from the default field applies to every bare word.
    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"a": "c"}, syntax.OrGroup))
    assert_equal(text_type(qp.parse("hello there")),
                 "((a:hello OR c:hello) AND (a:there OR c:there))")

    # mirror=True also copies in the reverse direction.
    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, mirror=True))
    assert_equal(text_type(qp.parse("hello c:matt")),
                 "(a:hello AND (c:matt OR b:matt))")

    qp = qparser.QueryParser("a", None)
    qp.add_plugin(plugins.CopyFieldPlugin({"c": "a"}, mirror=True))
    assert_equal(text_type(qp.parse("hello c:matt")),
                 "((a:hello OR c:hello) AND (c:matt OR a:matt))")

    # Copy into a phonetic (double-metaphone) companion field.
    ana = analysis.RegexAnalyzer(r"\w+") | analysis.DoubleMetaphoneFilter()
    fmt = formats.Frequency()
    schema = fields.Schema(name=fields.KEYWORD,
                           name_phone=fields.FieldType(fmt, ana,
                                                       multitoken_query="or"))
    qp = qparser.QueryParser("name", schema)
    qp.add_plugin(plugins.CopyFieldPlugin({"name": "name_phone"}))
    assert_equal(text_type(qp.parse(u("spruce view"))),
                 "((name:spruce OR name_phone:SPRS) AND "
                 "(name:view OR name_phone:F OR name_phone:FF))")
def test_multifield():
    """MultifieldParser spreads unqualified words over all default fields
    while leaving explicitly-qualified terms (cat:) alone."""
    schema = fields.Schema(content=fields.TEXT, title=fields.TEXT,
                           cat=fields.KEYWORD, date=fields.DATETIME)
    qs = u("a (b c cat:d) OR (b c cat:e)")
    qp = default.MultifieldParser(['x', 'y'], schema)
    q = qp.parse(qs)
    assert_equal(text_type(q),
                 "((x:a OR y:a) AND (((x:b OR y:b) AND (x:c OR y:c) "
                 "AND cat:d) OR ((x:b OR y:b) AND (x:c OR y:c) AND cat:e)))")
def _check(schema, **kwargs):
    """Index the shared ``docs`` corpus under ``schema`` and verify
    more-like-this results for document "1" (with an explicit model)."""
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for i, text in enumerate(docs):
            w.add_document(id=text_type(i + 1), text=text)

    with ix.searcher() as s:
        docnum = s.document_number(id=u("1"))
        r = s.more_like(docnum, "text", model=model, **kwargs)
        assert [hit["id"] for hit in r] == ["6", "2", "3"]
def _check(schema, **kwargs):
    """Index the shared ``docs`` corpus under ``schema`` and verify
    more-like-this results for document "1" (default scoring model)."""
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for i, text in enumerate(docs):
            w.add_document(id=text_type(i + 1), text=text)

    with ix.searcher() as s:
        docnum = s.document_number(id=u("1"))
        r = s.more_like(docnum, "text", **kwargs)
        assert [hit["id"] for hit in r] == ["6", "2", "3"]
def test_bigsort():
    """Benchmark-style test: index 30k random dates on disk and time
    sorted searches three different ways (uses ``sortable_terms``)."""
    times = 30000
    dirname = "testindex"
    df = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=df)

    # Start from a clean on-disk index directory.
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    ix = index.create_in(dirname, schema)

    print("Writing...")
    t = now()
    w = ix.writer(limitmb=512)
    for i in xrange(times):
        dt = datetime.fromtimestamp(random.randint(15839593, 1294102139))
        w.add_document(id=text_type(i), date=dt)
    w.commit()
    print("Writing took ", now() - t)

    ix = index.open_dir(dirname)
    s = ix.searcher()
    q = query.Wildcard("id", "1?2*")

    # Walk the sortable terms and their postings, timing each phase.
    t = now()
    x = list(df.sortable_terms(s.reader(), "date"))
    print(now() - t, len(x))

    t = now()
    for y in x:
        p = list(s.postings("date", y).all_ids())
    print(now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date", reverse=True)
    print("Search 1 took", now() - t)
    print("len=", r.scored_length())

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    # Baseline: manual nlargest over stored fields.
    from heapq import nlargest
    t = now()
    sf = s.stored_fields
    gen = ((sf(n)["date"], n) for n in q.docs(s))
    r = nlargest(25, gen)
    print(now() - t)
def test_bigsort():
    """Benchmark-style test: index 30k random dates on disk and time
    sorted searches three different ways (uses ``sortable_values`` —
    presumably an older API spelling than ``sortable_terms``; verify
    against the whoosh version in use)."""
    times = 30000
    dirname = "testindex"
    df = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=df)

    # Start from a clean on-disk index directory.
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    ix = index.create_in(dirname, schema)

    print("Writing...")
    t = now()
    w = ix.writer(limitmb=512)
    for i in xrange(times):
        dt = datetime.fromtimestamp(random.randint(15839593, 1294102139))
        w.add_document(id=text_type(i), date=dt)
    w.commit()
    print("Writing took ", now() - t)

    ix = index.open_dir(dirname)
    s = ix.searcher()
    q = query.Wildcard("id", "1?2*")

    # Walk the sortable values and their postings, timing each phase.
    t = now()
    x = list(df.sortable_values(s.reader(), "date"))
    print(now() - t, len(x))

    t = now()
    for y in x:
        p = list(s.postings("date", y).all_ids())
    print(now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date", reverse=True)
    print("Search 1 took", now() - t)
    print("len=", r.scored_length())

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    # Baseline: manual nlargest over stored fields.
    from heapq import nlargest
    t = now()
    sf = s.stored_fields
    gen = ((sf(n)["date"], n) for n in q.docs(s))
    r = nlargest(25, gen)
    print(now() - t)
def __unicode__(self):
    """Render this term query as ``field:text`` with an optional
    ``^boost`` suffix; bytes text is decoded (or repr'd if non-ASCII)."""
    text = self.text
    if isinstance(text, bytes_type):
        try:
            text = text.decode("ascii")
        except UnicodeDecodeError:
            # Fall back to a printable representation for binary terms.
            text = repr(text)
    rep = u("%s:%s") % (self.fieldname, text)
    if self.boost != 1:
        rep += u("^") + text_type(self.boost)
    return rep
def test_many_updates():
    """10k random updates on a unique key never produce duplicate keys."""
    schema = fields.Schema(key=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "manyupdates") as ix:
        for _ in xrange(10000):
            num = random.randint(0, 5000)
            w = ix.writer()
            w.update_document(key=text_type(num))
            w.commit()

        with ix.searcher() as s:
            result = [d["key"] for d in s.search(query.Every())]
            # Every key in the index must be distinct.
            assert_equal(len(result), len(set(result)))
def test_fieldname_chars():
    """Field aliases work with digits and non-ASCII (Arabic) field names.

    (Duplicate of the other ``test_fieldname_chars`` with an unsplit
    Arabic query literal.)
    """
    s = fields.Schema(abc123=fields.TEXT, nisbah=fields.KEYWORD)
    qp = default.QueryParser("content", s)
    fieldmap = {'nisbah': [u('\u0646\u0633\u0628\u0629')],
                'abc123': ['xyz']}
    qp.add_plugin(plugins.FieldAliasPlugin(fieldmap))

    q = qp.parse(u("abc123:456"))
    assert_equal(q.__class__, query.Term)
    assert_equal(q.fieldname, u('abc123'))
    assert_equal(q.text, u('456'))

    q = qp.parse(u("abc123:456 def"))
    assert_equal(text_type(q), u("(abc123:456 AND content:def)"))

    q = qp.parse(u('\u0646\u0633\u0628\u0629:'
                   '\u0627\u0644\u0641\u0644\u0633\u0637\u064a\u0646\u064a'))
    assert_equal(q.__class__, query.Term)
    assert_equal(q.fieldname, u('nisbah'))
    assert_equal(q.text,
                 u('\u0627\u0644\u0641\u0644\u0633\u0637\u064a\u0646\u064a'))

    q = qp.parse(u("abc123 (xyz:123 OR qrs)"))
    assert_equal(text_type(q),
                 "(content:abc123 AND (abc123:123 OR content:qrs))")
def test_no_stored():
    """Indexing without stored fields still records all terms.

    (``assert_equal`` duplicate of the plain-assert version of this test.)
    """
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "nostored") as ix:
        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
                  u("foxtrot"), u("golf"), u("hotel"), u("india"))

        w = ix.writer()
        for i in xrange(20):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.commit()

        with ix.reader() as r:
            assert_equal(sorted(int(id) for id in r.lexicon("id")),
                         list(range(20)))
def test_buffered():
    """BufferedWriter with limit=10 and merge=False flushes 100 docs
    into exactly 10 segments."""
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "buffered") as ix:
        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
                  u("foxtrot"), u("golf"), u("hotel"), u("india"))

        w = writing.BufferedWriter(ix, period=None, limit=10,
                                   commitargs={"merge": False})
        for i in xrange(100):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        time.sleep(0.5)
        w.close()

        assert_equal(len(ix._segments()), 10)
def make_index(storage, indexname, word2nums, num2words):
    """Creates a Whoosh index in the given storage object containing
    synonyms taken from word2nums and num2words. Returns the Index
    object.
    """
    schema = Schema(word=ID, syns=STORED)
    ix = storage.create_index(schema, indexname=indexname)
    with_writer = ix.writer()
    for word in iterkeys(word2nums):
        syns = synonyms(word2nums, num2words, word)
        with_writer.add_document(word=text_type(word), syns=syns)
    with_writer.commit()
    return ix
def run(self):
    """Thread body: create an index and perform 50 commits, each adding
    a random number (1-100) of documents with random content.

    NOTE(review): reads ``st``, ``dir``, ``schema`` and ``domain`` from
    the enclosing scope — confirm where these are defined.
    """
    ix = st.create_index(dir, schema)
    doc_id = 0
    for i in xrange(50):
        print(i)
        w = ix.writer()
        for _ in xrange(random.randint(1, 100)):
            words = random.sample(domain, random.randint(5, 20))
            w.add_document(id=text_type(doc_id), content=u(" ").join(words))
            doc_id += 1
        w.commit()
        time.sleep(0.1)
def test_update_numeric():
    """update_document dedupes on a unique NUMERIC key."""
    schema = fields.Schema(num=fields.NUMERIC(unique=True, stored=True),
                           text=fields.ID(stored=True))
    with TempIndex(schema, "updatenum") as ix:
        # 15 updates over 5 distinct numeric keys, in random order.
        nums = list(range(5)) * 3
        random.shuffle(nums)
        for num in nums:
            with ix.writer() as w:
                w.update_document(num=num, text=text_type(num))

        with ix.searcher() as s:
            texts = sorted(d["text"] for _, d in s.iter_docs())
            assert " ".join(texts) == "0 1 2 3 4"
def test_no_stored():
    """Indexing without stored fields still records all terms."""
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "nostored") as ix:
        domain = (u"alfa", u"bravo", u"charlie", u"delta", u"echo",
                  u"foxtrot", u"golf", u"hotel", u"india")

        w = ix.writer()
        for i in xrange(20):
            w.add_document(id=text_type(i),
                           text=u" ".join(random.sample(domain, 5)))
        w.commit()

        with ix.reader() as r:
            assert sorted(int(id) for id in r.lexicon("id")) == list(range(20))
def test_finish_segment():
    """MpWriter with a tiny memory limit still commits cleanly."""
    check_multi()
    from whoosh.multiproc import MpWriter

    schema = fields.Schema(a=fields.KEYWORD(stored=True))
    with TempIndex(schema) as ix:
        # Tiny limitmb forces frequent segment flushes during the run.
        w = MpWriter(ix, procs=2, batchsize=1, multisegment=False,
                     limitmb=0.00001)
        for i in range(100):
            w.add_document(a=text_type(i) * 10)
        w.commit()
def test_andor():
    """AND/OR keywords only act as operators when whitespace-delimited."""
    qp = default.QueryParser("a", None)

    q = qp.parse("a AND b OR c AND d OR e AND f")
    assert text_type(q) == "((a:a AND a:b) OR (a:c AND a:d) OR (a:e AND a:f))"

    # "OR" glued to other letters is just part of a word.
    q = qp.parse("aORb")
    assert q == query.Term("a", "aORb")

    q = qp.parse("aOR b")
    assert q == query.And([query.Term("a", "aOR"), query.Term("a", "b")])

    q = qp.parse("a ORb")
    assert q == query.And([query.Term("a", "a"), query.Term("a", "ORb")])

    # A lone "OR" with no operands parses as a plain term.
    assert qp.parse("OR") == query.Term("a", "OR")
def test_andor():
    """AND/OR keywords only act as operators when whitespace-delimited.

    (``assert_equal`` duplicate of the plain-assert version of this test.)
    """
    qp = default.QueryParser("a", None)

    q = qp.parse("a AND b OR c AND d OR e AND f")
    assert_equal(text_type(q),
                 "((a:a AND a:b) OR (a:c AND a:d) OR (a:e AND a:f))")

    q = qp.parse("aORb")
    assert_equal(q, query.Term("a", "aORb"))

    q = qp.parse("aOR b")
    assert_equal(q, query.And([query.Term("a", "aOR"), query.Term("a", "b")]))

    q = qp.parse("a ORb")
    assert_equal(q, query.And([query.Term("a", "a"), query.Term("a", "ORb")]))

    assert_equal(qp.parse("OR"), query.Term("a", "OR"))
def test_buffered():
    """BufferedWriter with limit=10 and merge=False flushes 20 docs
    into exactly 2 segments."""
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "buffered") as ix:
        domain = u"alfa bravo charlie delta echo foxtrot golf hotel india"
        domain = domain.split()

        w = writing.BufferedWriter(ix, period=None, limit=10,
                                   commitargs={"merge": False})
        for i in xrange(20):
            w.add_document(id=text_type(i),
                           text=u" ".join(random.sample(domain, 5)))
        time.sleep(0.1)
        w.close()

        assert len(ix._segments()) == 2