def test_searching():
    with make_index().searcher() as s:
        def _runq(q, result, **kwargs):
            r = s.search(q, **kwargs)
            assert_equal([d["id"] for d in r], result)

        _runq(query.Term("text", u("format")), ["format", "vector"])
        _runq(query.Term("text", u("the")),
              ["fieldtype", "format", "const", "vector", "stored"])
        _runq(query.Prefix("text", u("st")), ["format", "vector", "stored"])
        _runq(query.Wildcard("id", u("*st*")), ["stored", "const"])
        _runq(query.TermRange("id", u("c"), u("s")),
              ["fieldtype", "format", "const"])
        _runq(query.NumericRange("subs", 10, 100),
              ["fieldtype", "format", "vector", "scorable"])
        _runq(query.Phrase("text", ["this", "field"]),
              ["scorable", "unique", "stored"], limit=None)
        _runq(query.Every(),
              ["fieldtype", "format", "vector", "scorable", "stored",
               "unique", "const"])
        _runq(query.Every("subs"),
              ["fieldtype", "format", "vector", "scorable", "stored",
               "unique", "const"])

def test_bigsort():
    times = 30000
    dirname = "testindex"

    df = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=df)

    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    ix = index.create_in(dirname, schema)

    print("Writing...")
    t = now()
    w = ix.writer(limitmb=512)
    for i in xrange(times):
        # Random timestamps between mid-1970 and early 2011
        dt = datetime.fromtimestamp(random.randint(15839593, 1294102139))
        w.add_document(id=text_type(i), date=dt)
    w.commit()
    print("Writing took ", now() - t)

    ix = index.open_dir(dirname)
    s = ix.searcher()
    q = query.Wildcard("id", "1?2*")

    t = now()
    x = list(df.sortable_values(s.reader(), "date"))
    print(now() - t, len(x))

    t = now()
    for y in x:
        # Exhaust the posting list for each unique date value
        p = list(s.postings("date", y).all_ids())
    print(now() - t)

    t = now()
    r = s.search(q, limit=25, sortedby="date", reverse=True)
    print("Search 1 took", now() - t)
    print("len=", r.scored_length())

    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)

    # Run the same search again, now with warm caches
    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 3 took", now() - t)

    # Brute-force baseline: pull the stored date of every matching document
    # and take the 25 largest with a heap
    from heapq import nlargest
    t = now()
    sf = s.stored_fields
    gen = ((sf(n)["date"], n) for n in q.docs(s))
    r = nlargest(25, gen)
    print(now() - t)

def test_tamilprefix(self):
    # Python 2 test: reads a sample term from a fixture file and searches
    # for it as a prefix wildcard against the "content" field.
    with open('/home/nanditha/projects/tamilthedal/trunk/src/encyclopedia/utilities/pyunitwildtext') as f:
        cont = f.readline()
    text = cont.split(':')
    index = open_dir(settings.INDEX_PATH)
    # Decode the first field of the line as UTF-8 and append the wildcard
    wildtext = unicode(str(text[0]), 'utf-8') + u'*'
    qp = query.Wildcard("content", wildtext)
    srch = index.searcher()
    res = srch.search(qp)
    self.assertNotEqual(len(res), 0)
    print len(res), 'results'

def test_patterns():
    domain = u("aaron able acre adage aether after ago ahi aim ajax akimbo "
               "alembic all amiga amount ampere").split()

    schema = fields.Schema(word=fields.KEYWORD(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for word in domain:
            w.add_document(word=word)

    with ix.reader() as r:
        assert_equal(list(r.lexicon("word")), domain)
        assert_equal(list(r.expand_prefix("word", "al")), ["alembic", "all"])

        q = query.Prefix("word", "al")
        assert_equal(q.simplify(r).__unicode__(), "(word:alembic OR word:all)")

        q = query.Wildcard("word", "a*[ae]")
        assert_equal(q.simplify(r).__unicode__(),
                     "(word:able OR word:acre OR word:adage OR "
                     "word:amiga OR word:ampere)")
        assert_equal(q._find_prefix(q.text), "a")

        q = query.Regex("word", "am.*[ae]")
        assert_equal(q.simplify(r).__unicode__(), "(word:amiga OR word:ampere)")
        assert_equal(q._find_prefix(q.text), "am")

        q = query.Regex("word", "able|ago")
        assert_equal(q.simplify(r).__unicode__(), "(word:able OR word:ago)")
        assert_equal(q._find_prefix(q.text), "")

        # special case: ? may mean "zero occurrences"
        q = query.Regex("word", "ah?i")
        assert_equal(q.simplify(r).__unicode__(), "(word:ahi OR word:aim)")
        assert_equal(q._find_prefix(q.text), "a")

        # special case: * may mean "zero occurrences"
        q = query.Regex("word", "ah*i")
        assert_equal(q.simplify(r).__unicode__(), "(word:ahi OR word:aim)")
        assert_equal(q._find_prefix(q.text), "a")

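# A minimal standalone sketch of the prefix extraction the assertions above
# exercise (the pattern string here is illustrative, not from the test):
# Whoosh expands pattern queries by scanning the term lexicon, and a literal
# prefix lets it jump straight to the first candidate term instead of
# reading every term in the field.
from whoosh import query

_q = query.Wildcard("word", "ab*z")
assert _q._find_prefix(_q.text) == "ab"  # everything before the first "*"/"?"
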
def make_wildcard(self, fieldname, text):
    # Fall back to the parser's default field when no field is given
    fieldname = fieldname or self.default_field
    return query.Wildcard(fieldname, text)

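# Hypothetical exercise of the factory above; the parser stand-in and the
# field name are illustrative, not from the original source:
class _StubParser(object):
    default_field = "content"
    make_wildcard = make_wildcard

# With no explicit fieldname the query falls back to the default field,
# i.e. the result is equivalent to query.Wildcard("content", u"wh*sh"):
wq = _StubParser().make_wildcard(None, u"wh*sh")
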
def test_wildcard():
    _run_query(query.Or([query.Wildcard('value', u('*red*')),
                         query.Wildcard('name', u('*yellow*'))]),
               [u("A"), u("C"), u("D"), u("E")])

    # Missing: a pattern matching no indexed term returns no results
    _run_query(query.Wildcard('value', 'glonk*'), [])

def searchPapers_whoosh(year=None, author=None, topic=None, userQuery=None):
    # Open the existing index
    import whoosh.index as index
    import nltk
    nltk.download('wordnet')
    from nltk.stem.wordnet import WordNetLemmatizer

    # Lemmatize the user query, first treating words as nouns, then as verbs
    # (guarded so a missing query does not crash on .split())
    if userQuery:
        lemma = WordNetLemmatizer()
        userQuery = " ".join(lemma.lemmatize(word, 'n')
                             for word in userQuery.split())
        userQuery = " ".join(lemma.lemmatize(word, 'v')
                             for word in userQuery.split())

    index_dir = "../index"
    ix = index.open_dir(index_dir)

    if topic == 'All the topics':
        topic = None
    if year == 'All the years':
        year = None

    # Parse with filter on fields
    from whoosh import query
    from whoosh import qparser
    from whoosh.qparser import QueryParser
    from whoosh.qparser import MultifieldParser

    with ix.searcher() as s:
        if not userQuery:
            qp = QueryParser("id", schema=ix.schema)
            user_q = qp.parse("*")
        else:
            # OrGroup.factory(0.8): OR semantics, but documents matching
            # more of the terms score higher than single-term matches
            og = qparser.OrGroup.factory(0.8)
            # Search both in title and text
            mparser = MultifieldParser(["title", "paper_text"],
                                       schema=ix.schema, group=og)
            user_q = mparser.parse(userQuery)

        # Build the filter: start from NullQuery and AND on each active
        # restriction
        allow_q = query.NullQuery
        if year:
            allow_q = allow_q & query.Term("year", year)
        if author:
            formattedAuthors = author.lower().split()
            for fa in formattedAuthors:
                fa = "*" + fa + "*"
                allow_q = allow_q & query.Wildcard("authors", fa)
        if topic:
            topicParser = qparser.QueryParser("topic", ix.schema)
            allow_q = allow_q & topicParser.parse('"' + topic + '"')

        if not year and not author and not topic:
            results = s.search(user_q, limit=50)
        else:
            results = s.search(user_q, filter=allow_q, limit=50)

        papers = []
        for result in results:
            papers.append(int(result['id']))
        return papers

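# Hypothetical invocation of the helper above (argument values are
# illustrative and depend on how the index under ../index was built):
# papers = searchPapers_whoosh(year=u"2016", author="smith",
#                              topic="All the topics",
#                              userQuery="convolutional networks")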