def indexloc(self):
    from sidr.orm import db
    import json
    from whoosh.index import create_in, open_dir, exists_in
    from whoosh import fields, qparser, query

    # NGRAMWORDS stores word n-grams (3 to 15 characters) so partial name
    # matches are possible at query time.
    schema = fields.Schema(gid=fields.TEXT(stored=True),
                           country_code=fields.ID(stored=True),
                           names=fields.NGRAMWORDS(stored=True, minsize=3, maxsize=15))

    # Create the "adms" index on first run, then open it for writing.
    if not exists_in("indexer", indexname="adms"):
        ix = create_in("indexer", schema, indexname="adms")
    ix = open_dir("indexer", indexname="adms")
    writer = ix.writer()

    """
    with ix.searcher() as s:
        qp = qparser.QueryParser("names", schema=ix.schema)
        q = qp.parse(u"Westonia")
        # results = s.search(q, limit=20, filter=query.Term("country_code", "AU"))
        results = s.documents()
        # results = searcher.search('hey', terms=True)
        # qp = qparser.QueryParser("content", ix.schema)
        # results = searcher.search(user_q)
        for res in results:
            print(repr(res))
    """

    # Index every geoname row, combining the primary, ASCII and alternate
    # names into the n-gram field.
    rows = db.engine.execute('SELECT * FROM geoname')
    for row in rows:
        writer.add_document(
            gid=str(row['id']),
            country_code=row['country_code'],
            names="%s , %s , %s" % (row['name'], row['asciiname'], row['name_alternate']))
    writer.commit()
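The commented-out block above hints at how the "adms" index might be searched once it is built. The following is a minimal sketch of such a lookup; the search_adm() helper, its parameters, and the example "Westonia"/"AU" query are illustrative and assume the index created by indexloc() already exists in the "indexer" directory.

from whoosh.index import open_dir
from whoosh import qparser, query

def search_adm(text, country_code=None, limit=20):
    # Open the index written by indexloc() above (assumes it exists).
    ix = open_dir("indexer", indexname="adms")
    with ix.searcher() as s:
        qp = qparser.QueryParser("names", schema=ix.schema)
        q = qp.parse(text)
        # Optionally restrict hits to a single country, as the
        # commented-out example above suggests.
        flt = query.Term("country_code", country_code) if country_code else None
        return [hit.fields() for hit in s.search(q, limit=limit, filter=flt)]

# Hypothetical usage:
# search_adm(u"Westonia", country_code="AU")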
def test_ngramwords():
    schema = fields.Schema(grams=fields.NGRAMWORDS(queryor=True))
    parser = default.QueryParser('grams', schema)
    q = parser.parse(u("Hello Tom"))
    assert q.__class__ == query.And
    assert q[0].__class__ == query.Or
    assert q[1].__class__ == query.Term
    assert q[0][0].text == "hell"
    assert q[0][1].text == "ello"
    assert q[1].text == "tom"
def test_ngramwords():
    schema = fields.Schema(grams=fields.NGRAMWORDS(queryor=True))
    parser = default.QueryParser('grams', schema)
    q = parser.parse(u("Hello Tom"))
    assert_equal(q.__class__, query.And)
    assert_equal(q[0].__class__, query.Or)
    assert_equal(q[1].__class__, query.Term)
    assert_equal(q[0][0].text, "hell")
    assert_equal(q[0][1].text, "ello")
    assert_equal(q[1].text, "tom")
def test_highlight_ngrams():
    schema = fields.Schema(text=fields.NGRAMWORDS(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("Multiplication and subtraction are good"))

    with ix.searcher() as s:
        qp = qparser.QueryParser("text", ix.schema)
        q = qp.parse(u("multiplication"))
        r = s.search(q)
        assert r.scored_length() == 1

        r.fragmenter = highlight.SentenceFragmenter()
        r.formatter = highlight.UppercaseFormatter()
        snippet = r[0].highlights("text")
        assert snippet == "MULTIPLICATIon and subtracTION are good"
def test_nested_skip():
    schema = fields.Schema(
        id=fields.ID(unique=True, stored=True),
        name=fields.TEXT(stored=True),
        name_ngrams=fields.NGRAMWORDS(minsize=4, field_boost=1.2),
        type=fields.TEXT,
    )

    domain = [(u"book_1", u"The Dark Knight Returns", u"book"),
              (u"chapter_1", u"The Dark Knight Returns", u"chapter"),
              (u"chapter_2", u"The Dark Knight Triumphant", u"chapter"),
              (u"chapter_3", u"Hunt the Dark Knight", u"chapter"),
              (u"chapter_4", u"The Dark Knight Falls", u"chapter")]

    with TempIndex(schema) as ix:
        with ix.writer() as w:
            for id, name, typ in domain:
                w.add_document(id=id, name=name, name_ngrams=name, type=typ)

        with ix.searcher() as s:
            all_parents = query.Term("type", "book")
            wanted_parents = query.Term("name", "dark")
            children_of_wanted_parents = query.NestedChildren(all_parents,
                                                              wanted_parents)

            r1 = s.search(children_of_wanted_parents)
            assert r1.scored_length() == 4
            assert [hit["id"] for hit in r1] == ["chapter_1", "chapter_2",
                                                 "chapter_3", "chapter_4"]

            wanted_children = query.And([query.Term("type", "chapter"),
                                         query.Term("name", "hunt")])

            r2 = s.search(wanted_children)
            assert r2.scored_length() == 1
            assert [hit["id"] for hit in r2] == ["chapter_3"]

            complex_query = query.And([children_of_wanted_parents,
                                       wanted_children])

            r3 = s.search(complex_query)
            assert r3.scored_length() == 1
            assert [hit["id"] for hit in r3] == ["chapter_3"]
class CatalogueResourceSchema(fields.SchemaClass):
    pk = fields.ID(stored=True, unique=True)
    vendor_name = fields.ID
    name = fields.TEXT(stored=True)
    vendor = fields.TEXT(stored=True, spelling=True)
    version = fields.TEXT(stored=True)
    template_uri = fields.STORED
    type = fields.TEXT(stored=True)
    creation_date = fields.DATETIME
    title = fields.TEXT(stored=True, spelling=True)
    image = fields.STORED
    smartphoneimage = fields.STORED
    description = fields.TEXT(stored=True, spelling=True)
    wiring = fields.TEXT(spelling=True)
    public = fields.BOOLEAN
    users = fields.KEYWORD(commas=True)
    groups = fields.KEYWORD(commas=True)
    content = fields.NGRAMWORDS()
from whoosh import fields

schema = fields.Schema(
    # Identifier of this entry in the respective DB table
    id=fields.ID(stored=True),
    # Name (or names) of this journal/conference/publisher
    name=fields.NGRAMWORDS(queryor=True, stored=True),
    # Names of science domains for this entry
    domains=fields.KEYWORD(commas=True, stored=True),
)
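A short sketch of how this schema might be exercised; the directory name, the build_example_index() and search_names() helpers, and the sample records below are illustrative assumptions, not part of the original project.

import os
from whoosh import index, qparser

def build_example_index(dirname="journal_index"):
    # Create an on-disk index with the schema above and add two sample entries.
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    ix = index.create_in(dirname, schema)
    writer = ix.writer()
    writer.add_document(id=u"1", name=u"Journal of Machine Learning Research",
                        domains=u"machine learning,statistics")
    writer.add_document(id=u"2", name=u"Nature Communications",
                        domains=u"biology,physics,chemistry")
    writer.commit()
    return ix

def search_names(ix, text):
    # Because the field was declared with queryor=True, the parsed query
    # ORs the word n-grams, so partial words such as "machin" still match.
    qp = qparser.QueryParser("name", schema=ix.schema)
    with ix.searcher() as s:
        return [hit["name"] for hit in s.search(qp.parse(text))]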
def test_ngramwords_tokenizer():
    tk = analysis.CommaSeparatedTokenizer()
    tags = fields.NGRAMWORDS(minsize=3, maxsize=50, tokenizer=tk,
                             stored=True, queryor=True)
    schema = fields.Schema(tags=tags)