def __init__(self, doc_path, stopwords=None):
    st = RamStorage()
    st.create()
    schema = Schema(entity1_name=TEXT(stored=True),
                    fieldname=TEXT(stored=True),
                    entity2_name=TEXT(stored=True))
    self.ix = st.create_index(schema)
    writer = self.ix.writer()
    self.remove_stopwords_while_indexing = False
    if stopwords:
        self.remove_stopwords_while_indexing = True
        self.stopwords_dict = read_file_as_dict(stopwords)
    with open(doc_path, 'r') as graph_file:
        reader = csv.DictReader(graph_file, delimiter="\t",
                                fieldnames=['e1_relation', 'e2'])
        for row in tqdm(reader):
            entity_relation, e2 = row['e1_relation'], row['e2']
            tokens = entity_relation.split()
            e1 = tokens[1]
            relation = tokens[2]
            writer.add_document(entity1_name=e1, fieldname=relation,
                                entity2_name=e2)
    writer.commit()

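# Hedged usage sketch (not part of the original class): once the triple index
# above is built, outgoing edges for a head entity can be fetched with the
# standard whoosh query APIs. `lookup_triples` and its argument names are
# hypothetical.
from whoosh.qparser import QueryParser

def lookup_triples(ix, entity):
    with ix.searcher() as searcher:
        parser = QueryParser("entity1_name", schema=ix.schema)
        # All three fields are stored, so each hit yields a complete triple.
        return [(hit["entity1_name"], hit["fieldname"], hit["entity2_name"])
                for hit in searcher.search(parser.parse(entity))]
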
def test_finalweighting():
    from whoosh.scoring import Frequency
    schema = fields.Schema(id=fields.ID(stored=True), summary=fields.TEXT,
                           n_comments=fields.STORED)
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(id=u("1"), summary=u("alfa bravo"), n_comments=5)
    w.add_document(id=u("2"), summary=u("alfa"), n_comments=12)
    w.add_document(id=u("3"), summary=u("bravo"), n_comments=2)
    w.add_document(id=u("4"), summary=u("bravo bravo"), n_comments=7)
    w.commit()

    class CommentWeighting(Frequency):
        use_final = True

        def final(self, searcher, docnum, score):
            ncomments = searcher.stored_fields(docnum).get("n_comments", 0)
            return ncomments

    with ix.searcher(weighting=CommentWeighting()) as s:
        r = s.search(qparser.QueryParser("summary", None).parse("alfa OR bravo"))
        ids = [fs["id"] for fs in r]
        assert_equal(["2", "4", "1", "3"], ids)

class ToolBoxSearch( object ):
    """
    Support searching tools in a toolbox. This implementation uses
    the "whoosh" search library.
    """

    def __init__( self, toolbox ):
        """
        Create a searcher for `toolbox`.
        """
        self.toolbox = toolbox
        self.enabled = tool_search_enabled
        if tool_search_enabled:
            self.build_index()

    def build_index( self ):
        self.storage = RamStorage()
        self.index = self.storage.create_index( schema )
        writer = self.index.writer()
        ## TODO: would also be nice to search section headers.
        for id, tool in self.toolbox.tools_by_id.iteritems():
            writer.add_document( id=id, title=to_unicode(tool.name),
                                 description=to_unicode(tool.description),
                                 help=to_unicode(tool.help) )
        writer.commit()

    def search( self, query, return_attribute='id' ):
        if not tool_search_enabled:
            return []
        # Change field boosts for the searcher to place more weight on title
        # and description than on help.
        searcher = self.index.searcher(
            weighting=BM25F( field_B={ 'title_B': 3, 'description_B': 2, 'help_B': 1 } ) )
        # Set query to search title, description, and help.
        parser = MultifieldParser( [ 'title', 'description', 'help' ], schema=schema )
        results = searcher.search( parser.parse( query ), minscore=2.0 )
        return [ result[ return_attribute ] for result in results ]

def test_pages(self):
    from whoosh.scoring import Frequency
    schema = fields.Schema(id=fields.ID(stored=True), c=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(id=u"1", c=u"alfa alfa alfa alfa alfa alfa")
    w.add_document(id=u"2", c=u"alfa alfa alfa alfa alfa")
    w.add_document(id=u"3", c=u"alfa alfa alfa alfa")
    w.add_document(id=u"4", c=u"alfa alfa alfa")
    w.add_document(id=u"5", c=u"alfa alfa")
    w.add_document(id=u"6", c=u"alfa")
    w.commit()

    s = ix.searcher(weighting=Frequency)
    q = query.Term("c", u"alfa")
    r = s.search(q)
    self.assertEqual([d["id"] for d in r], ["1", "2", "3", "4", "5", "6"])

    r = s.search_page(q, 2, pagelen=2)
    self.assertEqual([d["id"] for d in r], ["3", "4"])

    # Asking for a page past the end returns the last page.
    r = s.search_page(q, 10, pagelen=4)
    self.assertEqual(r.total, 6)
    self.assertEqual(r.pagenum, 2)
    self.assertEqual(r.pagelen, 2)

def test_add_sortable():
    st = RamStorage()
    schema = fields.Schema(chapter=fields.ID(stored=True), price=fields.NUMERIC)
    ix = st.create_index(schema)
    with ix.writer() as w:
        w.add_document(chapter=u("alfa"), price=100)
        w.add_document(chapter=u("bravo"), price=200)
        w.add_document(chapter=u("charlie"), price=300)
        w.add_document(chapter=u("delta"), price=400)

    with ix.writer() as w:
        w.add_document(chapter=u("bravo"), price=500)
        w.add_document(chapter=u("alfa"), price=600)
        w.add_document(chapter=u("delta"), price=100)
        w.add_document(chapter=u("charlie"), price=200)
        w.merge = False

    with ix.reader() as r:
        assert not r.has_column("chapter")
        assert not r.has_column("price")

    with ix.writer() as w:
        sorting.add_sortable(w, "chapter", sorting.StoredFieldFacet("chapter"))
        sorting.add_sortable(w, "price", sorting.FieldFacet("price"))
        w.schema.test = 100

    with ix.reader() as r:
        assert r.has_column("chapter")
        assert r.has_column("price")
        chapr = r.column_reader("chapter")
        pricer = r.column_reader("price")
        assert chapr[0] == "alfa"
        assert pricer[0] == 100

def test_midlevel_writing():
    st = RamStorage()
    schema = fields.Schema(t=fields.TEXT(phrase=False))
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(t=u("alfa bravo charlie delta alfa bravo alfa"))
    w.commit()

    with ix.reader() as r:
        ti = r.termsindex["t", u("alfa")]
        assert_equal(ti.weight(), 3.0)
        assert_equal(ti.doc_frequency(), 1)
        assert_equal(ti.min_length(), 7)
        assert_equal(ti.max_length(), 7)
        assert_equal(ti.max_weight(), 3.0)
        assert_almost_equal(ti.max_wol(), 3.0 / 7)
        assert_equal(ti.postings, ((0,), (3.0,), (b('\x00\x00\x00\x03'),)))

    w = ix.writer()
    w.add_document(t=u("alfa charlie alfa"))
    w.commit()

    with ix.reader() as r:
        ti = r.termsindex["t", u("alfa")]
        assert_equal(ti.weight(), 5.0)
        assert_equal(ti.doc_frequency(), 2)
        assert_equal(ti.min_length(), 3)
        assert_equal(ti.max_length(), 7)
        assert_equal(ti.max_weight(), 3.0)
        assert_almost_equal(ti.max_wol(), 2.0 / 3)
        assert_equal(ti.postings, 0)

def test_finalweighting(self):
    from whoosh.scoring import Weighting
    schema = fields.Schema(id=fields.ID(stored=True), summary=fields.TEXT,
                           n_comments=fields.ID(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(id=u"1", summary=u"alfa bravo", n_comments=u"5")
    w.add_document(id=u"2", summary=u"alfa", n_comments=u"12")
    w.add_document(id=u"3", summary=u"bravo", n_comments=u"2")
    w.add_document(id=u"4", summary=u"bravo bravo", n_comments=u"7")
    w.commit()

    class CommentWeighting(Weighting):
        def score(self, *args, **kwargs):
            return 0

        def final(self, searcher, docnum, score):
            ncomments = int(searcher.stored_fields(docnum).get("n_comments"))
            return ncomments

    s = ix.searcher(weighting=CommentWeighting())
    r = s.search(qparser.QueryParser("summary").parse("alfa OR bravo"))
    ids = [fs["id"] for fs in r]
    self.assertEqual(ids, ["2", "4", "1", "3"])

def test_not2():
    schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u("a"), value=u("alfa bravo charlie delta echo"))
    writer.add_document(name=u("b"), value=u("bravo charlie delta echo foxtrot"))
    writer.add_document(name=u("c"), value=u("charlie delta echo foxtrot golf"))
    writer.add_document(name=u("d"), value=u("delta echo golf hotel india"))
    writer.add_document(name=u("e"), value=u("echo golf hotel india juliet"))
    writer.commit()

    with ix.searcher() as s:
        p = qparser.QueryParser("value", None)
        results = s.search(p.parse("echo NOT golf"))
        assert_equal(sorted([d["name"] for d in results]), ["a", "b"])

        results = s.search(p.parse("echo NOT bravo"))
        assert_equal(sorted([d["name"] for d in results]), ["c", "d", "e"])

    ix.delete_by_term("value", u("bravo"))

    with ix.searcher() as s:
        results = s.search(p.parse("echo NOT charlie"))
        assert_equal(sorted([d["name"] for d in results]), ["d", "e"])

def test_missing_field_scoring():
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u('Frank'), hobbies=u('baseball, basketball'))
    writer.commit()

    r = ix.reader()
    assert_equal(r.field_length("hobbies"), 2)
    assert_equal(r.field_length("name"), 1)
    r.close()

    writer = ix.writer()
    writer.add_document(name=u('Jonny'))
    writer.commit()

    with ix.searcher() as s:
        r = s.reader()
        assert_equal(len(ix._segments()), 1)
        assert_equal(r.field_length("hobbies"), 2)
        assert_equal(r.field_length("name"), 2)

        parser = qparser.MultifieldParser(['name', 'hobbies'], schema)
        q = parser.parse(u("baseball"))
        result = s.search(q)
        assert_equal(len(result), 1)

def test_weighting(self):
    from whoosh.scoring import Weighting
    schema = fields.Schema(id=fields.ID(stored=True),
                           n_comments=fields.ID(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(id=u"1", n_comments=u"5")
    w.add_document(id=u"2", n_comments=u"12")
    w.add_document(id=u"3", n_comments=u"2")
    w.add_document(id=u"4", n_comments=u"7")
    w.commit()

    class CommentWeighting(Weighting):
        def score(self, searcher, fieldnum, text, docnum, weight, QTF=1):
            ncomments = int(searcher.stored_fields(docnum).get("n_comments", "0"))
            return ncomments

    s = ix.searcher(weighting=CommentWeighting())
    r = s.search(qparser.QueryParser("id").parse("[1 TO 4]"))
    ids = [fs["id"] for fs in r]
    self.assertEqual(ids, ["2", "4", "1", "3"])

def search_column_headers(entities, graph, table):
    # Initialize the bigram index.
    schema = Schema(title=NGRAMWORDS(minsize=2, maxsize=4, stored=True,
                                     field_boost=1.0, tokenizer=None,
                                     at='start', queryor=False, sortable=False),
                    uri=TEXT(stored=True))
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    for e in entities:
        entity = entities[e]
        if entity.type != "property":
            continue
        for value in entity.values:
            writer.add_document(title=unicode(value), uri=unicode(e))
    writer.commit()

    # Loop over the literal column headers.
    for column in table.columns:
        query = column.header
        qp = QueryParser("title", schema=ix.schema)
        with ix.searcher() as searcher:
            for word in query.split():
                q = qp.parse(word.strip())
                results = searcher.search(q)
                for result in results:
                    column.candidates.add(result['uri'])

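# Standalone sketch of the n-gram matching idea above: with NGRAMWORDS the
# indexed titles are broken into character n-grams, so a truncated header
# word still finds the full title. Data and names here are illustrative.
from whoosh.fields import NGRAMWORDS, TEXT, Schema
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

schema = Schema(title=NGRAMWORDS(minsize=2, maxsize=4, stored=True, at='start'),
                uri=TEXT(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(title=u"population", uri=u"http://example.org/prop/population")
w.commit()

with ix.searcher() as searcher:
    # "popu" is analyzed into the same leading n-grams as "population".
    q = QueryParser("title", schema=ix.schema).parse(u"popu")
    print([hit["uri"] for hit in searcher.search(q)])
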
def test_deletion(self):
    s = fields.Schema(key=fields.ID, name=fields.TEXT, value=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(key=u"A", name=u"Yellow brown",
                   value=u"Blue red green purple?")
    w.add_document(key=u"B", name=u"Alpha beta",
                   value=u"Gamma delta epsilon omega.")
    w.add_document(key=u"C", name=u"One two", value=u"Three four five.")
    w.commit()

    count = ix.delete_by_term("key", u"B")
    self.assertEqual(count, 1)
    ix.commit()

    # doc_count_all() still counts the deleted document; doc_count() does not.
    self.assertEqual(ix.doc_count_all(), 3)
    self.assertEqual(ix.doc_count(), 2)

    ix.optimize()
    self.assertEqual(ix.doc_count(), 2)
    tr = ix.reader()
    self.assertEqual(list(tr.lexicon("name")), ["brown", "one", "two", "yellow"])
    tr.close()

def test_merged(self):
    sc = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(sc)
    w = ix.writer()
    w.add_document(id=u"alfa", content=u"alfa")
    w.add_document(id=u"bravo", content=u"bravo")
    w.add_document(id=u"charlie", content=u"charlie")
    w.add_document(id=u"delta", content=u"delta")
    w.commit()

    s = ix.searcher()
    r = s.search(query.Term("content", u"bravo"))
    self.assertEqual(len(r), 1)
    self.assertEqual(r[0]["id"], "bravo")

    w = ix.writer()
    w.add_document(id=u"echo", content=u"echo")
    w.commit()
    self.assertEqual(len(ix.segments), 1)

    s = ix.searcher()
    r = s.search(query.Term("content", u"bravo"))
    self.assertEqual(len(r), 1)
    self.assertEqual(r[0]["id"], "bravo")

def test_frequency_text(self):
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(content=u"alfa bravo charlie delta echo")
    w.add_document(content=u"bravo bravo bravo bravo charlie delta delta")
    w.add_document(content=u"delta echo foxtrot")
    w.commit()

    tr = ix.reader()
    self.assertEqual(tr.doc_frequency("content", u"bravo"), 2)
    self.assertEqual(tr.frequency("content", u"bravo"), 5)
    self.assertEqual(tr.doc_frequency("content", u"echo"), 2)
    self.assertEqual(tr.frequency("content", u"echo"), 2)
    self.assertEqual(tr.doc_frequency("content", u"alfa"), 1)
    self.assertEqual(tr.frequency("content", u"alfa"), 1)
    self.assertEqual(tr.doc_frequency("content", u"delta"), 3)
    self.assertEqual(tr.frequency("content", u"delta"), 4)
    self.assertEqual(tr.doc_frequency("content", u"foxtrot"), 1)
    self.assertEqual(tr.frequency("content", u"foxtrot"), 1)
    self.assertEqual(tr.doc_frequency("content", u"zulu"), 0)
    self.assertEqual(tr.frequency("content", u"zulu"), 0)
    self.assertEqual(list(tr), [(0, u"alfa", 1, 1), (0, u"bravo", 2, 5),
                                (0, u"charlie", 2, 2), (0, u"delta", 3, 4),
                                (0, u"echo", 2, 2), (0, u"foxtrot", 1, 1)])
    tr.close()

def test_frequency_keyword(self):
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(content=u"A B C D E")
    w.add_document(content=u"B B B B C D D")
    w.add_document(content=u"D E F")
    w.commit()

    tr = ix.reader()
    self.assertEqual(tr.doc_frequency("content", u"B"), 2)
    self.assertEqual(tr.frequency("content", u"B"), 5)
    self.assertEqual(tr.doc_frequency("content", u"E"), 2)
    self.assertEqual(tr.frequency("content", u"E"), 2)
    self.assertEqual(tr.doc_frequency("content", u"A"), 1)
    self.assertEqual(tr.frequency("content", u"A"), 1)
    self.assertEqual(tr.doc_frequency("content", u"D"), 3)
    self.assertEqual(tr.frequency("content", u"D"), 4)
    self.assertEqual(tr.doc_frequency("content", u"F"), 1)
    self.assertEqual(tr.frequency("content", u"F"), 1)
    self.assertEqual(tr.doc_frequency("content", u"Z"), 0)
    self.assertEqual(tr.frequency("content", u"Z"), 0)
    self.assertEqual(list(tr), [(0, u"A", 1, 1), (0, u"B", 2, 5),
                                (0, u"C", 2, 2), (0, u"D", 3, 4),
                                (0, u"E", 2, 2), (0, u"F", 1, 1)])
    tr.close()

def test_merged_lengths(self):
    s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                      f2=fields.KEYWORD(stored=True, scorable=True))
    st = RamStorage()
    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(f1=u"A B C", f2=u"X")
    w.add_document(f1=u"B C D E", f2=u"Y Z")
    w.commit()

    w = ix.writer()
    w.add_document(f1=u"A", f2=u"B C D E X Y")
    w.add_document(f1=u"B C", f2=u"X")
    w.commit(NO_MERGE)

    w = ix.writer()
    w.add_document(f1=u"A B X Y Z", f2=u"B C")
    w.add_document(f1=u"Y X", f2=u"A B")
    w.commit(NO_MERGE)

    dr = ix.reader()
    self.assertEqual(dr.stored_fields(0)["f1"], u"A B C")
    self.assertEqual(dr.doc_field_length(0, "f1"), 3)
    self.assertEqual(dr.doc_field_length(2, "f2"), 6)
    self.assertEqual(dr.doc_field_length(4, "f1"), 5)
    dr.close()

class ToolBoxSearch(object):
    """
    Support searching tools in a toolbox. This implementation uses
    the "whoosh" search library.
    """

    def __init__(self, toolbox):
        """
        Create a searcher for `toolbox`.
        """
        self.toolbox = toolbox
        self.build_index()

    def build_index(self):
        self.storage = RamStorage()
        self.index = self.storage.create_index(schema)
        writer = self.index.writer()
        ## TODO: would also be nice to search section headers.
        for id, tool in self.toolbox.tools_by_id.iteritems():
            writer.add_document(id=id, title=to_unicode(tool.name),
                                description=to_unicode(tool.description),
                                help=to_unicode(tool.help))
        writer.commit()

    def search(self, query, return_attribute='id'):
        # Change field boosts for the searcher to place more weight on title
        # and description than on help.
        searcher = self.index.searcher(
            weighting=BM25F(field_B={'title_B': 3, 'description_B': 2, 'help_B': 1}))
        # Set query to search title, description, and help.
        parser = MultifieldParser(['title', 'description', 'help'], schema=schema)
        results = searcher.search(parser.parse(query))
        return [result[return_attribute] for result in results]

def create_index(cls, app, wh):
    """Creates and opens an index for the given whoosheer and app.
    If the index already exists, it just opens it; otherwise it creates
    it first.

    :param app: The application instance.
    :param wh: The whoosheer instance for which an index should be created.
    """
    # TODO: do we really want/need to use camel casing?
    # everywhere else, there is just .lower()
    if app.extensions['whooshee']['memory_storage']:
        storage = RamStorage()
        index = storage.create_index(wh.schema)
        assert index
        return index
    else:
        index_path = os.path.join(
            app.extensions['whooshee']['index_path_root'],
            getattr(wh, 'index_subdir', cls.camel_to_snake(wh.__name__)))
        if whoosh.index.exists_in(index_path):
            index = whoosh.index.open_dir(index_path)
        else:
            if not os.path.exists(index_path):
                os.makedirs(index_path)
            index = whoosh.index.create_in(index_path, wh.schema)
        return index

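# Hedged usage sketch: assuming the method above is flask-whooshee's
# `Whooshee.create_index` classmethod, the memory branch can be exercised
# with minimal stand-ins for the app and whoosheer (all names illustrative).
from flask_whooshee import Whooshee
from whoosh.fields import ID, TEXT, Schema

class FakeApp(object):
    extensions = {'whooshee': {'memory_storage': True,
                               'index_path_root': '/tmp/whooshee'}}

class UserWhoosheer(object):
    schema = Schema(id=ID(stored=True), name=TEXT)

index = Whooshee.create_index(FakeApp(), UserWhoosheer)  # RAM-backed index
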
def test_term_inspection():
    schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    writer = ix.writer()
    writer.add_document(title=u("My document"),
                        content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE"))
    writer.add_document(title=u("My other document"),
                        content=u("AA AB BB CC EE EE AX AX DD"))
    writer.commit()

    reader = ix.reader()
    assert " ".join(reader.field_terms("content")) == "aa ab ax bb cc dd ee"
    assert list(reader.expand_prefix("content", "a")) == [b('aa'), b('ab'), b('ax')]
    assert set(reader.all_terms()) == set([('content', b('aa')), ('content', b('ab')),
                                           ('content', b('ax')), ('content', b('bb')),
                                           ('content', b('cc')), ('content', b('dd')),
                                           ('content', b('ee')), ('title', b('document')),
                                           ('title', b('my')), ('title', b('other'))])

    # (text, doc_freq, index_freq)
    assert _fstats(reader.iter_field("content")) == [(b('aa'), 2, 6), (b('ab'), 1, 1),
                                                     (b('ax'), 1, 2), (b('bb'), 2, 5),
                                                     (b('cc'), 2, 3), (b('dd'), 2, 2),
                                                     (b('ee'), 2, 4)]
    assert _fstats(reader.iter_field("content", prefix="c")) == [(b('cc'), 2, 3),
                                                                 (b('dd'), 2, 2),
                                                                 (b('ee'), 2, 4)]
    assert list(reader.most_frequent_terms("content")) == [(6, b('aa')), (5, b('bb')),
                                                           (4, b('ee')), (3, b('cc')),
                                                           (2, b('dd'))]
    assert list(reader.most_frequent_terms("content", prefix="a")) == [(6, b('aa')),
                                                                       (2, b('ax')),
                                                                       (1, b('ab'))]
    assert list(reader.most_distinctive_terms("content", 3)) == [(1.3862943611198906, b('ax')),
                                                                 (0.6931471805599453, b('ab')),
                                                                 (0.0, b('ee'))]

def test_intersection():
    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(key=u("a"), value=u("alpha bravo charlie delta"))
    w.add_document(key=u("b"), value=u("echo foxtrot alpha bravo"))
    w.add_document(key=u("c"), value=u("charlie delta golf hotel"))
    w.commit()

    w = ix.writer()
    w.add_document(key=u("d"), value=u("india alpha bravo charlie"))
    w.add_document(key=u("e"), value=u("delta bravo india bravo"))
    w.commit()

    with ix.searcher() as s:
        q = And([Term("value", u("bravo")), Term("value", u("delta"))])
        m = q.matcher(s)
        assert _keys(s, m.all_ids()) == ["a", "e"]

        q = And([Term("value", u("bravo")), Term("value", u("alpha"))])
        m = q.matcher(s)
        assert _keys(s, m.all_ids()) == ["a", "b", "d"]

def test_merged():
    sc = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(sc)
    w = ix.writer()
    w.add_document(id=u("alfa"), content=u("alfa"))
    w.add_document(id=u("bravo"), content=u("bravo"))
    w.add_document(id=u("charlie"), content=u("charlie"))
    w.add_document(id=u("delta"), content=u("delta"))
    w.commit()

    with ix.searcher() as s:
        r = s.search(Term("content", u("bravo")))
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], "bravo")

    w = ix.writer()
    w.add_document(id=u("echo"), content=u("echo"))
    w.commit()
    assert_equal(len(ix._segments()), 1)

    with ix.searcher() as s:
        r = s.search(Term("content", u("bravo")))
        assert_equal(len(r), 1)
        assert_equal(r[0]["id"], "bravo")

def test_weighting():
    from whoosh.scoring import Weighting, BaseScorer
    schema = fields.Schema(id=fields.ID(stored=True),
                           n_comments=fields.STORED)
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(id=u("1"), n_comments=5)
    w.add_document(id=u("2"), n_comments=12)
    w.add_document(id=u("3"), n_comments=2)
    w.add_document(id=u("4"), n_comments=7)
    w.commit()

    # Fake Weighting implementation
    class CommentWeighting(Weighting):
        def scorer(self, searcher, fieldname, text, qf=1):
            return self.CommentScorer(searcher.stored_fields)

        class CommentScorer(BaseScorer):
            def __init__(self, stored_fields):
                self.stored_fields = stored_fields

            def score(self, matcher):
                ncomments = self.stored_fields(matcher.id()).get("n_comments", 0)
                return ncomments

    with ix.searcher(weighting=CommentWeighting()) as s:
        q = TermRange("id", u("1"), u("4"), constantscore=False)
        r = s.search(q)
        ids = [fs["id"] for fs in r]
        assert_equal(ids, ["2", "4", "1", "3"])

class ToolBoxSearch( object ):
    """
    Support searching tools in a toolbox. This implementation uses
    the Whoosh search library.
    """

    def __init__( self, toolbox, index_help=True ):
        """
        Create a searcher for `toolbox`.
        """
        self.toolbox = toolbox
        self.build_index( index_help )

    def build_index( self, index_help=True ):
        log.debug( 'Starting to build toolbox index.' )
        self.storage = RamStorage()
        self.index = self.storage.create_index( schema )
        writer = self.index.writer()
        for id, tool in self.toolbox.tools():
            # Do not add data managers to the public index.
            if tool.tool_type == 'manage_data':
                continue
            add_doc_kwds = {
                "id": id,
                "name": to_unicode( tool.name ),
                "description": to_unicode( tool.description ),
                "section": to_unicode( tool.get_panel_section()[1] if len( tool.get_panel_section() ) == 2 else '' ),
                "help": to_unicode( "" )
            }
            if tool.labels:
                add_doc_kwds['labels'] = to_unicode( " ".join( tool.labels ) )
            if index_help and tool.help:
                try:
                    add_doc_kwds['help'] = to_unicode( tool.help.render( host_url="", static_path="" ) )
                except Exception:
                    # Don't fail to build the index just because a help
                    # message won't render.
                    pass
            writer.add_document( **add_doc_kwds )
        writer.commit()
        log.debug( 'Toolbox index finished.' )

    def search( self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_help_boost, tool_search_limit ):
        """
        Perform the search on the in-memory index, weighting in the given boosts.
        """
        # Change field boosts for the searcher.
        searcher = self.index.searcher(
            weighting=BM25F( field_B={ 'name_B': float( tool_name_boost ),
                                       'section_B': float( tool_section_boost ),
                                       'description_B': float( tool_description_boost ),
                                       'help_B': float( tool_help_boost ) } ) )
        # Set query to search name, description, section, help, and labels.
        parser = MultifieldParser( [ 'name', 'description', 'section', 'help', 'labels' ], schema=schema )
        # Perform the search.
        hits = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
        return [ hit[ 'id' ] for hit in hits ]

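# Standalone sketch of weighting one field above another, as the class above
# intends. Note that in whoosh the `*_B` keyword arguments to BM25F tune
# per-field length normalization; an explicit per-field boost can instead be
# set with `field_boost` on the field itself, as below. Names and data here
# are illustrative.
from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import MultifieldParser

schema = Schema(id=ID(stored=True),
                name=TEXT(field_boost=3.0),  # name matches score higher
                help=TEXT(field_boost=1.0))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(id=u"bwa", name=u"bwa read aligner", help=u"maps short reads")
w.add_document(id=u"report", name=u"report tool", help=u"summarize aligner output")
w.commit()

with ix.searcher() as s:  # default BM25F weighting
    parser = MultifieldParser(["name", "help"], schema=ix.schema)
    hits = s.search(parser.parse(u"aligner"))
    print([hit["id"] for hit in hits])  # the name match should rank first
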
def test_writer_delete(self):
    s = fields.Schema(key=fields.ID(stored=True), value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(key=u"1", value=u"alfa")
    w.add_document(key=u"2", value=u"bravo")
    w.add_document(key=u"3", value=u"charlie")
    w.commit()

    s = ix.searcher()
    self.assertEqual(s.document(key=u"1")["value"], "alfa")
    self.assertEqual(s.document(key=u"2")["value"], "bravo")
    self.assertEqual(s.document(key=u"3")["value"], "charlie")
    s.close()

    from whoosh.filedb.filewriting import OPTIMIZE
    w = ix.writer()
    w.delete_by_term("key", u"2")
    w.commit(OPTIMIZE)

    s = ix.searcher()
    self.assertEqual(s.document(key=u"1")["value"], "alfa")
    self.assertEqual(s.document(key=u"3")["value"], "charlie")
    self.assertEqual(list(s.reader().lexicon("key")), ["1", "3"])
    s.close()

def test_intersection(self):
    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(key=u"a", value=u"alpha bravo charlie delta")
    w.add_document(key=u"b", value=u"echo foxtrot alpha bravo")
    w.add_document(key=u"c", value=u"charlie delta golf hotel")
    w.commit()

    w = ix.writer()
    w.add_document(key=u"d", value=u"india alpha bravo charlie")
    w.add_document(key=u"e", value=u"delta bravo india bravo")
    w.commit()

    searcher = ix.searcher()
    q = And([Term("value", u"bravo"), Term("value", u"delta")])
    sc = q.scorer(searcher)
    self.assertEqual(self._keys(searcher, sc.all_ids()), ["a", "e"])

    q = And([Term("value", u"bravo"), Term("value", u"alpha")])
    sc = q.scorer(searcher)
    self.assertEqual(self._keys(searcher, sc.all_ids()), ["a", "b", "d"])

def test_datetime():
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)

def test_term_inspection():
    schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    writer = ix.writer()
    writer.add_document(title=u("My document"),
                        content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE"))
    writer.add_document(title=u("My other document"),
                        content=u("AA AB BB CC EE EE AX AX DD"))
    writer.commit()

    reader = ix.reader()
    assert_equal(list(reader.lexicon("content")),
                 [u('aa'), u('ab'), u('ax'), u('bb'), u('cc'), u('dd'), u('ee')])
    assert_equal(list(reader.expand_prefix("content", "a")),
                 [u('aa'), u('ab'), u('ax')])
    assert (set(reader.all_terms())
            == set([('content', u('aa')), ('content', u('ab')),
                    ('content', u('ax')), ('content', u('bb')),
                    ('content', u('cc')), ('content', u('dd')),
                    ('content', u('ee')), ('title', u('document')),
                    ('title', u('my')), ('title', u('other'))]))

    # (text, doc_freq, index_freq)
    assert_equal(_fstats(reader.iter_field("content")),
                 [(u('aa'), 2, 6), (u('ab'), 1, 1), (u('ax'), 1, 2),
                  (u('bb'), 2, 5), (u('cc'), 2, 3), (u('dd'), 2, 2),
                  (u('ee'), 2, 4)])
    assert_equal(_fstats(reader.iter_field("content", prefix="c")),
                 [(u('cc'), 2, 3), (u('dd'), 2, 2), (u('ee'), 2, 4)])
    assert_equal(list(reader.most_frequent_terms("content")),
                 [(6, u('aa')), (5, u('bb')), (4, u('ee')), (3, u('cc')),
                  (2, u('dd'))])
    assert_equal(list(reader.most_frequent_terms("content", prefix="a")),
                 [(6, u('aa')), (2, u('ax')), (1, u('ab'))])

def test_frequency_keyword():
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(content=u("A B C D E"))
    w.add_document(content=u("B B B B C D D"))
    w.add_document(content=u("D E F"))
    w.commit()

    with ix.reader() as tr:
        assert tr.doc_frequency("content", u("B")) == 2
        assert tr.frequency("content", u("B")) == 5
        assert tr.doc_frequency("content", u("E")) == 2
        assert tr.frequency("content", u("E")) == 2
        assert tr.doc_frequency("content", u("A")) == 1
        assert tr.frequency("content", u("A")) == 1
        assert tr.doc_frequency("content", u("D")) == 3
        assert tr.frequency("content", u("D")) == 4
        assert tr.doc_frequency("content", u("F")) == 1
        assert tr.frequency("content", u("F")) == 1
        assert tr.doc_frequency("content", u("Z")) == 0
        assert tr.frequency("content", u("Z")) == 0

        stats = [(fname, text, ti.doc_frequency(), ti.weight())
                 for (fname, text), ti in tr]
        assert stats == [("content", b("A"), 1, 1), ("content", b("B"), 2, 5),
                         ("content", b("C"), 2, 2), ("content", b("D"), 3, 4),
                         ("content", b("E"), 2, 2), ("content", b("F"), 1, 1)]

def test_stored_fields():
    s = fields.Schema(a=fields.ID(stored=True), b=fields.STORED,
                      c=fields.KEYWORD, d=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(s)
    writer = ix.writer()
    writer.add_document(a=u("1"), b="a", c=u("zulu"), d=u("Alfa"))
    writer.add_document(a=u("2"), b="b", c=u("yankee"), d=u("Bravo"))
    writer.add_document(a=u("3"), b="c", c=u("xray"), d=u("Charlie"))
    writer.commit()

    with ix.searcher() as sr:
        assert sr.stored_fields(0) == {"a": u("1"), "b": "a", "d": u("Alfa")}
        assert sr.stored_fields(2) == {"a": u("3"), "b": "c", "d": u("Charlie")}
        assert sr.document(a=u("1")) == {"a": u("1"), "b": "a", "d": u("Alfa")}
        assert sr.document(a=u("2")) == {"a": u("2"), "b": "b", "d": u("Bravo")}

def test_frequency_text():
    s = fields.Schema(content=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)
    w = ix.writer()
    w.add_document(content=u("alfa bravo charlie delta echo"))
    w.add_document(content=u("bravo bravo bravo bravo charlie delta delta"))
    w.add_document(content=u("delta echo foxtrot"))
    w.commit()

    with ix.reader() as tr:
        assert tr.doc_frequency("content", u("bravo")) == 2
        assert tr.frequency("content", u("bravo")) == 5
        assert tr.doc_frequency("content", u("echo")) == 2
        assert tr.frequency("content", u("echo")) == 2
        assert tr.doc_frequency("content", u("alfa")) == 1
        assert tr.frequency("content", u("alfa")) == 1
        assert tr.doc_frequency("content", u("delta")) == 3
        assert tr.frequency("content", u("delta")) == 4
        assert tr.doc_frequency("content", u("foxtrot")) == 1
        assert tr.frequency("content", u("foxtrot")) == 1
        assert tr.doc_frequency("content", u("zulu")) == 0
        assert tr.frequency("content", u("zulu")) == 0

        stats = [(fname, text, ti.doc_frequency(), ti.weight())
                 for (fname, text), ti in tr]
        assert stats == [("content", b("alfa"), 1, 1), ("content", b("bravo"), 2, 5),
                         ("content", b("charlie"), 2, 2), ("content", b("delta"), 3, 4),
                         ("content", b("echo"), 2, 2), ("content", b("foxtrot"), 1, 1)]

def _create_index():
    s = fields.Schema(f1=fields.KEYWORD(stored=True),
                      f2=fields.KEYWORD,
                      f3=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)
    return ix

class MemoryCodec(base.Codec):
    def __init__(self):
        from whoosh.filedb.filestore import RamStorage
        self.storage = RamStorage()
        self.segment = MemSegment(self, "blah")

    def writer(self, schema):
        ix = self.storage.create_index(schema)
        return MemWriter(ix, _lk=False, codec=self,
                         docbase=self.segment._doccount)

    def reader(self, schema):
        return SegmentReader(self.storage, schema, self.segment, codec=self)

    def per_document_writer(self, storage, segment):
        return MemPerDocWriter(self.storage, self.segment)

    def field_writer(self, storage, segment):
        return MemFieldWriter(self.storage, self.segment)

    def per_document_reader(self, storage, segment):
        return MemPerDocReader(self.storage, self.segment)

    def terms_reader(self, storage, segment):
        return MemTermsReader(self.storage, self.segment)

    def new_segment(self, storage, indexname):
        return self.segment

def test_random_intersections(self):
    vals = [u"alpha", u"bravo", u"charlie", u"delta", u"echo", u"foxtrot",
            u"golf", u"hotel", u"india", u"juliet", u"kilo", u"lima", u"mike"]
    segments = 5
    docsperseg = 50
    fieldlimits = (3, 10)
    documents = []

    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    # Write several segments of documents containing random words from vals.
    docnum = 0
    for segnum in xrange(segments):
        w = ix.writer()
        for d in xrange(docsperseg):
            doc = u" ".join(choice(vals)
                            for _ in xrange(randint(*fieldlimits)))
            w.add_document(key=unicode(docnum), value=doc)
            documents.append((str(docnum), doc))
            docnum += 1
        w.commit()
    self.assertNotEqual(len(ix.segments), 1)

    testcount = 50
    testlimits = (2, 5)
    searcher = ix.searcher()
    for testnum in xrange(testcount):
        # Sample a random set of target words and find the matching
        # documents by brute force.
        matches = []
        while not matches:
            targets = sample(vals, randint(*testlimits))
            for docnum, doc in documents:
                if all((doc.find(target) > -1) for target in targets):
                    matches.append(docnum)
        matches.sort()

        # The intersection scorer must return the same ids whether iterated
        # via all_ids() or stepped through manually, and must agree with the
        # brute-force matches.
        q = And([Term("value", target) for target in targets])
        sc = q.scorer(searcher)
        ids1 = list(sc.all_ids())
        sc.reset()
        ids2 = []
        while sc.id is not None:
            ids2.append(sc.id)
            sc.next()
        self.assertEqual(ids1, ids2)

        keys = self._keys(searcher, ids1)
        self.assertEqual(keys, matches)

class ToolBoxSearch( object ):
    """
    Support searching tools in a toolbox. This implementation uses
    the Whoosh search library.
    """

    def __init__( self, toolbox, index_help=True ):
        """
        Create a searcher for `toolbox`.
        """
        self.toolbox = toolbox
        self.build_index( index_help )

    def build_index( self, index_help=True ):
        log.debug( 'Starting to build toolbox index.' )
        self.storage = RamStorage()
        self.index = self.storage.create_index( schema )
        writer = self.index.writer()
        for id, tool in self.toolbox.tools():
            # Do not add data managers to the public index.
            if tool.tool_type == 'manage_data':
                continue
            add_doc_kwds = {
                "id": id,
                "name": to_unicode( tool.name ),
                "description": to_unicode( tool.description ),
                "section": to_unicode( tool.get_panel_section()[1] if len( tool.get_panel_section() ) == 2 else '' ),
                "help": to_unicode( "" )
            }
            if index_help and tool.help:
                try:
                    add_doc_kwds['help'] = to_unicode( tool.help.render( host_url="", static_path="" ) )
                except Exception:
                    # Don't fail to build the index just because a help
                    # message won't render.
                    pass
            writer.add_document( **add_doc_kwds )
        writer.commit()
        log.debug( 'Toolbox index finished.' )

    def search( self, q, tool_name_boost, tool_section_boost, tool_description_boost, tool_help_boost, tool_search_limit ):
        """
        Perform the search on the in-memory index, weighting in the given boosts.
        """
        # Change field boosts for the searcher.
        searcher = self.index.searcher(
            weighting=BM25F( field_B={ 'name_B': float( tool_name_boost ),
                                       'section_B': float( tool_section_boost ),
                                       'description_B': float( tool_description_boost ),
                                       'help_B': float( tool_help_boost ) } ) )
        # Set query to search name, description, section, and help.
        parser = MultifieldParser( [ 'name', 'description', 'section', 'help' ], schema=schema )
        # Perform the search.
        hits = searcher.search( parser.parse( '*' + q + '*' ), limit=float( tool_search_limit ) )
        return [ hit[ 'id' ] for hit in hits ]

def create_tenant_schema(tenants):
    tenant_schema = Schema(name=TEXT(stored=True), id=NUMERIC(stored=True))
    tenant_storage = RamStorage()
    tenant_ix = tenant_storage.create_index(tenant_schema)
    tenant_writer = tenant_ix.writer()
    for t in tenants:
        tenant_writer.add_document(id=t["id"], name=t["name"].lower())
    tenant_writer.commit()
    return tenant_ix

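# Hedged usage sketch for the helper above; the tenant data is illustrative.
from whoosh.qparser import QueryParser

tenant_ix = create_tenant_schema([{"id": 1, "name": "Acme"},
                                  {"id": 2, "name": "Globex"}])
with tenant_ix.searcher() as searcher:
    q = QueryParser("name", schema=tenant_ix.schema).parse(u"acme")
    # Names were lowercased at index time, so this matches the "Acme" tenant.
    print([(hit["id"], hit["name"]) for hit in searcher.search(q)])
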
def __init__(self):
    tenants = Tenant.objects.all().order_by('name')
    tenant_schema = Schema(name=TEXT(stored=True), id=NUMERIC(stored=True))
    tenant_storage = RamStorage()
    tenant_ix = tenant_storage.create_index(tenant_schema)
    tenant_writer = tenant_ix.writer()
    for t in tenants:
        tenant_writer.add_document(id=t.id, name=t.name.lower())
    tenant_writer.commit()
    self.index = tenant_ix

class WhooshGuess(object):
    def __init__(self):
        self.storage = RamStorage()
        schema = Schema(key=ID(stored=True),
                        ask=BOOLEAN(stored=True),
                        content=TEXT(stored=True, analyzer=RegexTokenizer()))
        self.ix = self.storage.create_index(schema)
        self.writer = self.ix.writer()
        self.is_train = False
        for s in greeting.split('\n'):
            self.train(u'matchinggreeting', s)

    @property
    def is_ok(self):
        return self.is_train

    def train(self, key, line):
        splits = u' '.join(list(lang.tokenizezh(line)))
        ask = lang.is_question(key)
        self.writer.add_document(key=key, content=splits, ask=ask)

    def train_ok(self):
        self.writer.commit(optimize=True)
        self.searcher = self.ix.searcher()
        self.parser = QueryParser("content", schema=self.ix.schema)
        self.is_train = True

    def guess(self, s, is_ask=None):
        assert self.is_train
        keys = list(lang.keyword(s))
        if len(keys) == 0:
            return ''
        # MUST contain the keys.
        keys = u' '.join(keys)
        splits = u' '.join(list(lang.tokenizezh(s)))
        q1 = self.parser.parse(keys)
        q2 = self.parser.parse(splits)
        q = q1 | q2
        if not is_ask:
            ask = query.Term(u"ask", lang.is_question(s))
        else:
            ask = query.Term(u"ask", is_ask)
        results = self.searcher.search(q, filter=ask)
        for hit in results:
            return hit['key']
        return ''