def _parse_and_search(
        self,
        searcher: Searcher,
        content: str,
        value: str,
        limit: Optional[int] = None,
        terms: bool = False,
        group: Type[Union[AndGroup, OrGroup]] = OrGroup) -> Results:
    parser = QueryParser(content, self._ix.schema, group=group)
    parser.remove_plugin_class(FieldsPlugin)
    parser.remove_plugin_class(WildcardPlugin)
    query = parser.parse(value)
    return searcher.search(query, terms=terms, limit=limit)
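A minimal sketch (schema and names are illustrative, not from the method above) of what removing FieldsPlugin and WildcardPlugin buys you: "field:" prefixes and wildcard characters stop being query syntax and are treated as plain text.

from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser, FieldsPlugin, WildcardPlugin

schema = Schema(title=TEXT, content=TEXT)
parser = QueryParser("content", schema)
print(parser.parse(u"title:whoosh"))   # Term('title', 'whoosh') -- "title:" is field syntax

parser.remove_plugin_class(FieldsPlugin)
parser.remove_plugin_class(WildcardPlugin)
# "title:" is now literal text analyzed against the default field; with the
# default analyzer this becomes roughly And([Term('content', 'title'), Term('content', 'whoosh')])
print(parser.parse(u"title:whoosh"))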
def search_index(words):
    xg_duanluo = []
    with ix.searcher() as s:
        qp = QueryParser('duanluo', schema=ix.schema, group=qparser.OrGroup)
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())
        for word in words:
            q = qp.parse(u'{}'.format(word))
            results = s.search(q, limit=10)
            for i in results:
                xg_duanluo.append((i['id'], i['duanluo']))
    return xg_duanluo
def basic_search(query, query_parse, group=default_group, facet=default_facet, index=default_index):
    searcher = index.searcher()
    parser = QueryParser(query_parse, index.schema, group=group)
    # Configure plugins before parsing, or they have no effect on the query
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())
    parser.add_plugin(qparser.FuzzyTermPlugin())
    myquery = parser.parse(query)
    # limit caps the number of search results (the default is 10); see the
    # official docs linked at the top of this post
    results = searcher.search(myquery, limit=None, sortedby=facet)
    print(results)
    return results
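A hedged usage sketch for basic_search: default_group, default_facet, and default_index are module-level defaults the excerpt doesn't show, so everything is passed explicitly here; whoosh.sorting.FieldFacet is one kind of facet that search() accepts for sortedby, assuming "ix" is an open index with a sortable "title" field.

from whoosh import qparser, sorting

results = basic_search(u"documnt~",   # fuzzy query, enabled by FuzzyTermPlugin
                       u"content",
                       group=qparser.OrGroup,
                       facet=sorting.FieldFacet("title"),
                       index=ix)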
def search_index(words):
    xg_words = []
    with ix.searcher() as s:
        qp = QueryParser('section', schema=ix.schema, group=qparser.OrGroup)
        # Enable wildcard-style prefix searches
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())
        for word in words:
            q = qp.parse(u'{}'.format(word))
            results = s.search(q, limit=10)
            for i in results:
                xg_words.append(i['section'])
    return xg_words
def search_index(words):
    xg_words = []
    with ix.searcher() as s:
        # group=qparser.OrGroup: a document matching any query term is returned,
        # instead of requiring every term to match
        qp = QueryParser('section', schema=ix.schema, group=qparser.OrGroup)
        # The next two lines enable wildcard-style prefix searches, e.g. "窗前*月光"
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())
        for word in words:
            q = qp.parse(u'%s' % word)
            # limit: how many search results to return
            results = s.search(q, limit=10)
            for i in results:
                xg_words.append(i['section'])
                # print(word, i['section'])
    return xg_words
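The search_index variants above all swap WildcardPlugin for PrefixPlugin. A self-contained sketch (the schema is illustrative) of what that swap changes: a trailing * becomes a Prefix query, while the other wildcard characters lose their special meaning.

from whoosh import qparser
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser

schema = Schema(section=TEXT(stored=True))
qp = QueryParser('section', schema=schema, group=qparser.OrGroup)
qp.remove_plugin_class(qparser.WildcardPlugin)
qp.add_plugin(qparser.PrefixPlugin())

print(qp.parse(u'mail*'))  # Prefix('section', 'mail'): matches mail, mailing, ...
print(qp.parse(u'ma?l'))   # '?' is no longer wildcard syntax, just text to analyze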
def __init__(self, index_dir, var_path):
    self._index = None
    try:
        self._index = wh_index.open_dir(index_dir)
    except wh_index.IndexError:
        raise IndexError
    self._var_reader = self._make_var_reader(var_path)

    op = OperatorsPlugin(
        And=r"\bAND\b|&",
        Or=None,  # r"\bOR\b|\|",
        Not=r"\bNOT\b|\s+-",
        AndMaybe=None,
        Require=None)

    parser = QueryParser('content', _schema,
                         termclass=my_variations(self._var_reader))
    parser.remove_plugin_class(RangePlugin)
    parser.remove_plugin_class(BoostPlugin)
    parser.remove_plugin_class(WildcardPlugin)
    parser.replace_plugin(op)
    self._parser = parser

    parser_wild = QueryParser('content', _schema,
                              termclass=my_variations(self._var_reader))
    parser_wild.remove_plugin_class(RangePlugin)
    parser_wild.remove_plugin_class(BoostPlugin)
    parser_wild.replace_plugin(op)
    self._parser_wild = parser_wild

    op_filter = OperatorsPlugin(And=r"\bAND\b", Or=r"\bOR\b",
                                Not=None, AndMaybe=None, Require=None)
    asf_parser = QueryParser('asfilter', _schema)
    asf_parser.replace_plugin(op_filter)
    self._asf_parser = asf_parser
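A runnable sketch isolating the OperatorsPlugin configuration used above (the schema is illustrative): & becomes an additional AND spelling, OR is disabled entirely, and a term can be negated with a leading -.

from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser, OperatorsPlugin

op = OperatorsPlugin(
    And=r"\bAND\b|&",
    Or=None,
    Not=r"\bNOT\b|\s+-",
    AndMaybe=None,
    Require=None)

parser = QueryParser("content", Schema(content=TEXT))
parser.replace_plugin(op)

print(parser.parse(u"cats & dogs"))   # '&' now parses as AND
print(parser.parse(u"cats -dogs"))    # leading '-' parses as NOT
print(parser.parse(u"cats OR dogs"))  # 'OR' is no longer an operator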
def search_index(words):
    xg_part = []
    with ix.searcher() as s:
        # group=qparser.OrGroup: a document matching any query term is returned,
        # instead of requiring every term to match
        qp = QueryParser('part', schema=ix.schema, group=qparser.OrGroup)
        # The next two lines enable wildcard-style prefix searches, e.g. "窗前*月光"
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())
        # Pick a random number of hits to request
        num = random.randint(3, 7)
        for word in words:
            q = qp.parse(u'%s' % word)
            # limit: how many search results to return
            results = s.search(q, limit=num)
            count = 0
            for i in results:
                if count > 0:  # skip the first hit to avoid matching the source itself
                    xg_part.append((i['pid'], i['part']))
                count += 1
    return xg_part
def predict_TF_IDF(data, docs_per_q):
    # index docs
    exclude = set(string.punctuation)
    res = []
    for idx, row in data.iterrows():
        print(row["id"])
        # get the words of each answer option
        answer_words = {
            "A": set(utils.tokenize(row["answerA"])),
            "B": set(utils.tokenize(row["answerB"])),
            "C": set(utils.tokenize(row["answerC"])),
            "D": set(utils.tokenize(row["answerD"])),
        }
        scores = {key: 0.0 for key in "ABCD"}
        q_punc = row["question"]  # first thing to debug if not working
        question = "".join(ch for ch in q_punc if ch not in exclude)
        qp = QueryParser("content", schema=schema, group=qparser.OrGroup)
        qp.add_plugin(qparser.FuzzyTermPlugin())
        qp.remove_plugin_class(qparser.PhrasePlugin)
        qp.add_plugin(qparser.SequencePlugin())
        q = qp.parse(question)
        # q = qp.parse('physics')
        # cp = qparser.CompoundsPlugin(AndMaybe="&~")
        with ix.searcher() as s, \
                ix.searcher(weighting=scoring.TF_IDF()) as scoring_searcher_tfidf:
            results = s.search(q, limit=docs_per_q)
            """
            u_id = str(uuid.uuid1())
            if not os.path.exists("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id):
                os.mkdir("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id)
            q_ix = index.create_in("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id, schema)
            q_writer = q_ix.writer()
            for document in results:
                q_writer.add_document(article_title=document['article_title'], content=document['content'])
            q_writer.commit()
            """
            # with q_ix.searcher(weighting=scoring.TF_IDF()) as scoring_searcher_tfidf:
            for document in results:
                doc_parser = QueryParser("content", schema=schema)
                doc_q = doc_parser.parse(u"article_title:%s" % document["article_title"])
                # Sum each answer word's TF-IDF score over the retrieved document
                for key, words in answer_words.items():
                    for w in words:
                        try:
                            scores[key] += (scoring.TF_IDF()
                                            .scorer(scoring_searcher_tfidf, "content", w)
                                            .score(doc_q.matcher(scoring_searcher_tfidf)))
                        except TermNotFound:
                            pass
        # Pick the answer with the highest accumulated score
        res.append(max(scores, key=scores.get))
    return res
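The per-term scorer dance above is fairly low-level. A hedged, self-contained sketch of the simpler route: open the searcher with the TF_IDF weighting model so every hit is scored with TF-IDF directly (the index directory and documents here are illustrative).

from whoosh import scoring
from whoosh.fields import Schema, TEXT
from whoosh.index import create_in
from whoosh.qparser import QueryParser

schema = Schema(content=TEXT(stored=True))
ix = create_in("tfidf_demo", schema)  # the directory must already exist
w = ix.writer()
w.add_document(content=u"physics of energy and matter")
w.add_document(content=u"the history of physics")
w.commit()

with ix.searcher(weighting=scoring.TF_IDF()) as s:
    q = QueryParser("content", ix.schema).parse(u"physics")
    for hit in s.search(q, limit=5):
        print(hit.score, hit["content"])  # scores are TF-IDF weights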
from whoosh.index import create_in
from whoosh.fields import *

schema = Schema(title=TEXT(stored=True), content=TEXT)
ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(title=u"First document", content=u"This is the first document we've added!")
writer.add_document(title=u"Second document", content=u"The second one is even more interesting!")
writer.add_document(title=u"Third document", content=u"letter first, stamp second, mail third")
writer.add_document(title=u"Fourth document", content=u"stamp first, mail third")
writer.add_document(title=u"Fifth document", content=u"letter first, mail third")
writer.add_document(title=u"Sixth document", content=u"letters first, stamps second, mail third")
writer.add_document(title=u"Seventh document", content=u"stamp first, letters second, mial third")
writer.commit()

from whoosh.qparser import QueryParser, FuzzyTermPlugin, PhrasePlugin, SequencePlugin

with ix.searcher() as searcher:
    parser = QueryParser(u"content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    parser.remove_plugin_class(PhrasePlugin)
    parser.add_plugin(SequencePlugin())
    query = parser.parse(u"Apple iphone 6")
    print(query)
    results = searcher.search(query)
    print("nb of results =", len(results))
    for r in results:
        print(r)
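The example above wires in FuzzyTermPlugin and SequencePlugin but then parses a plain query, which exercises neither. A sketch of the syntax those two plugins actually add, reusing the index built above (note the seventh document's "mial" typo, which a fuzzy term should still reach):

with ix.searcher() as searcher:
    parser = QueryParser(u"content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    parser.remove_plugin_class(PhrasePlugin)
    parser.add_plugin(SequencePlugin())

    # FuzzyTermPlugin: a trailing ~N allows up to N edits
    print(parser.parse(u"mail~2"))  # fuzzy term within edit distance 2 of "mail"
    # SequencePlugin: with PhrasePlugin removed, quoted groups become ordered
    # sequences whose items may themselves be complex queries, e.g. fuzzy terms
    print(parser.parse(u'"letter~ first"~2'))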
class QAModel:
    def __init__(self, index_path, index_name, model_config):
        self._index_path = None
        self._index_name = None
        self._model_config = None
        self._model = None
        self._whoosh_index = None
        self._query_parser = None

        self.index_path = index_path
        self.index_name = index_name
        self.model_config = model_config

        self.load_model()
        self.load_index()
        self.create_and_prep_query_parser()

    @property
    def index_path(self):
        return self._index_path

    @index_path.setter
    def index_path(self, value):
        if isinstance(value, str):
            self._index_path = value
        else:
            raise ValueError('Index path must be a string')

    @property
    def index_name(self):
        return self._index_name

    @index_name.setter
    def index_name(self, value):
        if isinstance(value, str):
            self._index_name = value
        else:
            raise ValueError('Index name must be a string')

    @property
    def model_config(self):
        return self._model_config

    @model_config.setter
    def model_config(self, value):
        if isinstance(value, str):
            self._model_config = value
        else:
            raise ValueError('Model config must be a string')

    @property
    def model(self):
        return self._model

    @model.setter
    def model(self, value):
        self._model = value

    @property
    def whoosh_index(self):
        return self._whoosh_index

    @whoosh_index.setter
    def whoosh_index(self, value):
        self._whoosh_index = value

    @property
    def query_parser(self):
        return self._query_parser

    @query_parser.setter
    def query_parser(self, value):
        self._query_parser = value

    def load_model(self):
        """Load the model specified by model_config."""
        if self.model_config == 'squad.squad_bert':
            self.model = build_model(configs.squad.squad_bert, download=False)

    def load_index(self):
        """Open the Whoosh index at index_path."""
        self.whoosh_index = whoosh.index.open_dir(self.index_path, indexname=self.index_name)

    def create_and_prep_query_parser(self):
        self.query_parser = QueryParser('text', schema=self.whoosh_index.schema, group=OrGroup)
        # Set up for natural language queries
        self.query_parser.remove_plugin_class(WildcardPlugin)

    def answer_question(self, input_query):
        """Retrieve the best-matching passage and run the QA model over it."""
        with self.whoosh_index.searcher() as searcher:
            parsed_query = self.query_parser.parse(input_query)
            search_results = searcher.search(parsed_query, limit=1)
            top_hit = [hit['text'] for hit in search_results][0]
        return self.model([top_hit], [input_query])
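A hypothetical driver for QAModel, assuming a Whoosh index with a stored 'text' field already exists at the given path and that DeepPavlov's squad_bert config is installed; the path and index name here are illustrative.

qa = QAModel(index_path='qa_index',
             index_name='wiki_passages',
             model_config='squad.squad_bert')
answer = qa.answer_question('When was the first Whoosh release?')
print(answer)  # the DeepPavlov squad models return the answer span plus position/score info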
from tempfile import TemporaryDirectory

from whoosh import qparser
from whoosh.analysis import RegexTokenizer
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in
from whoosh.util.text import rcompile

tmp_dir = TemporaryDirectory()
schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))))
ix = create_in(tmp_dir.name, schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a", content=u"this/is/a/test.html")
writer.add_document(title=u"Second document", path=u"/b", content=u"this/is/a/hello.html hello a yup")
writer.add_document(title=u"Second document", path=u"/b", content=u"this is a hello.html hello a yup")
writer.commit()

from whoosh.qparser import QueryParser

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    parser.remove_plugin_class(qparser.PhrasePlugin)
    # parser.add_plugin(qparser.SequencePlugin("[\w/.]+"))
    query = parser.parse('this/is/a/test.html')
    print(query)
    results = searcher.search(query)
    print(results)
    print(results[0])
    # print(results[1])

if __name__ == '__main__':
    print('yup')
    # sleep(20)
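The RegexTokenizer above is the key trick here: the default analyzer would split "this/is/a/test.html" on the slashes and the dot, while the pattern [\w/.]+ keeps whole path-like strings as single terms. A quick illustration of the tokenizer on its own:

from whoosh.analysis import RegexTokenizer

tok = RegexTokenizer(expression=r"[\w/.]+")
print([t.text for t in tok(u"this/is/a/test.html hello yup")])
# ['this/is/a/test.html', 'hello', 'yup']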
# schema: a whoosh.fields.Schema object to use when parsing. The appropriate fields
# in the schema will be used to tokenize terms/phrases before they are turned into
# query objects. You can pass None for the schema to create a parser that does not
# analyze the text of the query, usually for testing purposes.
import whoosh.qparser.plugins
from whoosh import qparser
from whoosh.qparser import QueryParser

parser = QueryParser("content", ix.schema)  # ix.schema and schema are the same thing here

print(len(parser.plugins), parser.plugins)
# 11
# [<whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.SingleQuotePlugin>,
#  <whoosh.qparser.plugins.FieldsPlugin>, <whoosh.qparser.plugins.WildcardPlugin>, <whoosh.qparser.plugins.PhrasePlugin>,
#  <whoosh.qparser.plugins.RangePlugin>, <whoosh.qparser.plugins.GroupPlugin>, <whoosh.qparser.plugins.OperatorsPlugin>,
#  <whoosh.qparser.plugins.BoostPlugin>, <whoosh.qparser.plugins.EveryPlugin>]

# default_set(): returns the default list of plugins to use.
print(len(parser.default_set()), parser.default_set())
# 10
# [<whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.SingleQuotePlugin>, <whoosh.qparser.plugins.FieldsPlugin>,
#  <whoosh.qparser.plugins.WildcardPlugin>, <whoosh.qparser.plugins.PhrasePlugin>, <whoosh.qparser.plugins.RangePlugin>,
#  <whoosh.qparser.plugins.GroupPlugin>, <whoosh.qparser.plugins.OperatorsPlugin>, <whoosh.qparser.plugins.BoostPlugin>,
#  <whoosh.qparser.plugins.EveryPlugin>]

# remove_plugin_class() changes parser.plugins but not the default set
parser.remove_plugin_class(whoosh.qparser.plugins.WildcardPlugin)
print(len(parser.plugins), len(parser.default_set()))  # 10 10

parser.add_plugin(qparser.PrefixPlugin())  # add an *instance* of the plugin, not the class
print(len(parser.plugins), len(parser.default_set()))  # 11 10

# parse(text, normalize=True, debug=False): parses the input string and returns a
# whoosh.query.Query object/tree.
query = parser.parse('document')

# Query method 1:
# search(q, **kwargs): runs a whoosh.query.Query object on this searcher and returns
# a Results object. See the "searching" chapter of the docs for more information.
searcher = ix.searcher()
results = searcher.search(query)  # find documents whose "content" contains "document"
print(results)        # <Top 1 Results for Term('content', 'document') runtime=0.0015511049998622184>
print(type(results))  # <class 'whoosh.searching.Results'>

# Query method 2: the two calls above are one way to query; the single call below also works.
# find(defaultfield, querystring, **kwargs)
results = searcher.find("title", "document")  # find documents whose title contains "document"
print(results)  # <Top 2 Results for Term('title', 'document') runtime=0.0008875329999682435>
# Method 1: open an index through a FileStorage object
from whoosh.filedb.filestore import FileStorage

storage = FileStorage('index')  # 'index' is the index directory path
idx1 = storage.open_index(indexname='idx1')

# Method 2: use the open_dir function
from whoosh import index
from whoosh.index import open_dir

idx2 = open_dir('index', indexname='idx2')  # indexname is the name of the index
print(index.exists_in('index', indexname='idx2'))

from whoosh.qparser import QueryParser, MultifieldParser, OrGroup, FieldsPlugin

og = OrGroup.factory(0.9)
qp = QueryParser("content", schema=idx1.schema)  # group=OrGroup
qp.remove_plugin_class(FieldsPlugin)
q = qp.parse("reset")
print(q)

# mqp = MultifieldParser(["title", "content"], schema=idx1.schema)
# mq = mqp.parse(u"many only")
#
# from whoosh.query import *
# myquery = And([Term("title", u"third"), q])
# # myquery = Term("title", u"ird")
# print(myquery)

searcher = idx1.searcher()
r = searcher.search(q=q, limit=None)
print(len(r))
for hit in r:
    t = dict(hit)
    print(hit)
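One loose end in the snippet above: og = OrGroup.factory(0.9) is created but never handed to a parser. A hedged sketch of how it would be used; the factory builds an OrGroup variant whose scoring rewards documents that match more of the query terms.

from whoosh.qparser import QueryParser, OrGroup

og = OrGroup.factory(0.9)  # 0.9 is the scaling factor
qp = QueryParser("content", schema=idx1.schema, group=og)
q = qp.parse(u"hard reset")  # matches documents containing either term,
                             # but ranks those containing both higher
print(q)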