def search(request):
    indexNewsObject = IndexNews()
    ix = indexNewsObject.ix
    if request.method == 'POST':
        inputQuery = request.POST['inputQuerySearchPage']
        request.session['inputQuery'] = inputQuery
        if inputQuery == '':
            context = {
                'message': 'لطفا عبارت مورد نظر خود را وارد کنید'  # "Please enter your search term"
            }
            return render(request, 'searchPage/searchPage.html', context=context)
        else:
            # queryParser = QueryParser(fieldname='content', schema=ix.schema, group=OrGroup)
            # queryParser = MultifieldParser(['title', 'content'], schema=ix.schema, group=OrGroup)
            queryParser = MultifieldParser(['title', 'content', 'summary'], schema=ix.schema)
            query = queryParser.parse(inputQuery)
            with ix.searcher(weighting=scoring.BM25F()) as searcher:
                results = searcher.search(query, terms=True, limit=None)
                # Customize the HTML tag used to highlight matched terms.
                htmlFormat = highlight.HtmlFormatter('b')
                results.formatter = htmlFormat
                results.fragmenter.maxchars = 300
                results.fragmenter.surround = 150
                paginator = Paginator(results, 15)
                page = request.GET.get('page')
                resultWithPage = paginator.get_page(page)
                context = {
                    'results': resultWithPage,
                    'inputQuery': inputQuery
                }
                return render(request, 'searchPage/searchPage.html', context=context)
    else:
        inputQuery = request.session['inputQuery']
        # queryParser = QueryParser(fieldname='content', schema=ix.schema, group=OrGroup)
        queryParser = MultifieldParser(['title', 'content', 'summary'], schema=ix.schema)
        query = queryParser.parse(inputQuery)
        with ix.searcher(weighting=scoring.BM25F()) as searcher:
            results = searcher.search(query, terms=True, limit=None)
            # Customize the HTML tag used to highlight matched terms.
            htmlFormat = highlight.HtmlFormatter('b')
            results.formatter = htmlFormat
            results.fragmenter.maxchars = 300
            results.fragmenter.surround = 150
            paginator = Paginator(results, 15)
            page = request.GET.get('page')
            resultWithPage = paginator.get_page(page)
            context = {
                'results': resultWithPage,
                'inputQuery': inputQuery
            }
            return render(request, 'searchPage/searchPage.html', context=context)
def test_maxclasses():
    terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2)
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == ('<b class="match t0">alfa</b> <b class="match t1">bravo</b> '
                     '<b class="match t0">charlie</b>...<b class="match t1">delta</b> '
                     '<b class="match t0">echo</b> foxtrot')
def test_html_format():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == ('alfa <strong class="match term0">bravo</strong> '
                     'charlie...hotel <strong class="match term1">india</strong> juliet')
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(spelling=True), b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("alfa bravo charlie delta"))
    w.add_document(a=u("delta echo foxtrot golf"))
    w.add_document(a=u("golf hotel india juliet"))
    w.add_document(a=u("juliet kilo lima mike"))
    w.commit()

    s = ix.searcher()
    qp = QueryParser("a", ix.schema)
    qtext = u('alpha ("brovo november" OR b:dolta) detail')
    q = qp.parse(qtext, ix.schema)

    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND (a:"bravo november" OR b:dolta) AND a:detail)'
    assert c.string == 'alfa ("brovo november" OR b:dolta) detail'

    qtext = u('alpha b:("brovo november" a:delta) detail')
    q = qp.parse(qtext, ix.schema)
    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
    assert c.string == 'alfa b:("brovo november" a:delta) detail'

    hf = highlight.HtmlFormatter(classname="c")
    assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
def get_html_correction(searcher, query_str, qp):
    exact_qp = QueryParser('exact', my_index.search_schema)
    exact_qp.add_plugin(DateParserPlugin())
    exact_qp = exact_qp.parse(query_str)
    try:
        corrected_query = searcher.correct_query(exact_qp, query_str, prefix=1)
    except Exception:
        return ""
    for token in corrected_query.tokens:
        # Possible Whoosh quirk: the reported span can disagree with the
        # original text (e.g. startchar=8, endchar=9 for original 'tes?').
        if query_str[token.startchar:token.endchar] != token.original:
            return ""
        for variations in (uk_variations, us_variations):
            if token.original in variations and searcher.ixreader.frequency(
                    'exact', variations[token.original]) > 0:
                token.text = variations[token.original]
                break
        # Not sure this branch ever runs, because of the possible quirk above.
        if re.search(r'\W', token.original):
            token.text = token.original
    corrected_query_str = replace_tokens(query_str, corrected_query.tokens)
    corrected_qp = QueryParser('stemmed', my_index.search_schema)
    corrected_qp.add_plugin(DateParserPlugin())
    corrected_qp = corrected_qp.parse(corrected_query_str)
    if corrected_qp == qp:
        return ""
    result = '<h3>Did you mean <a href="{}">{}</a>?</h3>'.format(
        stateful_url_for('search_form', q_query=urlize(corrected_query_str)),
        corrected_query.format_string(highlight.HtmlFormatter(classname="change")))
    return result
def render_results(s, qs, template):
    qp = qparser.QueryParser("content", s.schema)
    qp = qparser.MultifieldParser(["tgrams", "content"], s.schema)

    # Add the DateParserPlugin to the parser
    qp.add_plugin(DateParserPlugin())

    q = qp.parse(qs)
    # Successive calls illustrate plain, sorted, and grouped searches;
    # only the last assignment is kept.
    results = s.search(q, limit=100)
    results = s.search(q, limit=100, sortedby="title", reverse=True)
    results = s.search(q, limit=100, groupedby="chapter")
    q = results.q

    hf = highlight.HtmlFormatter()
    results.highlighter = highlight.Highlighter(formatter=hf)

    qc = None
    if not results:
        corrected = s.correct_query(q, qs, prefix=1)
        if corrected.query != q:
            qc = corrected.format_string(hf)

    def hilite(hit):
        with open(SOURCEDIR + hit["path"], "rb") as hitfile:
            text = hitfile.read().decode("utf-8")
        return hit.highlights("content", text)

    return render_template(template, qs=qs, q=q, results=results,
                           hilite=hilite, corrected=qc, args=request.args)
def test_correct_spell_field():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=u"rendering shading modeling reactions")

        with ix.searcher() as s:
            text = s.schema["text"]
            spell_text = s.schema["spell_text"]

            r = s.reader()
            words = [text.from_bytes(t) for t in r.lexicon("text")]
            assert words == ["model", "reaction", "render", "shade"]

            words = [spell_text.from_bytes(t) for t in r.lexicon("spell_text")]
            assert words == ["modeling", "reactions", "rendering", "shading"]

            qp = QueryParser("text", s.schema)
            qtext = u"renderink"
            q = qp.parse(qtext, s.schema)

            r = s.search(q)
            assert len(r) == 0

            c = s.correct_query(q, qtext)
            assert c.string == "rendering"
            assert c.query == query.Term("text", "rendering")

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">rendering</strong>'
def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(), b=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(a=u"alfa bravo charlie delta")
            w.add_document(a=u"delta echo foxtrot golf")
            w.add_document(a=u"golf hotel india juliet")
            w.add_document(a=u"juliet kilo lima mike")

        with ix.searcher() as s:
            qp = QueryParser("a", ix.schema)
            qtext = u'alpha ("brovo november" OR b:dolta) detail'
            q = qp.parse(qtext, ix.schema)

            c = s.correct_query(q, qtext)
            cq = c.query
            assert isinstance(cq, query.And)
            assert cq[0].text == "alfa"
            assert isinstance(cq[1], query.Or)
            assert isinstance(cq[1][0], query.Phrase)
            assert cq[1][0].words == ["bravo", "november"]

            qtext = u'alpha b:("brovo november" a:delta) detail'
            q = qp.parse(qtext, ix.schema)
            c = s.correct_query(q, qtext)
            assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
            assert c.string == 'alfa b:("brovo november" a:delta) detail'

            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
def create_search_result(self, results):
    # Allow larger fragments
    results.fragmenter.maxchars = 300
    # Show more context before and after
    results.fragmenter.surround = 50
    # Set result formatter
    results.formatter = hl.HtmlFormatter(tagname="mark")

    search_results = []
    for r in results:
        sr = SearchResult()
        sr.score = r.score
        sr.tags = r["tags"]
        sr.path = r["path"]
        sr.content = r["content"]

        highlights = r.highlights("content")
        if not highlights:
            highlights = self.cap(r["content"], 500)

        # unescape
        highlights = self.html_parser.unescape(highlights)
        html = self.markdown(highlights)
        sr.content_highlight = html
        if "headlines" in r:
            sr.headlines = r["headlines"]
        search_results.append(sr)

    return search_results
def test_html_escape():
    terms = frozenset(["bravo"])
    sa = analysis.StandardAnalyzer()
    wf = highlight.WholeFragmenter()
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(u('alfa <bravo "charlie"> delta'), terms,
                                sa, wf, hf)
    assert htext == ('alfa &lt;<strong class="match term0">bravo</strong> '
                     '&quot;charlie&quot;&gt; delta')
def test_query_highlight():
    qp = QueryParser("a", None)
    hf = highlight.HtmlFormatter()

    def do(text, terms):
        q = qp.parse(text)
        tks = [tk for tk in q.all_tokens() if tk.text in terms]
        for tk in tks:
            if tk.startchar is None or tk.endchar is None:
                assert False, tk
        fragment = highlight.Fragment(text, tks)
        return hf.format_fragment(fragment)

    assert do("a b c d", ["b"]) == 'a <strong class="match term0">b</strong> c d'
    assert do('a (x:b OR y:"c d") e', ("b", "c")) == (
        'a (x:<strong class="match term0">b</strong> OR '
        'y:"<strong class="match term1">c</strong> d") e')
def search(query, page=1, per_page=20):
    with index.searcher() as s:
        qp = qparser.MultifieldParser(['title', 'content'], index.schema)
        q = qp.parse(unicode(query))
        try:
            result_page = s.search_page(q, page, pagelen=per_page)
        except ValueError:
            if page == 1:
                return SearchResultPage(None, page)
            return None
        results = result_page.results
        results.highlighter.fragmenter.maxchars = 512
        results.highlighter.fragmenter.surround = 40
        results.highlighter.formatter = highlight.HtmlFormatter(
            'em', classname='search-match', termclass='search-term',
            between=u'<span class=ellipsis> … </span>')
        return SearchResultPage(result_page, page)
def get_search_result(self, kws_query, page=1, page_len=10):
    page -= 1
    res = []
    score_docs = kws_query[1]
    score_docs.fragmenter.maxchars = cfg.max_result_return
    score_docs.fragmenter.surround = cfg.preview_surround_length
    score_docs.formatter = highlight.HtmlFormatter()
    try:
        for i in range(page * page_len, (page + 1) * page_len):
            score_doc = score_docs[i]
            info = self.content_reader.read(score_doc["store_path"])
            if info is None:
                continue
            # Pull the page title out of the raw HTML, falling back to the
            # query text when no <title> tag is present.
            title_start = info['content'].find('<title>')
            title_end = info['content'].find('</title>')
            if title_start != -1 and title_end != -1:
                title = info['content'][title_start + 7:title_end]
            else:
                title = kws_query[0]
            if len(title) > cfg.max_title_length:
                title = title[:cfg.max_title_length] + '.....'
            text = helper.remove_html_js(info['content'])
            class_label = self.bayesData.contextTest(text)
            preview = score_doc.highlights("content", text=text)
            res.append({
                'url': score_doc["url"],
                'title': title,
                'preview': preview,
                'classLable': class_label,
                'snapshot': ''
            })
    except Exception as e:
        print("Get search result failed", e)
    return res
def base_query():
    assert request.path == '/index'
    #print(dict(request.form)["query"][0])
    #print(dict(request.form))
    query_sentence = str(dict(request.form)["query"][0])
    logging.info("Query sentence: %s" % query_sentence)

    res = []
    with ix.searcher() as searcher:
        # Parse the query text. If it ends with a field marker, search that
        # single field; otherwise fall back to multi-field search.
        # highlight_xy marks whether the college ("xueyuan") field should be
        # highlighted; it is by default.
        highlight_xy = True
        # Default: multi-field query.
        query = qparser.MultifieldParser(
            ["content", "title", "mtext", "xueyuan"], ix.schema)
        if query_sentence.endswith("$姓名$"):
            # Query by name.
            query = qparser.SimpleParser("title", ix.schema)
            query_sentence = query_sentence.strip('$姓名$')
        elif query_sentence.endswith("$学院$"):
            # Query by college.
            query = qparser.SimpleParser("xueyuan", ix.schema)
            query_sentence = query_sentence.strip('$学院$')
        elif query_sentence.endswith("$网页$"):
            # Query by page content.
            query = qparser.SimpleParser("content", ix.schema)
            query_sentence = query_sentence.strip('$网页$')
        #print(query_sentence)

        # Register query-parser plugins.
        query.add_plugin(qparser.WildcardPlugin)
        # query.remove_plugin_class(qparser.WildcardPlugin)
        query.add_plugin(qparser.PrefixPlugin())
        query.add_plugin(qparser.OperatorsPlugin)
        query.add_plugin(qparser.RegexPlugin)
        query.add_plugin(qparser.PhrasePlugin)

        # Parse the query.
        q = query.parse(query_sentence)
        logging.info("Query parse result: %s" % str(q))
        print(q)

        # Run the search.
        result = searcher.search(q, limit=20)
        # print(result)

        # Configure fragments: allow larger fragments with more context.
        my_cf = highlight.ContextFragmenter(maxchars=200, surround=30)
        hf = highlight.HtmlFormatter(tagname='em', classname='match',
                                     termclass='term')
        hi = highlight.Highlighter(fragmenter=my_cf, formatter=hf)

        for hit in result:
            print(hit["picpath"])
            print(hit["title"])
            print(escape(hi.highlight_hit(hit, "content")))
            if hit['picpath'] == '#':
                if highlight_xy:
                    res.append({
                        "title": hit['title'],
                        "xueyuan": Markup(hi.highlight_hit(hit, "xueyuan")),
                        "url": hit["url"],
                        'shotpath': hit['shotpath'],
                        "content": Markup(hi.highlight_hit(hit, "content")),
                        "parenturl": hit["parenturl"],
                        "picpath": '#',
                        "pagerank": scores[url_dict[hit["url"]]]
                    })
                else:
                    res.append({
                        "title": hit['title'],
                        "xueyuan": hit["xueyuan"],
                        "url": hit["url"],
                        'shotpath': hit['shotpath'],
                        "content": Markup(hi.highlight_hit(hit, "content")),
                        "parenturl": hit["parenturl"],
                        "picpath": '#',
                        "pagerank": scores[url_dict[hit["url"]]]
                    })
            else:
                if highlight_xy:
                    res.append({
                        "title": hit['title'],
                        "xueyuan": Markup(hi.highlight_hit(hit, "xueyuan")),
                        "url": hit["url"],
                        'shotpath': hit['shotpath'],
                        "content": Markup(hi.highlight_hit(hit, "content")),
                        "parenturl": hit["parenturl"],
                        "picpath": "images/%s/%s" % (hit['picpath'].split('/')[-3],
                                                     hit['picpath'].split('/')[-1]),
                        "pagerank": scores[url_dict[hit["url"]]]
                    })
                else:
                    res.append({
                        "title": hit['title'],
                        "xueyuan": hit["xueyuan"],
                        "url": hit["url"],
                        'shotpath': hit['shotpath'],
                        "content": Markup(hi.highlight_hit(hit, "content")),
                        "parenturl": hit["parenturl"],
                        "picpath": "images/%s/%s" % (hit['picpath'].split('/')[-3],
                                                     hit['picpath'].split('/')[-1]),
                        "pagerank": scores[url_dict[hit["url"]]]
                    })
        print(len(result))
        print(res)
        count = len(result)
        if count == 0:
            logging.warning("%d: no results found for the query!" % 404)
            return "没有查询到相关内容!", 404  # "No matching content found!"
        else:
            # Log the response.
            log = "Response: "
            for item in res:
                log = log + " (name:%s,url:%s) " % (item["title"], item["url"])
            logging.info(log)
            # # Sort results by PageRank.
            # res.sort(key=lambda k: (k.get("pagerank", 0)), reverse=True)
            # print(res)
            mysession["data"] = res  # Pass results to the next page via the session.
            return jsonify({"url": "/display/%d&%s" % (count, query_sentence)})
import os

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_similarity_score
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh import index, searching
from whoosh import qparser
from whoosh import highlight

path1 = "documents"
path = "docs"
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = index.create_in(path, schema)
writer = ix.writer()
dir = os.listdir(path1)
st = set(stopwords.words('english'))
hf = highlight.HtmlFormatter()


def queryformation(qstring, content):
    # Strip stopwords from the query string.
    query_content = []
    i = ""
    wd = word_tokenize(qstring)
    for w in wd:
        if w not in st:
            i = i + " " + w
    query_content.append(i)
    # Vectorize and TF-IDF-weight the cleaned query.
    X = CountVectorizer().fit_transform(query_content)
    t = TfidfTransformer(smooth_idf=False).fit_transform(X)
    te = []
    for i in t:
        te.append(list(i.A[0]))
class WhooshTRECNewsEngine(SearchEngine):
    """Whoosh Query log search engine."""

    def __init__(self, service, whoosh_trec_news_index_dir=""):
        super(WhooshTRECNewsEngine, self).__init__(service)
        try:
            self.docIndex = open_dir(whoosh_trec_news_index_dir)
            print("Whoosh Document index open")
            print(self.docIndex.doc_count())
        except Exception:
            print("Could not open Whoosh Document index at: " + whoosh_trec_news_index_dir)

    def search(self, query, pos=0):
        """
        Search service for query log data held in a Whoosh TREC News
        Document index with a Schema().

        Parameters:
        * query (puppy.model.Query)

        Returns:
        * results (puppy.model.Response)

        Raises:
        * ?
        """

        def parse_whoosh_trec(site, query, results):
            response = Response()
            response.version = 'trec'
            response.feed.setdefault('title', "{0}: {1}".format(site, query))
            response.feed.setdefault('link', '')
            response.feed.setdefault('description', "Search results for '{0}' at {1}".format(query, site))
            response.namespaces.setdefault("opensearch", "http://a9.com/-/spec/opensearch/1.1/")
            response.feed.setdefault("opensearch_totalresults", results.pagecount)
            response.feed.setdefault("opensearch_itemsperpage", pagelen)
            response.feed.setdefault("opensearch_startindex", results.pagenum)
            response.feed.setdefault('query', query)
            try:
                r = 0
                if len(results) > 1:
                    for hit in results:
                        r = r + 1
                        title = hit["title"]
                        title = title.strip()
                        if len(title) < 1:
                            title = query
                        rank = ((int(results.pagenum) - 1) * results.pagelen) + r
                        link = "/treconomics/" + str(hit.docnum) + "?rank=" + str(rank)
                        desc = hit.highlights("content")
                        docid = hit["docid"]
                        docid = docid.strip()
                        source = hit["source"]
                        response.entries.append({
                            'title': title,
                            'link': link,
                            'summary': desc,
                            'docid': docid,
                            'source': source
                        })
                else:
                    print("No hits found for query: " + query)
            except Exception as e:
                print("Converting results to OpenSearch failed:", e)
            return response
        # end parse_whoosh_trec

        try:
            parser = QueryParser("content", self.docIndex.schema)
            # mparser = MultifieldParser(["title", "content"], schema=self.docIndex.schema)
            print("In WhooshTRECNewsEngine: " + query.search_terms)
            query_terms = parser.parse(query.search_terms)
            page = query.start_page
            pagelen = query.page_len
            # print(query_terms)
            # print("page len" + str(pagelen))
            results = []
            response = []
            with self.docIndex.searcher() as searcher:
                results = searcher.search_page(query_terms, page, pagelen=pagelen)
                # results = searcher.search(query_terms)
                results.fragmenter = highlight.ContextFragmenter(maxchars=300, surround=300)
                results.formatter = highlight.HtmlFormatter()
                results.fragmenter.charlimit = 100000
                print("WhooshTRECNewsEngine found: " + str(len(results)) + " results")
                print("Page %d of %d - PageLength of %d" % (results.pagenum, results.pagecount, results.pagelen))
                response = parse_whoosh_trec('WhooshTRECNewsEngine', query.search_terms, results)
            return response
        except Exception:
            print("Error in Search Service: Whoosh TREC News search failed")
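All of the examples above share one core workflow: open or build an index, parse a query, attach a highlight.HtmlFormatter to the results (directly or via a Highlighter), and read highlighted fragments off each hit. The snippet below is a minimal, self-contained sketch of that pattern, assuming an in-memory index via RamStorage; the schema, field names, and sample document are illustrative assumptions, not taken from any snippet above.

from whoosh import fields, highlight
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

# Hypothetical two-field schema; both fields stored so highlights() can
# read the text back without re-supplying it.
schema = fields.Schema(title=fields.TEXT(stored=True),
                       content=fields.TEXT(stored=True))
ix = RamStorage().create_index(schema)

with ix.writer() as w:
    # Illustrative sample document.
    w.add_document(title=u"doc1", content=u"alfa bravo charlie delta echo")

with ix.searcher() as s:
    q = QueryParser("content", ix.schema).parse(u"bravo")
    results = s.search(q, terms=True)
    # Wrap matched terms in <b class="match term0">...</b> instead of the
    # default <strong> tag.
    results.formatter = highlight.HtmlFormatter(tagname="b")
    for hit in results:
        print(hit.highlights("content"))
        # e.g. 'alfa <b class="match term0">bravo</b> charlie delta echo'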