def test_null_fragment():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    nf = highlight.WholeFragmenter()
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, nf, uc)
    assert htext == "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima"

def search(self):
    c.terms = request.GET.get('terms', '')
    c.results = []
    if len(c.terms) < 4:
        h.flash(
            _('Search queries must be at least 4 characters in length.'),
            'error'
        )
        redirect(url(controller='blog', action='index'))

    query = MultifieldParser(
        ['title', 'content', 'summary'],
        schema=index.schema
    ).parse(c.terms)
    results = index.searcher().search(query, limit=10)
    for result in results:
        terms = [v for k, v in query.all_terms() if k == 'content']
        url_kwargs = json.loads(result['url'])
        result['url'] = url(**url_kwargs)
        result['highlights'] = highlight(
            result['content'],
            terms,
            search.schema['content'].format.analyzer,
            ContextFragmenter(terms),
            HtmlFormatter(tagname='span', classname='highlight')
        )
        c.results.append(result)
    return render('search.tpl', slacks=True)

def run(self):
    keywords = self._keywords.replace('+', ' ')
    keywords = ' '.join(filter(lambda x: x.find('v:') != 0 or x[0] == '~',
                               keywords.split()))

    termset = []
    for term in keywords.split():
        for token in term.split('|'):
            if len(token.strip()) > 0:
                termset.append(utils.ConvertToThaiSearch(token, True))

    if hasattr(self._delegate, 'DisplayWillStart'):
        wx.CallAfter(self._delegate.DisplayWillStart)

    items = []
    key = '%d:%d' % (self._mark)
    if hasattr(self._delegate, 'HasDisplayResult') and not self._delegate.HasDisplayResult(key):
        for idx, result in enumerate(self._results[self._mark[0]:self._mark[1]]):
            excerpts = highlight(result['content'], termset,
                                 NgramTokenizer(min([len(t) for t in termset]),
                                                max([len(t) for t in termset])),
                                 SimpleFragmenter(size=70),
                                 MyHtmlFormatter(tagname='font',
                                                 attrs='size="4" color="purple"'))
            items.append(self.ProcessResult(idx, result, self.ProcessExcerpts(excerpts)))
            if hasattr(self._delegate, 'DisplayDidProgress'):
                wx.CallAfter(self._delegate.DisplayDidProgress, (idx + 1))

    if hasattr(self._delegate, 'SaveDisplayResult'):
        self._delegate.SaveDisplayResult(items, key)

    if hasattr(self._delegate, 'DisplayDidFinish'):
        wx.CallAfter(self._delegate.DisplayDidFinish, key, self._current)

def resultExcerpt(self, results, i, ki=None):
    # FIXME: this should not be implementation specific
    if not ki:
        r = results[i]
        name = r['kitab']
        v = r['vrr'].split('-')[0]
        m = self.th.getMeta().getLatestKitabV(name, v)
        ki = self.th.getCachedKitab(m['uri'])
    num = int(results[i]['nodeIdNum'])
    node = ki.getNodeByIdNum(num)
    n = ki.toc.next(node)
    if n:
        ub = n.globalOrder
    else:
        ub = -1
    txt = node.toText(ub)
    s = set()
    #results.query.all_terms(s)  # return (field, term) pairs
    results.q.existing_terms(self.indexer.reader(), s, phrases=True)  # return (field, term) pairs
    # self.self.__ix_searcher.reader()
    terms = dict(
        map(lambda i: (i[1], i[0]),
            filter(lambda j: j[0] == 'content' or j[0] == 'title', s))).keys()
    #print "txt=[%s]" % len(txt)
    snippet = txt[:min(len(txt), 512)]  # dummy summary
    snippet = highlight(txt, terms, analyzer,
                        SentenceFragmenter(sentencechars=".!?؟\n"),
                        HtmlFormatter(between=u"\u2026\n"),
                        top=3, scorer=BasicFragmentScorer, minscore=1,
                        order=FIRST)
    #snippet = highlight(txt, terms, analyzer,
    #                    SentenceFragmenter(sentencechars=".!?"), ExcerptFormatter(between=u"\u2026\n"), top=3,
    #                    scorer=BasicFragmentScorer, minscore=1,
    #                    order=FIRST)
    return snippet

def test_context_fragment():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "alfa BRAVO charlie...hotel INDIA juliet"

def run(self):
    termset = []
    keywords = self.keywords.replace('+', ' ').replace('|', ' ')
    keywords = ' '.join(filter(lambda x: x.find('v:') != 0, keywords.split()))
    for t in keywords.split():
        termset.append(t)

    items = []
    wx.CallAfter(self.window.DisplayStarted)

    key = '%d:%d' % self.p
    if key not in dataModel:
        for i, r in enumerate(self.results[self.p[0]:self.p[1]]):
            nMin = min([len(t) for t in termset])
            nMax = max([len(t) for t in termset])
            excerpts = highlight(r['content'], termset, NgramTokenizer(nMin, nMax),
                                 SimpleFragmenter(size=70),
                                 MyHtmlFormatter(tagname='font', attrs='size="4" color="purple"'))
            if self.lang == 'pali' and 'wxMac' not in wx.PlatformInfo:
                excerpts = excerpts.replace(u'ฐ', u'\uf700').replace(u'ญ', u'\uf70f').replace(u'\u0e4d', u'\uf711')
            if self.lang != 'thaibt':
                items.append((self.p[0] + i + 1, r['volume'].lstrip(u'0'),
                              r['page'].lstrip(u'0'), r['items'], excerpts))
            else:
                items.append((self.p[0] + i + 1, unicode(r['volume']),
                              unicode(r['page']), u'0', excerpts))
            wx.CallAfter(self.window.UpdateProgress, (i + 1) * 10)
        dataModel[key] = items

    wx.CallAfter(self.window.DisplayFinished)

def test_maxclasses(self):
    terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(terms, surround=6)
    hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2)
    htext = highlight.highlight(self._doc, terms, sa, cf, hf)
    self.assertEqual(htext,
                     '<b class="match t0">alfa</b> <b class="match t1">bravo</b> '
                     '<b class="match t0">charlie</b>...<b class="match t1">delta</b> '
                     '<b class="match t0">echo</b> foxtrot')

def test_maxclasses():
    terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2)
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == ('<b class="match t0">alfa</b> <b class="match t1">bravo</b> '
                     '<b class="match t0">charlie</b>...<b class="match t1">delta</b> '
                     '<b class="match t0">echo</b> foxtrot')

def test_null_fragment(self):
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    nf = highlight.NullFragmenter
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(self._doc, terms, sa, nf, uc)
    self.assertEqual(htext, "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima")

def run(self):
    termset = []
    keywords = self.keywords.replace('+', ' ')
    #if self.lang == 'thai' or self.lang == 'thaimm' or self.lang == 'thaiwn' or self.lang == 'thaimc':
    #    for t in self.segmenter(keywords):
    #        termset.append(t.text)
    #elif self.lang == 'pali':
    for t in keywords.split():
        termset.append(t)

    tmp = []
    wx.CallAfter(self.window.DisplayStarted)

    key = '%d:%d' % self.p
    if key not in dataModel:
        for i, r in enumerate(self.results[self.p[0]:self.p[1]]):
            nMin = min([len(t) for t in termset])
            nMax = max([len(t) for t in termset])
            excerpts = highlight(r['content'], termset, NgramTokenizer(nMin, nMax),
                                 SimpleFragmenter(size=70),
                                 MyHtmlFormatter(tagname='font', attrs='size="4" color="purple"'))
            if self.lang == 'pali' and 'wxMac' not in wx.PlatformInfo:
                excerpts = excerpts.replace(u'ฐ', u'\uf700').replace(u'ญ', u'\uf70f').replace(u'\u0e4d', u'\uf711')
            tmp.append((self.p[0] + i + 1, r['volumn'].lstrip(u'0'),
                        r['page'].lstrip(u'0'), r['items'], excerpts))
            wx.CallAfter(self.window.UpdateProgress, (i + 1) * 10)
        dataModel[key] = tmp

    wx.CallAfter(self.window.DisplayFinished)

def test_context_fragment(self):
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(terms, surround=6)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(self._doc, terms, sa, cf, uc)
    self.assertEqual(htext, "alfa BRAVO charlie...hotel INDIA juliet")

def test_issue324():
    sa = analysis.StemmingAnalyzer()
    result = highlight.highlight(u("Indexed!\n1"), [u("index")], sa,
                                 fragmenter=highlight.ContextFragmenter(),
                                 formatter=highlight.UppercaseFormatter())
    assert result == "INDEXED!\n1"

def test_simple_fragment(self):
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    sf = highlight.SimpleFragmenter(size=20)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(self._doc, terms, sa, sf, uc)
    self.assertEqual(htext, "alfa BRAVO charlie...hotel INDIA juliet kilo")

def test_html_format(self):
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(terms, surround=6)
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(self._doc, terms, sa, cf, hf)
    self.assertEqual(htext, 'alfa <strong class="match term0">bravo</strong> charlie...hotel <strong class="match term1">india</strong> juliet')

def test_context_at_start():
    terms = frozenset(["alfa"])
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=15)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "ALFA bravo charlie delta echo foxtrot"

def test_html_format():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == 'alfa <strong class="match term0">bravo</strong> charlie...hotel <strong class="match term1">india</strong> juliet'

def _process_results(self, raw_results, highlight=False, query_string=''):
    from haystack import site
    results = []
    hits = len(raw_results)
    facets = {}
    spelling_suggestion = None
    indexed_models = site.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_results):
        raw_result = dict(raw_result)
        app_label, model_name = raw_result['django_ct'].split('.')
        additional_fields = {}

        for key, value in raw_result.items():
            additional_fields[str(key)] = self._to_python(value)

        del(additional_fields['django_ct'])
        del(additional_fields['django_id'])

        if highlight:
            from whoosh import analysis
            from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
            sa = analysis.StemmingAnalyzer()
            terms = [term.replace('*', '') for term in query_string.split()]
            # DRL_FIXME: Highlighting doesn't seem to work properly in testing.
            additional_fields['highlighted'] = {
                self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
            }

        # Requires Whoosh 0.1.20+.
        if hasattr(raw_results, 'score'):
            score = raw_results.score(doc_offset)
        else:
            score = None

        if score is None:
            score = 0

        model = get_model(app_label, model_name)

        if model:
            if model in indexed_models:
                result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
                results.append(result)
            else:
                hits -= 1
        else:
            hits -= 1

    if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
        spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }

def test_html_escape():
    terms = frozenset(["bravo"])
    sa = analysis.StandardAnalyzer()
    wf = highlight.WholeFragmenter()
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(u('alfa <bravo "charlie"> delta'), terms, sa,
                                wf, hf)
    assert htext == 'alfa &lt;<strong class="match term0">bravo</strong> &quot;charlie&quot;&gt; delta'

def test_sentence_fragment(self):
    text = u"This is the first sentence. This one doesn't have the word. This sentence is the second. Third sentence here."
    terms = ("sentence", )
    sa = analysis.StandardAnalyzer(stoplist=None)
    sf = highlight.SentenceFragmenter()
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(text, terms, sa, sf, uc)
    self.assertEqual(htext, "This is the first SENTENCE...This SENTENCE is the second...Third SENTENCE here")

def get_highlighted_summary(self, summary, query, analyzer, frag, format):
    summary = unicode(summary.replace("\n", " "))
    if len(summary) > 350:
        summary = unicode(summary.replace("\n", " "))[0:350] + "..."
    hl = highlight(summary, query, analyzer, frag, format)
    if hl:
        return hl
    else:
        return summary

def test_sentence_fragment():
    text = u("This is the first sentence. This one doesn't have the word. " +
             "This sentence is the second. Third sentence here.")
    terms = ("sentence", )
    sa = analysis.StandardAnalyzer(stoplist=None)
    sf = highlight.SentenceFragmenter()
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(text, terms, sa, sf, uc)
    assert htext == "This is the first SENTENCE...This SENTENCE is the second...Third SENTENCE here"

def highlight(self, content, top=5):
    if self.search_type != 'content':
        return ''
    hl = highlight(escape(content),
                   self.highlight_items,
                   analyzer=ANALYZER,
                   fragmenter=FRAGMENTER,
                   formatter=FORMATTER,
                   top=top)
    return hl

def highlight(self, content, top=5):
    if self.search_type != 'content':
        return ''
    hl = highlight(text=escape(content),
                   terms=self.highlight_items,
                   analyzer=ANALYZER,
                   fragmenter=FRAGMENTER,
                   formatter=FORMATTER,
                   top=top)
    return hl

def highlight(self, content, top=5):
    if self.search_type not in ['content', 'message']:
        return ''
    hl = highlight(text=content,
                   terms=self.highlight_items,
                   analyzer=ANALYZER,
                   fragmenter=FRAGMENTER,
                   formatter=FORMATTER,
                   top=top)
    return hl

def get_object(self):
    indexer = Indexer('file')
    query = self.request.resolver_match.kwargs['query']
    docs = indexer.get_doc(url=self.request.resolver_match.kwargs['url'])

    if not len(docs):
        return {}

    query_list = query.split(' ')
    excerpts = highlight(docs[0]['body'], set(query_list), StandardAnalyzer(),
                         WholeFragmenter(), HtmlFormatter())
    return {'body': excerpts, 'title': docs[0]['title']}

def test_workflow_manual():
    schema = fields.Schema(id=fields.ID(stored=True),
                           title=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("1"), title=u("The man who wasn't there"))
    w.add_document(id=u("2"), title=u("The dog who barked at midnight"))
    w.add_document(id=u("3"), title=u("The invisible man"))
    w.add_document(id=u("4"), title=u("The girl with the dragon tattoo"))
    w.add_document(id=u("5"), title=u("The woman who disappeared"))
    w.commit()

    with ix.searcher() as s:
        # Parse the user query
        parser = qparser.QueryParser("title", schema=ix.schema)
        q = parser.parse(u("man"))

        # Extract the terms the user used in the field we're interested in
        terms = [text for fieldname, text in q.all_terms()
                 if fieldname == "title"]

        # Perform the search
        r = s.search(q)
        assert_equal(len(r), 2)

        # Use the same analyzer as the field uses. To be sure, you can
        # do schema[fieldname].analyzer. Be careful not to do this
        # on non-text field types such as DATETIME.
        analyzer = schema["title"].analyzer

        # Since we want to highlight the full title, not extract fragments,
        # we'll use WholeFragmenter.
        nf = highlight.WholeFragmenter()

        # In this example we'll simply uppercase the matched terms
        fmt = highlight.UppercaseFormatter()

        outputs = []
        for d in r:
            text = d["title"]
            outputs.append(highlight.highlight(text, terms, analyzer, nf, fmt))

        assert_equal(outputs, ["The invisible MAN", "The MAN who wasn't there"])

def resultExcerpt(self, results, i, ki=None):
    # FIXME: this should not be implementation specific
    if not ki:
        r = results[i]
        name = r['kitab']
        v = r['vrr'].split('-')[0]
        m = self.th.getMeta().getLatestKitabV(name, v)
        ki = self.th.getCachedKitab(m['uri'])
    num = int(results[i]['nodeIdNum'])
    node = ki.getNodeByIdNum(num)
    n = ki.toc.next(node)
    if n:
        ub = n.globalOrder
    else:
        ub = -1
    txt = node.toText(ub)
    s = set()
    #results.query.all_terms(s)  # return (field,term) pairs
    # return (field,term) pairs
    # self.self.__ix_searcher.reader()
    s = results.q.existing_terms(self.indexer.reader(), phrases=True)
    #s = set([i.decode('utf_8') for i in s])
    terms = dict(
        map(lambda i: (i[1], i[0]),
            filter(lambda j: j[0] == 'content' or j[0] == 'title', s))).keys()
    #print "txt = [%s]" % len(txt)
    terms = [i.decode('utf_8') for i in terms]
    snippet_dummy = txt[:min(len(txt), 512)]  # dummy summary
    snippet = highlight(txt, terms, analyzer,
                        SentenceFragmenter(sentencechars=".!?؟\n"),
                        HtmlFormatter(between=u"\u2026\n"),
                        top=3, scorer=BasicFragmentScorer, minscore=1,
                        order=FIRST)
    #snippet = highlight(txt, terms, analyzer,
    #                    SentenceFragmenter(sentencechars=".!?"), ExcerptFormatter(between=u"\u2026\n"), top=3,
    #                    scorer=BasicFragmentScorer, minscore=1,
    #                    order=FIRST)
    print snippet
    if len(snippet) > 1:
        return snippet
    else:
        return snippet_dummy

def test_workflow_manual():
    schema = fields.Schema(id=fields.ID(stored=True),
                           title=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("1"), title=u("The man who wasn't there"))
    w.add_document(id=u("2"), title=u("The dog who barked at midnight"))
    w.add_document(id=u("3"), title=u("The invisible man"))
    w.add_document(id=u("4"), title=u("The girl with the dragon tattoo"))
    w.add_document(id=u("5"), title=u("The woman who disappeared"))
    w.commit()

    with ix.searcher() as s:
        # Parse the user query
        parser = qparser.QueryParser("title", schema=ix.schema)
        q = parser.parse(u("man"))

        # Extract the terms the user used in the field we're interested in
        terms = [text for fieldname, text in q.all_terms()
                 if fieldname == "title"]

        # Perform the search
        r = s.search(q)
        assert len(r) == 2

        # Use the same analyzer as the field uses. To be sure, you can
        # do schema[fieldname].analyzer. Be careful not to do this
        # on non-text field types such as DATETIME.
        analyzer = schema["title"].analyzer

        # Since we want to highlight the full title, not extract fragments,
        # we'll use WholeFragmenter.
        nf = highlight.WholeFragmenter()

        # In this example we'll simply uppercase the matched terms
        fmt = highlight.UppercaseFormatter()

        outputs = []
        for d in r:
            text = d["title"]
            outputs.append(highlight.highlight(text, terms, analyzer, nf, fmt))

        assert outputs == ["The invisible MAN", "The MAN who wasn't there"]

def searchBodyAndHighlight(q):
    parser = SimpleParser("body", schema=ix.schema)
    q = parser.parse(q)
    terms = [text for fieldname, text in q.all_terms()
             if fieldname == "body"]
    r = s.search(q)
    analyzer = schema["body"].format.analyzer
    print "will tokenize with", q.all_terms
    fragmenter = highlight.ContextFragmenter(q.all_terms, 400, 80)
    # formatter = highlight.HtmlFormatter()
    formatter = colorIpythonFormatter
    for d in r:
        # The text argument to highlight is the stored text of the title
        text = d["body"]
        res = highlight.highlight(text, terms, analyzer, fragmenter, formatter)
        # print res.encode("latin-1","replace")
        print unicodedata.normalize('NFKC', res).encode("utf-8", "replace")
        print "-" * 8

def _process_results(self, raw_results, highlight=False, query_string=''):
    results = []
    facets = {}

    for raw_result in raw_results:
        raw_result = dict(raw_result)
        app_label, module_name = raw_result['django_ct_s'].split('.')
        additional_fields = {}

        for key, value in raw_result.items():
            additional_fields[str(key)] = self._to_python(value)

        del(additional_fields['django_ct_s'])
        del(additional_fields['django_id_s'])
        # DRL_FIXME: Figure out if there's a way to get the score out of Whoosh.
        # del(additional_fields['score'])

        if highlight:
            from whoosh import analysis
            from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
            sa = analysis.StemmingAnalyzer()
            terms = [term.replace('*', '') for term in query_string.split()]
            # DRL_FIXME: Highlighting doesn't seem to work properly in testing.
            additional_fields['highlighted'] = {
                self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
            }

        result = SearchResult(app_label, module_name, raw_result['django_id_s'], raw_result.get('score', 0), **additional_fields)
        results.append(result)

    return {
        'results': results,
        'hits': len(results),
        'facets': facets,
    }

def _process_results(self, raw_page, highlight=False, query_string="", spelling_query=None, result_class=None):
    from haystack import site
    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    facets = {}
    spelling_suggestion = None
    indexed_models = site.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split(".")
        additional_fields = {}
        model = get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = site.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], "convert"):
                    # Special-cased due to the nature of KEYWORD fields.
                    if index.fields[string_key].is_multivalued:
                        if value is None or len(value) is 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(",")
                    else:
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del (additional_fields[DJANGO_CT])
            del (additional_fields[DJANGO_ID])

            if highlight:
                from whoosh import analysis
                from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace("*", "") for term in query_string.split()]
                additional_fields["highlighted"] = {
                    self.content_field_name: [
                        highlight(
                            additional_fields.get(self.content_field_name),
                            terms,
                            sa,
                            ContextFragmenter(terms),
                            UppercaseFormatter(),
                        )
                    ]
                }

            result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if getattr(settings, "HAYSTACK_INCLUDE_SPELLING", False):
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {"results": results, "hits": hits, "facets": facets, "spelling_suggestion": spelling_suggestion}

analyzer = schema["title"].format.analyzer # Since we want to highlight the full title, not extract fragments, # we'll use NullFragmenter. See the docs for the highlight module # for which fragmenters are available. fragmenter = highlight.NullFragmenter # This object controls what the highlighted output looks like. # See the docs for its arguments. formatter = highlight.HtmlFormatter() for d in r: # The text argument to highlight is the stored text of the title text = d["title"] print highlight.highlight(text, terms, analyzer, fragmenter, formatter) #do it on the body now def colorIpythonFormatter(text,fragments): resultFrags=[] addedChars=0 #each highligh add 11 chars for f in fragments: for tok in f.matches: text=text[:tok.startchar+addedChars]+tc.Red+text[tok.startchar+addedChars:tok.endchar+addedChars]+tc.Normal+text[tok.endchar+addedChars:] addedChars+=11 resultFrags.append(text[f.startchar+addedChars:f.endchar+15+addedChars]) return " [...] ".join(resultFrags) def colorIpythonFormatter(text,fragments): resultFrags=[] addedChars=0 #each highligh add 11 chars
for doc_offset, raw_result in enumerate(raw_results):
    raw_result = dict(raw_result)
    final_result = {}

    for key, value in raw_result.items():
        final_result[str(key)] = self._to_python(value)

    if highlight:
        from whoosh import analysis
        from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
        sa = analysis.StemmingAnalyzer()
        terms = [term.replace('*', '') for term in query_string.split()]
        # DRL_FIXME: Highlighting doesn't seem to work properly in testing.
        results['highlighted'][self.content_field_name] = [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())]

    # Requires Whoosh 0.1.20+.
    if hasattr(raw_results, 'score'):
        final_result['score'] = raw_results.score(doc_offset)
    else:
        final_result['score'] = 0

    results['docs'].append(final_result)

if self.include_spelling:
    results['spelling_suggestion'] = self.create_spelling_suggestion(query_string)

# DRL_FIXME: This needs to be corrected.
results['hits'] = len(results['docs'])

return results

def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None, result_class=None):
    from haystack import connections
    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    facets = {}
    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        additional_fields = {}
        model = get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = unified_index.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if index.fields[string_key].is_multivalued:
                        if value is None or len(value) is 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del(additional_fields[DJANGO_CT])
            del(additional_fields[DJANGO_ID])

            if highlight:
                from whoosh import analysis
                from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace('*', '') for term in query_string.split()]
                additional_fields['highlighted'] = {
                    self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                }

            result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if self.include_spelling:
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }

def highlight(self, text, words):
    fragmenter = ContextFragmenter()
    formatter = HtmlFormatter()
    analyzer = self.project_schema['text'].analyzer
    return highlight(text, words, analyzer, fragmenter, formatter, top=1)

def run_query(query, index):
    """
    Queries the index for data with the given text query

        @param query The text query to perform on the indexed data
        @return      A list of HTML string snippets to return
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {
        'content': 1.0,
        'title': 3.0
    }
    query_parser = MultifieldParser(['content', 'title'], schema=index_schema,
                                    fieldboosts=field_boosts, group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short (filtering into a new list avoids
    # skipping items, which happens when removing from a list while iterating it)
    search_terms = [search_term for search_term in search_terms if len(search_term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    # by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    # highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content': highlight(search_result['content'], search_terms, analyzer, fragmenter, formatter),
            'url': search_result['url'],
            'title': search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count

def parse_response(reader, fieldname, analyzer, fragmenter, formatter, query, results, results_are_page=False):
    """
    Returns an ifind Response, given a query and set of results from Whoosh/Redis.
    Takes an ifind Query object and a list of SORTED results for the given query.
    If the page requested (query.skip) is < 0, page 1 is returned.
    If the page requested is greater than the number of available pages, the last page is returned.
    """
    def get_term_list():
        if isinstance(query.parsed_terms, unicode):
            return [query.parsed_terms]

        return [text for term_fieldname, text in query.parsed_terms.all_terms() if term_fieldname == fieldname]

    response = Response(query.terms)
    response.results_total = len(results)

    if results_are_page:
        page = results[0]
        response.total_pages = results[1]
        results = results[2]
    else:
        page, response.total_pages, results = get_page(query, results)

    page_len = query.top
    i = 0

    for result in results:
        i = i + 1
        rank = (page - 1) * page_len + i
        whoosh_docnum = result[0]
        score = result[1]

        stored_data = reader.stored_fields(whoosh_docnum)

        title = stored_data['title']
        if title:
            title = title.strip()
        else:
            title = "Untitled Document"

        url = "/treconomics/{0}/".format(whoosh_docnum)

        trecid = stored_data['docid'].strip()
        source = stored_data['source'].strip()

        summary = highlight(stored_data['content'], get_term_list(), analyzer, fragmenter, formatter)
        summary = "{0}...".format(summary)

        response.add_result(title=title,
                            url=url,
                            summary=summary,
                            docid=trecid,
                            source=source,
                            rank=rank,
                            whooshid=whoosh_docnum,
                            score=score)

    # The following two lines are for compatibility purposes with the existing codebase.
    # Would really like to take these out.
    setattr(response, 'results_on_page', len(results))
    setattr(response, 'actual_page', page)

    return response

def _process_results(self, raw_results, start_offset, end_offset, highlight=False, query_string="", spelling_query=None):
    from haystack import site
    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_results)
    raw_results = raw_results[start_offset:end_offset]
    facets = {}
    spelling_suggestion = None
    indexed_models = site.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_results):
        raw_result = dict(raw_result)
        app_label, model_name = raw_result["django_ct"].split(".")
        additional_fields = {}
        model = get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = site.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], "convert"):
                    additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del (additional_fields["django_ct"])
            del (additional_fields["django_id"])

            if highlight:
                from whoosh import analysis
                from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace("*", "") for term in query_string.split()]
                # DRL_FIXME: Highlighting doesn't seem to work properly in testing.
                additional_fields["highlighted"] = {
                    self.content_field_name: [
                        highlight(
                            additional_fields.get(self.content_field_name),
                            terms,
                            sa,
                            ContextFragmenter(terms),
                            UppercaseFormatter(),
                        )
                    ]
                }

            # Requires Whoosh 0.1.20+.
            if hasattr(raw_results, "score"):
                score = raw_results.score(doc_offset)
            else:
                score = None

            if score is None:
                score = 0

            result = SearchResult(app_label, model_name, raw_result["django_id"], score, **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if getattr(settings, "HAYSTACK_INCLUDE_SPELLING", False):
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {"results": results, "hits": hits, "facets": facets, "spelling_suggestion": spelling_suggestion}

def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
    from haystack import site
    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)
    facets = {}
    spelling_suggestion = None
    indexed_models = site.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result['django_ct'].split('.')
        additional_fields = {}
        model = get_model(app_label, model_name)

        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = site.get_index(model)
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if isinstance(index.fields[string_key], MultiValueField):
                        if value is None or len(value) is 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            del(additional_fields['django_ct'])
            del(additional_fields['django_id'])

            if highlight:
                from whoosh import analysis
                from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace('*', '') for term in query_string.split()]
                additional_fields['highlighted'] = {
                    self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                }

            result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
            results.append(result)
        else:
            hits -= 1

    if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
