Example No. 1
def test_null_fragment():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    nf = highlight.WholeFragmenter()
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, nf, uc)
    assert htext == "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima"
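These Whoosh tests reference a module-level fixture and imports that the excerpt does not show. A minimal sketch of that setup, with the content of _doc reconstructed from the expected output in the assertion above:

from whoosh import analysis, highlight

# Reconstructed from the assertion: lowercase NATO alphabet words.
_doc = u"alfa bravo charlie delta echo foxtrot golf hotel india juliet kilo lima"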
Example No. 2
    def search(self):
        c.terms = request.GET.get('terms', '')
        c.results = []
        if len(c.terms) < 4:
            h.flash(
                _('Search queries must be at least 4 characters in length.'),
                'error'
            )
            redirect(url(controller='blog', action='index'))

        query = MultifieldParser(
            ['title', 'content', 'summary'],
            schema=index.schema
        ).parse(c.terms)
        results = index.searcher().search(query, limit=10)
        # The 'content' terms are the same for every hit, so extract them once.
        terms = [v for k, v in query.all_terms() if k == 'content']
        for result in results:
            url_kwargs = json.loads(result['url'])
            result['url'] = url(**url_kwargs)
            result['highlights'] = highlight(
                result['content'],
                terms,
                index.schema['content'].format.analyzer,  # was search.schema: 'search' is this method's own name
                ContextFragmenter(terms),
                HtmlFormatter(tagname='span', classname='highlight')
            )
            c.results.append(result)
        return render('search.tpl', slacks=True)
Example No. 3
    def run(self):
        keywords = self._keywords.replace('+', ' ')
        # Drop tokens that start with 'v:' unless they are prefixed with '~'.
        keywords = ' '.join(
            x for x in keywords.split() if not x.startswith('v:') or x[0] == '~')

        termset = []
        for term in keywords.split():
            for token in term.split('|'):
                if token.strip():
                    termset.append(utils.ConvertToThaiSearch(token, True))
        
        if hasattr(self._delegate, 'DisplayWillStart'):
            wx.CallAfter(self._delegate.DisplayWillStart)
        
        items = []
        key = '%d:%d' % self._mark
        if hasattr(self._delegate, 'HasDisplayResult') and not self._delegate.HasDisplayResult(key):
            for idx, result in enumerate(self._results[self._mark[0]:self._mark[1]]):
                excerpts = highlight(result['content'], termset,
                    NgramTokenizer(min([len(t) for t in termset]), max([len(t) for t in termset])),
                    SimpleFragmenter(size=70), MyHtmlFormatter(tagname='font', attrs='size="4" color="purple"'))
                items.append(self.ProcessResult(idx, result, self.ProcessExcerpts(excerpts)))                            

                if hasattr(self._delegate, 'DisplayDidProgress'):
                    wx.CallAfter(self._delegate.DisplayDidProgress, (idx+1))                        

            if hasattr(self._delegate, 'SaveDisplayResult'):
                self._delegate.SaveDisplayResult(items, key)    

        if hasattr(self._delegate, 'DisplayDidFinish'):
            wx.CallAfter(self._delegate.DisplayDidFinish, key, self._current)
Example No. 4
def resultExcerpt(self, results, i, ki=None):
    # FIXME: this should not be implementation specific
    if not ki:
        r = results[i]
        name = r['kitab']
        v = r['vrr'].split('-')[0]
        m = self.th.getMeta().getLatestKitabV(name, v)
        ki = self.th.getCachedKitab(m['uri'])
    num = int(results[i]['nodeIdNum'])
    node = ki.getNodeByIdNum(num)
    n = ki.toc.next(node)
    if n:
        ub = n.globalOrder
    else:
        ub = -1
    txt = node.toText(ub)
    s = set()
    #results.query.all_terms(s)  # returns (field, term) pairs
    results.q.existing_terms(self.indexer.reader(), s, phrases=True)  # fills s with (field, term) pairs
    terms = dict(
        map(lambda i: (i[1], i[0]),
            filter(lambda j: j[0] == 'content' or j[0] == 'title', s))).keys()
    #print "txt=[%s]" % len(txt)
    snippet = txt[:min(len(txt), 512)]  # dummy summary (overwritten below)
    snippet = highlight(txt, terms, analyzer,
                        SentenceFragmenter(sentencechars=".!?؟\n"),
                        HtmlFormatter(between=u"\u2026\n"),
                        top=3, scorer=BasicFragmentScorer, minscore=1, order=FIRST)
    #snippet = highlight(txt, terms, analyzer,
    #    SentenceFragmenter(sentencechars=".!?"), ExcerptFormatter(between=u"\u2026\n"), top=3,
    #    scorer=BasicFragmentScorer, minscore=1, order=FIRST)
    return snippet
Example No. 5
def test_context_fragment():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "alfa BRAVO charlie...hotel INDIA juliet"
Example No. 6
    def run(self):
        termset = []
        keywords = self.keywords.replace('+',' ').replace('|',' ')
        keywords = ' '.join(filter(lambda x:x.find('v:') != 0, keywords.split()))

        for t in keywords.split():
            termset.append(t)                

        items = []
        wx.CallAfter(self.window.DisplayStarted)
        key = '%d:%d'%self.p
        if key not in dataModel:
            for i,r in enumerate(self.results[self.p[0]:self.p[1]]):
                nMin = min([len(t) for t in termset])
                nMax = max([len(t) for t in termset])
                excerpts = highlight(r['content'],
                                     termset,NgramTokenizer(nMin,nMax),
                                     SimpleFragmenter(size=70),
                                     MyHtmlFormatter(tagname='font',attrs='size="4" color="purple"'))
                
                if self.lang == 'pali' and 'wxMac' not in wx.PlatformInfo:
                    excerpts = excerpts.replace(u'ฐ',u'\uf700').replace(u'ญ',u'\uf70f').replace(u'\u0e4d',u'\uf711')
                
                if self.lang != 'thaibt':
                    items.append((self.p[0]+i+1,r['volume'].lstrip(u'0'),r['page'].lstrip(u'0'),r['items'],excerpts))
                else:
                    items.append((self.p[0]+i+1, unicode(r['volume']), unicode(r['page']), u'0', excerpts))
                    
                wx.CallAfter(self.window.UpdateProgress, (i+1)*10)
            dataModel[key] = items
        wx.CallAfter(self.window.DisplayFinished)
Example No. 7
 def test_maxclasses(self):
     terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo"))
     sa = analysis.StandardAnalyzer()
     cf = highlight.ContextFragmenter(terms, surround=6)
     hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2)
     htext = highlight.highlight(self._doc, terms, sa, cf, hf)
     self.assertEqual(htext, '<b class="match t0">alfa</b> <b class="match t1">bravo</b> <b class="match t0">charlie</b>...<b class="match t1">delta</b> <b class="match t0">echo</b> foxtrot')
Example No. 8
def test_maxclasses():
    terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2)
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == '<b class="match t0">alfa</b> <b class="match t1">bravo</b> <b class="match t0">charlie</b>...<b class="match t1">delta</b> <b class="match t0">echo</b> foxtrot'
Example No. 9
def test_null_fragment():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    nf = highlight.WholeFragmenter()
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, nf, uc)
    assert htext == "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima"
Example No. 10
 def test_null_fragment(self):
     terms = frozenset(("bravo", "india"))
     sa = analysis.StandardAnalyzer()
     nf = highlight.NullFragmenter
     uc = highlight.UppercaseFormatter()
     htext = highlight.highlight(self._doc, terms, sa, nf, uc)
     self.assertEqual(htext, "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima")
Example No. 11
    def run(self):
        termset = []
        keywords = self.keywords.replace('+',' ')
        #if self.lang == 'thai' or self.lang == 'thaimm' or self.lang == 'thaiwn' or self.lang == 'thaimc':
        #    for t in self.segmenter(keywords):
        #        termset.append(t.text)
        #elif self.lang == 'pali':

        for t in keywords.split():
            termset.append(t)                

        tmp = []
        wx.CallAfter(self.window.DisplayStarted)
        key = '%d:%d'%self.p
        if key not in dataModel:
            for i,r in enumerate(self.results[self.p[0]:self.p[1]]):
                nMin = min([len(t) for t in termset])
                nMax = max([len(t) for t in termset])
                excerpts = highlight(r['content'],
                                     termset,NgramTokenizer(nMin,nMax),
                                     SimpleFragmenter(size=70),
                                     MyHtmlFormatter(tagname='font',attrs='size="4" color="purple"'))
                
                if self.lang == 'pali' and 'wxMac' not in wx.PlatformInfo:
                    excerpts = excerpts.replace(u'ฐ',u'\uf700').replace(u'ญ',u'\uf70f').replace(u'\u0e4d',u'\uf711')
                tmp.append((self.p[0]+i+1,r['volumn'].lstrip(u'0'),r['page'].lstrip(u'0'),r['items'],excerpts))
                wx.CallAfter(self.window.UpdateProgress, (i+1)*10)
            dataModel[key] = tmp
        wx.CallAfter(self.window.DisplayFinished)
Example No. 12
 def test_context_fragment(self):
     terms = frozenset(("bravo", "india"))
     sa = analysis.StandardAnalyzer()
     cf = highlight.ContextFragmenter(terms, surround=6)
     uc = highlight.UppercaseFormatter()
     htext = highlight.highlight(self._doc, terms, sa, cf, uc)
     self.assertEqual(htext, "alfa BRAVO charlie...hotel INDIA juliet")
Example No. 13
def test_issue324():
    sa = analysis.StemmingAnalyzer()
    result = highlight.highlight(u("Indexed!\n1"), [u("index")],
                                 sa,
                                 fragmenter=highlight.ContextFragmenter(),
                                 formatter=highlight.UppercaseFormatter())
    assert result == "INDEXED!\n1"
Example No. 14
 def test_simple_fragment(self):
     terms = frozenset(("bravo", "india"))
     sa = analysis.StandardAnalyzer()
     sf = highlight.SimpleFragmenter(size=20)
     uc = highlight.UppercaseFormatter()
     htext = highlight.highlight(self._doc, terms, sa, sf, uc)
     self.assertEqual(htext, "alfa BRAVO charlie...hotel INDIA juliet kilo")
Example No. 15
 def test_html_format(self):
     terms = frozenset(("bravo", "india"))
     sa = analysis.StandardAnalyzer()
     cf = highlight.ContextFragmenter(terms, surround=6)
     hf = highlight.HtmlFormatter()
     htext = highlight.highlight(self._doc, terms, sa, cf, hf)
     self.assertEqual(htext, 'alfa <strong class="match term0">bravo</strong> charlie...hotel <strong class="match term1">india</strong> juliet')
Example No. 16
 def test_simple_fragment(self):
     terms = frozenset(("bravo", "india"))
     sa = analysis.StandardAnalyzer()
     sf = highlight.SimpleFragmenter(size=20)
     uc = highlight.UppercaseFormatter()
     htext = highlight.highlight(self._doc, terms, sa, sf, uc)
     self.assertEqual(htext, "alfa BRAVO charlie...hotel INDIA juliet kilo")
Example No. 17
def test_context_at_start():
    terms = frozenset(["alfa"])
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=15)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "ALFA bravo charlie delta echo foxtrot"
Example No. 18
def test_context_at_start():
    terms = frozenset(["alfa"])
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=15)
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, uc)
    assert htext == "ALFA bravo charlie delta echo foxtrot"
Example No. 19
def test_html_format():
    terms = frozenset(("bravo", "india"))
    sa = analysis.StandardAnalyzer()
    cf = highlight.ContextFragmenter(surround=6)
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(_doc, terms, sa, cf, hf)
    assert htext == 'alfa <strong class="match term0">bravo</strong> charlie...hotel <strong class="match term1">india</strong> juliet'
Example No. 20
 def _process_results(self, raw_results, highlight=False, query_string=''):
     from haystack import site
     results = []
     hits = len(raw_results)
     facets = {}
     spelling_suggestion = None
     indexed_models = site.get_indexed_models()
     
     for doc_offset, raw_result in enumerate(raw_results):
         raw_result = dict(raw_result)
         app_label, model_name = raw_result['django_ct'].split('.')
         additional_fields = {}
         
         for key, value in raw_result.items():
             additional_fields[str(key)] = self._to_python(value)
         
         del(additional_fields['django_ct'])
         del(additional_fields['django_id'])
         
         if highlight:
             from whoosh import analysis
             from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
             sa = analysis.StemmingAnalyzer()
             terms = [term.replace('*', '') for term in query_string.split()]
             
             # DRL_FIXME: Highlighting doesn't seem to work properly in testing.
             additional_fields['highlighted'] = {
                 self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
             }
         
         # Requires Whoosh 0.1.20+.
         if hasattr(raw_results, 'score'):
             score = raw_results.score(doc_offset)
         else:
             score = None
         
         if score is None:
             score = 0
         
         model = get_model(app_label, model_name)
         
         if model:
             if model in indexed_models:
                 result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
                 results.append(result)
             else:
                 hits -= 1
         else:
             hits -= 1
     
     if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
         spelling_suggestion = self.create_spelling_suggestion(query_string)
     
     return {
         'results': results,
         'hits': hits,
         'facets': facets,
         'spelling_suggestion': spelling_suggestion,
     }
Example No. 21
def test_html_escape():
    terms = frozenset(["bravo"])
    sa = analysis.StandardAnalyzer()
    wf = highlight.WholeFragmenter()
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(u('alfa <bravo "charlie"> delta'), terms, sa,
                                wf, hf)
    assert htext == 'alfa &lt;<strong class="match term0">bravo</strong> "charlie"&gt; delta'
Example No. 22
def test_html_escape():
    terms = frozenset(["bravo"])
    sa = analysis.StandardAnalyzer()
    wf = highlight.WholeFragmenter()
    hf = highlight.HtmlFormatter()
    htext = highlight.highlight(u('alfa <bravo "charlie"> delta'), terms, sa,
                                wf, hf)
    assert htext == 'alfa &lt;<strong class="match term0">bravo</strong> "charlie"&gt; delta'
Example No. 23
 def test_sentence_fragment(self):
     text = u"This is the first sentence. This one doesn't have the word. This sentence is the second. Third sentence here."
     terms = ("sentence", )
     sa = analysis.StandardAnalyzer(stoplist=None)
     sf = highlight.SentenceFragmenter()
     uc = highlight.UppercaseFormatter()
     htext = highlight.highlight(text, terms, sa, sf, uc)
     self.assertEqual(htext, "This is the first SENTENCE...This SENTENCE is the second...Third SENTENCE here")
Example No. 24
 def get_highlighted_summary(self, summary, query, analyzer, frag, format):
     summary = unicode(summary.replace("\n", " "))
     if len(summary) > 350:
         summary = unicode(summary.replace("\n", " "))[0:350] + "..."
     hl = highlight(summary, query, analyzer, frag, format)
     if hl:
         return hl
     else:
         return summary
Example No. 25
	def get_highlighted_summary(self,summary,query, analyzer,frag,format):
		summary = unicode(summary.replace('\n', ' '))
		if len(summary) > 350:
			summary = unicode(summary.replace('\n', ' '))[0:350]+'...'
		hl = highlight(summary,query,analyzer,frag,format)
		if hl:
			return hl
		else:
			return summary
Example No. 26
def test_sentence_fragment():
    text = u("This is the first sentence. This one doesn't have the word. " +
             "This sentence is the second. Third sentence here.")
    terms = ("sentence", )
    sa = analysis.StandardAnalyzer(stoplist=None)
    sf = highlight.SentenceFragmenter()
    uc = highlight.UppercaseFormatter()
    htext = highlight.highlight(text, terms, sa, sf, uc)
    assert htext == "This is the first SENTENCE...This SENTENCE is the second...Third SENTENCE here"
Example No. 27
 def test_null_fragment(self):
     terms = frozenset(("bravo", "india"))
     sa = analysis.StandardAnalyzer()
     nf = highlight.NullFragmenter
     uc = highlight.UppercaseFormatter()
     htext = highlight.highlight(self._doc, terms, sa, nf, uc)
     self.assertEqual(
         htext,
         "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima"
     )
Example No. 28
 def highlight(self, content, top=5):
     if self.search_type != 'content':
         return ''
     hl = highlight(escape(content),
              self.highlight_items,
              analyzer=ANALYZER,
              fragmenter=FRAGMENTER,
              formatter=FORMATTER,
              top=top)
     return hl
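This wrapper and the variants that follow rely on module-level ANALYZER, FRAGMENTER and FORMATTER constants defined elsewhere in their modules. A plausible sketch of such a configuration (the specific classes and parameter values here are assumptions, not taken from the source):

from whoosh import analysis, highlight

# Assumed configuration; the original modules may use different settings.
ANALYZER = analysis.StandardAnalyzer()
FRAGMENTER = highlight.ContextFragmenter(maxchars=200, surround=20)
FORMATTER = highlight.HtmlFormatter(tagname='span', classname='match')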
Example No. 29
 def highlight(self, content, top=5):
     if self.search_type != 'content':
         return ''
     hl = highlight(text=escape(content),
                    terms=self.highlight_items,
                    analyzer=ANALYZER,
                    fragmenter=FRAGMENTER,
                    formatter=FORMATTER,
                    top=top)
     return hl
Example No. 30
 def highlight(self, content, top=5):
     if self.search_type not in ['content', 'message']:
         return ''
     hl = highlight(text=content,
                    terms=self.highlight_items,
                    analyzer=ANALYZER,
                    fragmenter=FRAGMENTER,
                    formatter=FORMATTER,
                    top=top)
     return hl
Example No. 31
 def get_object(self):
     indexer = Indexer('file')
     query = self.request.resolver_match.kwargs['query']
     docs = indexer.get_doc(url=self.request.resolver_match.kwargs['url'])
     if not len(docs):
         return {}
     query_list = query.split(' ')
     excerpts = highlight(docs[0]['body'], set(query_list),
                          StandardAnalyzer(), WholeFragmenter(),
                          HtmlFormatter())
     return {'body': excerpts, 'title': docs[0]['title']}
Example No. 32
 def highlight(self, content, top=5):
     if self.search_type not in ['content', 'message']:
         return ''
     hl = highlight(
         text=content,
         terms=self.highlight_items,
         analyzer=ANALYZER,
         fragmenter=FRAGMENTER,
         formatter=FORMATTER,
         top=top
     )
     return hl
Example No. 33
def test_workflow_manual():
    schema = fields.Schema(id=fields.ID(stored=True),
                           title=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("1"), title=u("The man who wasn't there"))
    w.add_document(id=u("2"), title=u("The dog who barked at midnight"))
    w.add_document(id=u("3"), title=u("The invisible man"))
    w.add_document(id=u("4"), title=u("The girl with the dragon tattoo"))
    w.add_document(id=u("5"), title=u("The woman who disappeared"))
    w.commit()

    with ix.searcher() as s:
        # Parse the user query
        parser = qparser.QueryParser("title", schema=ix.schema)
        q = parser.parse(u("man"))

        # Extract the terms the user used in the field we're interested in
        terms = [
            text for fieldname, text in q.all_terms() if fieldname == "title"
        ]

        # Perform the search
        r = s.search(q)
        assert_equal(len(r), 2)

        # Use the same analyzer as the field uses. To be sure, you can
        # do schema[fieldname].analyzer. Be careful not to do this
        # on non-text field types such as DATETIME.
        analyzer = schema["title"].analyzer

        # Since we want to highlight the full title, not extract fragments,
        # we'll use WholeFragmenter.
        nf = highlight.WholeFragmenter()

        # In this example we'll simply uppercase the matched terms
        fmt = highlight.UppercaseFormatter()

        outputs = []
        for d in r:
            text = d["title"]
            outputs.append(highlight.highlight(text, terms, analyzer, nf, fmt))

        assert_equal(outputs,
                     ["The invisible MAN", "The MAN who wasn't there"])
Example No. 34
    def resultExcerpt(self, results, i, ki=None):
        # FIXME: this should not be implementation specific
        if not ki:
            r = results[i]
            name = r['kitab']
            v = r['vrr'].split('-')[0]
            m = self.th.getMeta().getLatestKitabV(name, v)
            ki = self.th.getCachedKitab(m['uri'])
        num = int(results[i]['nodeIdNum'])
        node = ki.getNodeByIdNum(num)
        n = ki.toc.next(node)

        if n:
            ub = n.globalOrder
        else:
            ub = -1
        txt = node.toText(ub)

        s = set()
        #results.query.all_terms(s) # return (field,term) pairs
        # return (field,term) pairs    # self.self.__ix_searcher.reader()
        s = results.q.existing_terms(self.indexer.reader(), phrases=True)
        #s = set([i.decode('utf_8') for i in s])
        terms = dict(
            map(lambda i: (i[1], i[0]),
                filter(lambda j: j[0] == 'content' or j[0] == 'title',
                       s))).keys()
        #print "txt = [%s]" % len(txt)
        terms = [i.decode('utf_8') for i in terms]
        snippet_dummy = txt[:min(len(txt), 512)]  # dummy summary
        snippet = highlight(txt,
                            terms,
                            analyzer,
                            SentenceFragmenter(sentencechars=".!?؟\n"),
                            HtmlFormatter(between=u"\u2026\n"),
                            top=3,
                            scorer=BasicFragmentScorer,
                            minscore=1,
                            order=FIRST)
        #snippet = highlight(txt, terms, analyzer,
        #     SentenceFragmenter(sentencechars = ".!?"), ExcerptFormatter(between = u"\u2026\n"), top = 3,
        #     scorer = BasicFragmentScorer, minscore = 1,
        #     order = FIRST)
        print snippet
        if len(snippet) > 1: return snippet
        else: return snippet_dummy
Example No. 35
def test_workflow_manual():
    schema = fields.Schema(id=fields.ID(stored=True),
                           title=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("1"), title=u("The man who wasn't there"))
    w.add_document(id=u("2"), title=u("The dog who barked at midnight"))
    w.add_document(id=u("3"), title=u("The invisible man"))
    w.add_document(id=u("4"), title=u("The girl with the dragon tattoo"))
    w.add_document(id=u("5"), title=u("The woman who disappeared"))
    w.commit()

    with ix.searcher() as s:
        # Parse the user query
        parser = qparser.QueryParser("title", schema=ix.schema)
        q = parser.parse(u("man"))

        # Extract the terms the user used in the field we're interested in
        terms = [text for fieldname, text in q.all_terms()
                 if fieldname == "title"]

        # Perform the search
        r = s.search(q)
        assert len(r) == 2

        # Use the same analyzer as the field uses. To be sure, you can
        # do schema[fieldname].analyzer. Be careful not to do this
        # on non-text field types such as DATETIME.
        analyzer = schema["title"].analyzer

        # Since we want to highlight the full title, not extract fragments,
        # we'll use WholeFragmenter.
        nf = highlight.WholeFragmenter()

        # In this example we'll simply uppercase the matched terms
        fmt = highlight.UppercaseFormatter()

        outputs = []
        for d in r:
            text = d["title"]
            outputs.append(highlight.highlight(text, terms, analyzer, nf, fmt))

        assert outputs == ["The invisible MAN", "The MAN who wasn't there"]
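For comparison, Whoosh 2.x can produce the same output through the built-in highlighting support on the Results object instead of calling highlight.highlight() by hand; a sketch reusing the index and parser built in the test above:

with ix.searcher() as s:
    # terms=True records which query terms matched, enabling hit.highlights()
    r = s.search(parser.parse(u("man")), terms=True)
    r.fragmenter = highlight.WholeFragmenter()
    r.formatter = highlight.UppercaseFormatter()
    for hit in r:
        print(hit.highlights("title"))  # e.g. "The invisible MAN"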
Example No. 36
def searchBodyAndHighlight(q):
	parser = SimpleParser("body", schema=ix.schema)
	q = parser.parse(q)
	terms = [text for fieldname, text in q.all_terms()
	        if fieldname == "body"]

	r = s.search(q)
	analyzer = schema["body"].format.analyzer
	print "will tokenize with",q.all_terms
	fragmenter = highlight.ContextFragmenter(q.all_terms,400,80)
	# formatter = highlight.HtmlFormatter()
	formatter = colorIpythonFormatter

	for d in r:
		# The text argument to highlight is the stored text of the title
		text = d["body"]
		res = highlight.highlight(text, terms, analyzer, fragmenter, formatter)
		# print res.encode("latin-1", "replace")
		print unicodedata.normalize('NFKC', res).encode("utf-8", "replace")
		print "-"*8
Example No. 37
    def run(self):
        termset = []
        keywords = self.keywords.replace('+', ' ').replace('|', ' ')
        keywords = ' '.join(
            filter(lambda x: x.find('v:') != 0, keywords.split()))

        for t in keywords.split():
            termset.append(t)

        items = []
        wx.CallAfter(self.window.DisplayStarted)
        key = '%d:%d' % self.p
        if key not in dataModel:
            for i, r in enumerate(self.results[self.p[0]:self.p[1]]):
                nMin = min([len(t) for t in termset])
                nMax = max([len(t) for t in termset])
                excerpts = highlight(
                    r['content'], termset, NgramTokenizer(nMin, nMax),
                    SimpleFragmenter(size=70),
                    MyHtmlFormatter(tagname='font',
                                    attrs='size="4" color="purple"'))

                if self.lang == 'pali' and 'wxMac' not in wx.PlatformInfo:
                    excerpts = excerpts.replace(u'ฐ', u'\uf700').replace(
                        u'ญ', u'\uf70f').replace(u'\u0e4d', u'\uf711')

                if self.lang != 'thaibt':
                    items.append(
                        (self.p[0] + i + 1, r['volume'].lstrip(u'0'),
                         r['page'].lstrip(u'0'), r['items'], excerpts))
                else:
                    items.append((self.p[0] + i + 1, unicode(r['volume']),
                                  unicode(r['page']), u'0', excerpts))

                wx.CallAfter(self.window.UpdateProgress, (i + 1) * 10)
            dataModel[key] = items
        wx.CallAfter(self.window.DisplayFinished)
Example No. 38
 def _process_results(self, raw_results, highlight=False, query_string=''):
     results = []
     facets = {}
     
     for raw_result in raw_results:
         raw_result = dict(raw_result)
         app_label, module_name = raw_result['django_ct_s'].split('.')
         additional_fields = {}
         
         for key, value in raw_result.items():
             additional_fields[str(key)] = self._to_python(value)
         
         del(additional_fields['django_ct_s'])
         del(additional_fields['django_id_s'])
         # DRL_FIXME: Figure out if there's a way to get the score out of Whoosh.
         # del(additional_fields['score'])
         
         if highlight:
             from whoosh import analysis
             from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
             sa = analysis.StemmingAnalyzer()
             terms = [term.replace('*', '') for term in query_string.split()]
             
             # DRL_FIXME: Highlighting doesn't seem to work properly in testing.
             additional_fields['highlighted'] = {
                 self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
             }
         
         result = SearchResult(app_label, module_name, raw_result['django_id_s'], raw_result.get('score', 0), **additional_fields)
         results.append(result)
     
     return {
         'results': results,
         'hits': len(results),
         'facets': facets,
     }
Example No. 39
    def _process_results(self, raw_page, highlight=False, query_string="", spelling_query=None, result_class=None):
        from haystack import site

        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        indexed_models = site.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split(".")
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = site.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], "convert"):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(",")
                        else:
                            additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter

                    sa = analysis.StemmingAnalyzer()
                    terms = [term.replace("*", "") for term in query_string.split()]

                    additional_fields["highlighted"] = {
                        self.content_field_name: [
                            highlight(
                                additional_fields.get(self.content_field_name),
                                terms,
                                sa,
                                ContextFragmenter(terms),
                                UppercaseFormatter(),
                            )
                        ]
                    }

                result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if getattr(settings, "HAYSTACK_INCLUDE_SPELLING", False):
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {"results": results, "hits": hits, "facets": facets, "spelling_suggestion": spelling_suggestion}
Example No. 40
analyzer = schema["title"].format.analyzer

# Since we want to highlight the full title, not extract fragments,
# we'll use NullFragmenter. See the docs for the highlight module
# for which fragmenters are available.
fragmenter = highlight.NullFragmenter

# This object controls what the highlighted output looks like.
# See the docs for its arguments.
formatter = highlight.HtmlFormatter()

for d in r:
   # The text argument to highlight is the stored text of the title
   text = d["title"]

   print highlight.highlight(text, terms, analyzer,
                             fragmenter, formatter)

# do it on the body now
def colorIpythonFormatter(text, fragments):
	resultFrags = []
	addedChars = 0  # each highlight adds 11 chars (the tc.Red + tc.Normal escape codes)
	for f in fragments:
		for tok in f.matches:
			text = text[:tok.startchar+addedChars] + tc.Red + text[tok.startchar+addedChars:tok.endchar+addedChars] + tc.Normal + text[tok.endchar+addedChars:]
			addedChars += 11
		resultFrags.append(text[f.startchar+addedChars:f.endchar+15+addedChars])
	return " [...] ".join(resultFrags)

Example No. 41
 
 for doc_offset, raw_result in enumerate(raw_results):
     raw_result = dict(raw_result)
     final_result = {}
     
     for key, value in raw_result.items():
         final_result[str(key)] = self._to_python(value)
     
     if highlight:
         from whoosh import analysis
         from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
         sa = analysis.StemmingAnalyzer()
         terms = [term.replace('*', '') for term in query_string.split()]
         
         # DRL_FIXME: Highlighting doesn't seem to work properly in testing.
          # This variant stores the converted fields in final_result, not additional_fields.
          results['highlighted'][self.content_field_name] = [highlight(final_result.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())]
     
     # Requires Whoosh 0.1.20+.
     if hasattr(raw_results, 'score'):
         final_result['score'] = raw_results.score(doc_offset)
     else:
         final_result['score'] = 0
     
     results['docs'].append(final_result)
 
 if self.include_spelling:
     results['spelling_suggestion'] = self.create_spelling_suggestion(query_string)
 
 # DRL_FIXME: This needs to be corrected.
 results['hits'] = len(results['docs'])
 return results
Example No. 42
    def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None, result_class=None):
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(',')
                        else:
                            additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del(additional_fields[DJANGO_CT])
                del(additional_fields[DJANGO_ID])

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                    sa = analysis.StemmingAnalyzer()
                    terms = [term.replace('*', '') for term in query_string.split()]

                    additional_fields['highlighted'] = {
                        self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                    }

                result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
Example No. 43
    def _process_results(self,
                         raw_page,
                         highlight=False,
                         query_string='',
                         spelling_query=None,
                         result_class=None):
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                    sa = analysis.StemmingAnalyzer()
                    terms = [
                        term.replace('*', '') for term in query_string.split()
                    ]

                    additional_fields['highlighted'] = {
                        self.content_field_name: [
                            highlight(
                                additional_fields.get(self.content_field_name),
                                terms, sa, ContextFragmenter(terms),
                                UppercaseFormatter())
                        ],
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
Example No. 44
 def highlight(self, text, words):
     fragmenter = ContextFragmenter()
     formatter = HtmlFormatter()
     analyzer = self.project_schema['text'].analyzer
     return highlight(text, words, analyzer, fragmenter, formatter, top=1)
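For reference, a self-contained call mirroring the method above: top=1 keeps only the single best-scoring fragment (the sample text and term here are made up):

from whoosh import analysis
from whoosh.highlight import highlight, ContextFragmenter, HtmlFormatter

# Returns at most one HTML-marked-up excerpt around "bravo".
excerpt = highlight(u"alfa bravo charlie delta", [u"bravo"],
                    analysis.StandardAnalyzer(),
                    ContextFragmenter(), HtmlFormatter(), top=1)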
Example No. 45
def run_query(query, index):
    """
      Queries the index for data with the given text query

        @param  query   The text query to perform on the indexed data
        @return			A list of HTMl string snippets to return
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {'content': 1.0, 'title': 3.0}
    query_parser = MultifieldParser(['content', 'title'],
                                    schema=index_schema,
                                    fieldboosts=field_boosts,
                                    group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short. (A list comprehension avoids the bug of
    # mutating the list while iterating over it, which skips elements.)
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    #   by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    #   highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content':
            highlight(search_result['content'], search_terms, analyzer,
                      fragmenter, formatter),
            'url':
            search_result['url'],
            'title':
            search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
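A hypothetical call site for run_query (the ix index object and the query string are assumptions; the function also expects the module-level index_schema used above):

# Run a query and print the highlighted result pages.
results, terms, suggestions, count = run_query(u"whoosh highlighting", ix)
for page in results:
    print("%s - %s" % (page['title'], page['url']))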
Example No. 46
def run_query(query, index):
    """
      Queries the index for data with the given text query

        @param  query   The text query to perform on the indexed data
        @return			A list of HTMl string snippets to return
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {
        'content': 1.0,
        'title': 3.0
    }
    query_parser = MultifieldParser(['content', 'title'], schema=index_schema, fieldboosts=field_boosts, group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short. (A list comprehension avoids the bug of
    # mutating the list while iterating over it, which skips elements.)
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    #   by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    #   highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content': highlight(search_result['content'], search_terms, analyzer, fragmenter, formatter),
            'url': search_result['url'],
            'title': search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
Example No. 47
def parse_response(reader,
                   fieldname,
                   analyzer,
                   fragmenter,
                   formatter,
                   query,
                   results,
                   results_are_page=False):
    """
    Returns an ifind Response, given a query and set of results from Whoosh/Redis.
    Takes an ifind Query object and a list of SORTED results for the given query.

    If the page requested (query.skip) is < 0, page 1 is returned.
    If the page requested is greater than the number of available pages, the last page is returned.
    """
    def get_term_list():
        if isinstance(query.parsed_terms, unicode):
            return [query.parsed_terms]

        return [
            text for term_fieldname, text in query.parsed_terms.all_terms()
            if term_fieldname == fieldname
        ]

    response = Response(query.terms)
    response.results_total = len(results)

    if results_are_page:
        page = results[0]
        response.total_pages = results[1]
        results = results[2]
    else:
        page, response.total_pages, results = get_page(query, results)

    page_len = query.top

    i = 0

    for result in results:
        i = i + 1
        rank = (page - 1) * page_len + i
        whoosh_docnum = result[0]
        score = result[1]
        stored_data = reader.stored_fields(whoosh_docnum)

        title = stored_data['title']

        if title:
            title = title.strip()
        else:
            title = "Untitled Document"

        url = "/treconomics/{0}/".format(whoosh_docnum)
        trecid = stored_data['docid'].strip()
        source = stored_data['source'].strip()

        summary = highlight(stored_data['content'], get_term_list(), analyzer,
                            fragmenter, formatter)
        summary = "{0}...".format(summary)

        response.add_result(title=title,
                            url=url,
                            summary=summary,
                            docid=trecid,
                            source=source,
                            rank=rank,
                            whooshid=whoosh_docnum,
                            score=score)

    # The following two lines are for compatibility purposes with the existing codebase.
    # Would really like to take these out.
    setattr(response, 'results_on_page', len(results))
    setattr(response, 'actual_page', page)

    return response
Example No. 48
    def _process_results(
        self, raw_results, start_offset, end_offset, highlight=False, query_string="", spelling_query=None
    ):
        from haystack import site

        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_results)
        raw_results = raw_results[start_offset:end_offset]

        facets = {}
        spelling_suggestion = None
        indexed_models = site.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_results):
            raw_result = dict(raw_result)
            app_label, model_name = raw_result["django_ct"].split(".")
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = site.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], "convert"):
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del (additional_fields["django_ct"])
                del (additional_fields["django_id"])

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter

                    sa = analysis.StemmingAnalyzer()
                    terms = [term.replace("*", "") for term in query_string.split()]

                    # DRL_FIXME: Highlighting doesn't seem to work properly in testing.
                    additional_fields["highlighted"] = {
                        self.content_field_name: [
                            highlight(
                                additional_fields.get(self.content_field_name),
                                terms,
                                sa,
                                ContextFragmenter(terms),
                                UppercaseFormatter(),
                            )
                        ]
                    }

                # Requires Whoosh 0.1.20+.
                if hasattr(raw_results, "score"):
                    score = raw_results.score(doc_offset)
                else:
                    score = None

                if score is None:
                    score = 0

                result = SearchResult(app_label, model_name, raw_result["django_id"], score, **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if getattr(settings, "HAYSTACK_INCLUDE_SPELLING", False):
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {"results": results, "hits": hits, "facets": facets, "spelling_suggestion": spelling_suggestion}
Example No. 49
def test_issue324():
    sa = analysis.StemmingAnalyzer()
    result = highlight.highlight(u("Indexed!\n1"), [u("index")], sa,
                                 fragmenter=highlight.ContextFragmenter(),
                                 formatter=highlight.UppercaseFormatter())
    assert result == "INDEXED!\n1"
Example No. 50
 def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
     from haystack import site
     results = []
     
     # It's important to grab the hits first before slicing. Otherwise, this
     # can cause pagination failures.
     hits = len(raw_page)
     
     facets = {}
     spelling_suggestion = None
     indexed_models = site.get_indexed_models()
     
     for doc_offset, raw_result in enumerate(raw_page):
         score = raw_page.score(doc_offset) or 0
         app_label, model_name = raw_result['django_ct'].split('.')
         additional_fields = {}
         model = get_model(app_label, model_name)
         
         if model and model in indexed_models:
             for key, value in raw_result.items():
                 index = site.get_index(model)
                 string_key = str(key)
                 
                 if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                     # Special-cased due to the nature of KEYWORD fields.
                     if isinstance(index.fields[string_key], MultiValueField):
                          if value is None or len(value) == 0:
                             additional_fields[string_key] = []
                         else:
                             additional_fields[string_key] = value.split(',')
                     else:
                         additional_fields[string_key] = index.fields[string_key].convert(value)
                 else:
                     additional_fields[string_key] = self._to_python(value)
             
             del(additional_fields['django_ct'])
             del(additional_fields['django_id'])
             
             if highlight:
                 from whoosh import analysis
                 from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                 sa = analysis.StemmingAnalyzer()
                 terms = [term.replace('*', '') for term in query_string.split()]
                 
                 additional_fields['highlighted'] = {
                     self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                 }
             
             result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
             results.append(result)
         else:
             hits -= 1
     
     if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
         if spelling_query:
             spelling_suggestion = self.create_spelling_suggestion(spelling_query)
         else:
             spelling_suggestion = self.create_spelling_suggestion(query_string)
     
     return {
         'results': results,
         'hits': hits,
         'facets': facets,
         'spelling_suggestion': spelling_suggestion,
     }
Example No. 51
def parse_response(reader, fieldname, analyzer, fragmenter, formatter, query, results, results_are_page=False):
    """
    Returns an ifind Response, given a query and set of results from Whoosh/Redis.
    Takes an ifind Query object and a list of SORTED results for the given query.

    If the page requested (query.skip) is < 0, page 1 is returned.
    If the page requested is greater than the number of available pages, the last page is returned.
    """

    def get_term_list():
        if isinstance(query.parsed_terms, unicode):
            return [query.parsed_terms]

        return [text for term_fieldname, text in query.parsed_terms.all_terms() if term_fieldname == fieldname]

    response = Response(query.terms)
    response.results_total = len(results)

    if results_are_page:
        page = results[0]
        response.total_pages = results[1]
        results = results[2]
    else:
        page, response.total_pages, results = get_page(query, results)

    page_len = query.top

    i = 0

    for result in results:
        i = i + 1
        rank = (page - 1) * page_len + i
        whoosh_docnum = result[0]
        score = result[1]
        stored_data = reader.stored_fields(whoosh_docnum)

        title = stored_data["title"]

        if title:
            title = title.strip()
        else:
            title = "Untitled Document"

        url = "/treconomics/{0}/".format(whoosh_docnum)
        trecid = stored_data["docid"].strip()
        source = stored_data["source"].strip()

        summary = highlight(stored_data["content"], get_term_list(), analyzer, fragmenter, formatter)
        summary = "{0}...".format(summary)

        response.add_result(
            title=title,
            url=url,
            summary=summary,
            docid=trecid,
            source=source,
            rank=rank,
            whooshid=whoosh_docnum,
            score=score,
        )

    # The following two lines are for compatibility purposes with the existing codebase.
    # Would really like to take these out.
    setattr(response, "results_on_page", len(results))
    setattr(response, "actual_page", page)

    return response