Exemplo n.º 1
0
	def get_ranklist(self, topic_id, run_id, session_id):
		"""Return the stored ranklist for (topic_id, run_id) as display rows.

		Each row is ``[rank, info_dict]`` sorted by rank, where ``info_dict``
		holds id/title/url/highlighted summary/site/category plus a
		``bookmarked`` flag (1/0) derived from this session's bookmarks.
		"""
		res = self.get(topic_id=topic_id, run_id=run_id)
		frag = WholeFragmenter()
		analyzer = FancyAnalyzer()
		# Renamed from 'format' to avoid shadowing the builtin.
		formatter = HtmlFormatter(tagname="b")

		ranklist = js.loads(res.ranklist)
		# Map doc_id -> original rank so the stored ordering survives the
		# unordered queryset fetch below.
		id_ranklist = dict((doc_id, i) for i, doc_id in enumerate(ranklist))
		docs = Document.objects.filter(doc_id__in=ranklist)
		bookmarks = Bookmark.objects.filter(topic_id=topic_id,
										session_id=session_id)
		# Tokenize the topic text once; these terms drive summary highlighting.
		query = [q.text for q in analyzer(Topic.objects.get(topic_id=topic_id).topic_text)]
		# BUGFIX: str.strip("_bookmark") removed any leading/trailing chars in
		# the set {_,b,o,k,m,a,r}, corrupting ids such as "mark_12"; only the
		# literal "_bookmark" suffix should be removed.
		suffix = "_bookmark"
		bookmarks = set(
			b.doc_id[:-len(suffix)] if b.doc_id.endswith(suffix) else b.doc_id
			for b in bookmarks)  # set -> O(1) membership test below
		# (Removed a leftover debug loop that shadowed the builtins
		# 'sum'/'id' and printed blank lines via a bare Python-2 'print'.)
		docs = [[id_ranklist[d.doc_id],
			{
				'id': d.doc_id,
				'title': '.' if d.title == '' else d.title,
				'url': d.url if len(d.url) <= 80 else d.url[0:80] + '...',
				'summary': self.get_highlighted_summary(d.summary, query, analyzer, frag, formatter),
				'site': d.site.site_name,
				'category': d.site.category.split(","),
				'bookmarked': 1 if d.doc_id in bookmarks else 0
			}] for d in docs]
		docs.sort(key=operator.itemgetter(0))

		return docs
Exemplo n.º 2
0
    def search(self,
               query_str,
               limit=30,
               html=True,
               description=True,
               comments=False,
               search_comments=True,
               highlight=True):
        """Query the ticket index and report each matching issue.

        Searches summary/description (plus comment text unless
        ``search_comments`` is False). Optionally strips HTML, and applies
        term highlighting to the description when requested.
        """
        idx = self._get_index()
        searcher = idx.searcher()

        search_fields = ["summary", "description"]
        if search_comments:
            search_fields.append("comments_str")
        parsed = MultifieldParser(search_fields, schema=idx.schema).parse(query_str)

        hits = searcher.search(parsed, limit=limit)
        hits.formatter = AnsiColorFormatter()
        hits.fragmenter = WholeFragmenter()
        self.report(hits)

        for hit in hits:
            ticket = Ticket.get_by_id(hit["key"])
            rendered = Issue(ticket.data).to_string(with_description=description,
                                                    with_comments=comments)
            if not html:
                rendered = self._html_to_text.handle(rendered)
            rendered = rendered.strip()
            if highlight and description:
                marked = hit.highlights("description", text=rendered)
                if marked:
                    rendered = marked
            self.report(rendered)
            if description or comments:
                self.report("-" * 80)
Exemplo n.º 3
0
    def get_results(self):
        """Run the stored query (building it lazily if needed) and return
        all hits with whole-document fragments."""
        if self.searcher is None:
            self.search()  # lazily initialize searcher/query

        hits = self.searcher.search(self.query, limit=None)
        hits.fragmenter = WholeFragmenter()
        return hits
Exemplo n.º 4
0
 def doSearch(self, text):
     """Search the index for *text* and refresh the result widgets.

     Uses BM25F scoring with a 0.5 s time limit; on timeout, whatever
     partial results were collected are still displayed. Populates
     ``self.results`` (matched msids) and ``self.searchResults`` (display
     strings with highlights where the match occurred).
     """
     q = self.qp.parse(text)          # build query with event-provided search key
     with self.ix.searcher(weighting = scoring.BM25F) as s:    # there are several NLP style scorers for Whoosh
         c = s.collector(limit=self.MaxResults)                # The "collector" allows setting the timeout for a search. In this case it's 0.5 seconds which is a little long...
         c = TimeLimitCollector(c,0.5)
         try:
             s.search_with_collector(q,c)
         # BUGFIX: bare 'except:' also swallowed SystemExit and
         # KeyboardInterrupt; narrow to Exception (the TimeLimit error
         # raised by the collector is an Exception subclass).
         except Exception:
             print("TIMEOUT!")                       # DEBUG out put to console if we're timing out a lot
         results = c.results()                       # If we do get a timeout, still return whatever we've got, i.e. partial results
                                                     #-----------------------------------------------------
         self.searchResults.clear()                  # ** Now format the results for display **
         results.fragmenter = WholeFragmenter()      # we want the full technical name not just the local context.
         self.MaudeResults.clear()                  # Clear
         if len(results)> 0:
             self.results = []
             for res in results:
                 self.results.append(res['msid'])
                 HighLightedMsid = res.highlights('msid')  # construct MSID string with highlights, if that's where the match is...
                 if len(HighLightedMsid) >0:
                     msid_str = HighLightedMsid
                 else:
                     msid_str = res['msid']
                 HighLightedTechName = res.highlights('technical_name')  # construct technical_name string with highlights, if relevant
                 if len(HighLightedTechName) >0:
                     tech_str = HighLightedTechName
                 else:
                     tech_str = res['technical_name']
                 self.searchResults.append(msid_str + ' - ' + tech_str)
         cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)     # return cursor to beginning of search results
Exemplo n.º 5
0
def searchPage(keyword, curPage=1, pagelen=10):
    """Run a paged search over title/content/createAt.

    Returns a dict with ``pagenum``, ``pagecount``, ``total`` and a
    ``posts`` list of {title, author, location, summary} entries with
    HTML-highlighted title/summary and forward-slash-normalized locations.
    """
    with ix.searcher() as searcher:
        html_fmt = HtmlFormatter(tagname="code", classname="match",
                                 termclass="term")
        whole = WholeFragmenter(charlimit=None)
        qparser = MultifieldParser(["title", "content", 'createAt'],
                                   schema=ix.schema)
        qparser.add_plugin(DateParserPlugin())  # allow date syntax in queries
        query = qparser.parse(keyword)
        page = searcher.search_page(query, curPage, pagelen)
        page.results.fragmenter = whole
        page.results.formatter = html_fmt
        result_page = dict(pagenum=page.pagenum,
                           pagecount=page.pagecount,
                           total=page.total,
                           posts=[])
        for hit in page:
            post = dict()
            post['title'] = hit.highlights("title", minscore=0)
            post['author'] = hit["author"]
            # Normalize OS-specific separators to URL-style slashes.
            post['location'] = hit["location"].replace(os.sep,
                                                       '/').replace('//', '/')
            if post['location'].startswith('/'):
                post['location'] = post['location'][1:]
            post['summary'] = hit.highlights("summary", minscore=0)
            result_page['posts'].append(post)
        return result_page
Exemplo n.º 6
0
 def get_object(self):
     """Return the highlighted body and title for the URL named in the
     route kwargs, or an empty dict when the document is not indexed."""
     route_kwargs = self.request.resolver_match.kwargs
     query = route_kwargs['query']
     matches = Indexer('file').get_doc(url=route_kwargs['url'])
     if not len(matches):
         return {}
     terms = set(query.split(' '))
     body = highlight(matches[0]['body'], terms,
                      StandardAnalyzer(), WholeFragmenter(),
                      HtmlFormatter())
     return {'body': body, 'title': matches[0]['title']}
Exemplo n.º 7
0
    def search(self,
               collector,
               query_str1=None,
               query_str2=None,
               itemtypes=(),
               highlight=False):
        """Search the index, optionally constrained by item types.

        Returns a list of (label, path, sortkey, prio, text) tuples sorted
        by (sortkey, prio); ``text`` is the (optionally highlighted)
        content when ``highlight`` is set, else None. Returns [] on query
        parse errors, pure-wildcard keywords, or collector abort.
        """

        # Reject keywords that are nothing but '*' and '?' wildcards.
        if query_str1:
            for kw in (s.strip() for s in query_str1.split()):
                if not kw.replace("*", "").replace("?", "").strip():
                    return []

        wildcard = (query_str1 and any(c in query_str1 for c in "*?"))

        parser = self._parser_wild if wildcard else self._parser
        asf_parser = self._asf_parser

        with self._index.searcher() as searcher:
            andlist = []
            try:
                if query_str1:
                    andlist.append(parser.parse(query_str1))
                if query_str2:
                    andlist.append(asf_parser.parse(query_str2))
            # BUGFIX: bare 'except:' also caught SystemExit and
            # KeyboardInterrupt; parse failures are Exception subclasses.
            except Exception:
                return []

            if itemtypes:
                if len(itemtypes) > 1:
                    andlist.append(Or([Term('itemtype', t)
                                       for t in itemtypes]))
                else:
                    andlist.append(Term('itemtype', itemtypes[0]))

            query = And(andlist)

            searcher.search_with_collector(query, collector)
            hits = collector.results()

            if highlight:
                hits.fragmenter = WholeFragmenter()
                hits.formatter = HtmlFormatter(tagname='span',
                                               classname='s_match',
                                               termclass='s_term')

            if wildcard and query_str1:
                # Post-filter hits against the raw pattern, ignoring
                # dashes/spaces in it.
                pat = query_str1.replace("-", "").replace(" ", "")
                wildmatch = re.compile(fnmatch.translate(pat))

            # Construct a result list
            results = []
            for hit in hits:
                if collector.aborted:
                    return []
                (label, path, prio, sortkey) = hit['data']

                if wildcard and query_str1:
                    if not wildmatch.match(sortkey):
                        continue

                if highlight:
                    if query_str1:
                        text = hit.highlights('content')
                    else:
                        text = hit['content']
                else:
                    text = None

                results.append((label, path, sortkey, prio, text))

            sortkey_prio_getter = itemgetter(2, 3)
            results.sort(key=sortkey_prio_getter)

            # Return
            return results

# customize highlight formatter
class HighlightFormatter(Formatter):
    """Whoosh formatter wrapping each matched term in HTML <mark> tags."""

    def format_token(self, text, token, replace=False):
        # Use the get_text function to get the text corresponding to the
        # token
        tokentext = get_text(text, token, replace)

        # BUGFIX: the closing tag was written as '<mark>' (never closed),
        # producing invalid HTML; emit a proper '</mark>'.
        return "<mark>%s</mark>" % tokentext


# Module-level Whoosh helpers shared by every request handler below.
hf = HighlightFormatter()  # formatter for highlighting
wf = WholeFragmenter()  # fragmenter for splitting words
es_ana = LanguageAnalyzer("es")  # Whoosh analyzer for Spanish

# Load Whoosh index
# NOTE(review): opened at import time — the "whoosh_index" directory must
# exist relative to the working directory, or import fails.
index = open_dir("whoosh_index")

# Initialize Whoosh parser
parser = QueryParser("text", schema=index.schema)


@app.route("/")
def load_index():
    return render_template("index.html")


@app.route("/api/greguerias/all/", methods=['GET'])
Exemplo n.º 9
0
@app.context_processor
def inject_render_time():
    """Expose the elapsed request time to every template context."""
    elapsed = time.time() - g.start_time
    return {'render_time': elapsed}

@app.route('/')
def index():
    """Serve the search front page."""
    template = 'index.html'
    return render_template(template)

from whoosh.index import open_dir
# Open the 'mobi' index read-only at import time; the 'index' directory
# must already exist.
ix = open_dir('index', 'mobi', readonly=True)

from whoosh.query import Or
from whoosh.qparser import QueryParser

from whoosh.highlight import WholeFragmenter
# Shared fragmenter: emit whole field values rather than short snippets.
fragmenter = WholeFragmenter()

@app.route('/search/<name>')
def search(name):
    """Search books by title OR author and render up to 100 matches."""
    with ix.searcher() as searcher:
        title_q = QueryParser("title", ix.schema).parse(name)
        author_q = QueryParser("author", ix.schema).parse(name)
        hits = searcher.search(Or([title_q, author_q]), limit=100)
        hits.fragmenter = fragmenter
        return render_template('index.html', results=hits)

from sae.const import APP_NAME
def book_url(hit):
    """Build the public SAE storage URL for a hit's .mobi file."""
    parts = (APP_NAME, hit['bucket'], hit['author'], hit['title'])
    return 'http://%s-%s.stor.sinaapp.com/%s/%s.mobi' % parts