def get_ranklist(self, topic_id, run_id, session_id):
    res = self.get(topic_id=topic_id, run_id=run_id)
    frag = WholeFragmenter()
    analyzer = FancyAnalyzer()
    format = HtmlFormatter(tagname="b")
    ranklist = js.loads(res.ranklist)
    # To keep the ranking: map doc_id -> rank position
    id_ranklist = dict([(ranklist[i], i) for i in range(len(ranklist))])
    docs = Document.objects.filter(doc_id__in=ranklist)
    bookmarks = Bookmark.objects.filter(topic_id=topic_id, session_id=session_id)
    # get the query terms for highlighting
    query = [q.text for q in analyzer(Topic.objects.get(topic_id=topic_id).topic_text)]
    # get the bookmarked docids
    bookmarks = [b.doc_id.replace("_bookmark", "") for b in bookmarks]  # drop the "_bookmark" suffix
    bookmarks = set(bookmarks)  # faster lookup
    docs = [[id_ranklist[d.doc_id], {
        'id': d.doc_id,
        'title': '.' if d.title == '' else d.title,
        'url': d.url if len(d.url) <= 80 else d.url[0:80] + '...',
        'summary': self.get_highlighted_summary(d.summary, query, analyzer, frag, format),
        'site': d.site.site_name,
        'category': d.site.category.split(","),
        'bookmarked': 1 if d.doc_id in bookmarks else 0
    }] for d in docs]
    docs.sort(key=operator.itemgetter(0))
    return docs

def search(self):
    c.terms = request.GET.get('terms', '')
    c.results = []
    if len(c.terms) < 4:
        h.flash(
            _('Search queries must be at least 4 characters in length.'),
            'error'
        )
        redirect(url(controller='blog', action='index'))
    query = MultifieldParser(
        ['title', 'content', 'summary'],
        schema=index.schema
    ).parse(c.terms)
    results = index.searcher().search(query, limit=10)
    for result in results:
        terms = [v for k, v in query.all_terms() if k == 'content']
        url_kwargs = json.loads(result['url'])
        result['url'] = url(**url_kwargs)
        result['highlights'] = highlight(
            result['content'],
            terms,
            search.schema['content'].format.analyzer,
            ContextFragmenter(terms),
            HtmlFormatter(tagname='span', classname='highlight')
        )
        c.results.append(result)
    return render('search.tpl', slacks=True)

def get_suggestion(q):
    ix = open_dir('index')
    with ix.searcher() as searcher:  # A searcher opens several files, so it is best used in a "with" block so they also get closed
        # og = OrGroup.factory(0.9)  # From the official docs: give higher priority when more terms match
        # parser = MultifieldParser(['title', 'content'], ix.schema, {'title': 1.0, 'content': 0.2}, group=og)  # Give priority to titles
        #
        # parser.add_plugin(PlusMinusPlugin())  # Allows + and - to mark terms that must or must not appear
        # parser.add_plugin(FuzzyTermPlugin())  # Allows searching for inexact terms with the tilde (~)
        # parser.add_plugin(ForceFuzzyPlugin())  # A hack of mine, see below
        parser = QueryParser("nTitle", schema=ix.schema)
        query = parser.parse(q)
        # print(query)  # For debugging
        results = searcher.search(query)
        results.formatter = HtmlFormatter(between=" &hellip; ")  # A matter of taste
        print(results, "schema: ", ix.schema)
        for r in results:
            titleHigh = r.highlights("title")
            print(titleHigh)
            title = titleHigh if titleHigh else r['title']  # If the term is not in the title, nothing would be printed without this line

def __init__(self, index=None, search_fields=["title", "content"],
             html_formatter=None, parser=None, termclass=Term):
    """Class for searching across several fields.

    :param index: instance of the Index object
    :type index: whoosh.index.Index
    :param search_fields: list of the fields to search in
    :type search_fields: list
    :param html_formatter: instance that formats the hits
    :type html_formatter: whoosh.highlight.HtmlFormatter
    """
    self.index = index or open_dir(INDEX_DIR)
    self.html_formatter = html_formatter or HtmlFormatter(
        between="...",
        tagname="strong",
        classname="search-match",
        termclass="search-term")
    self.search_fields = search_fields
    self.termclass = termclass
    self.parser = parser or qparser.MultifieldParser(
        self.search_fields, self.index.schema, termclass=termclass)

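# A minimal usage sketch, not from the original class: it assumes the schema stores the
# default "title" and "content" fields listed in `search_fields`, and shows how the parser
# and the HtmlFormatter built in __init__ would typically be combined to return highlighted hits.
def search(self, query_string, limit=10):
    query = self.parser.parse(query_string)
    with self.index.searcher() as searcher:
        results = searcher.search(query, limit=limit)
        results.formatter = self.html_formatter  # reuse the configured HtmlFormatter
        return [(hit["title"], hit.highlights("content")) for hit in results]
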
def searchPage(keyword, curPage=1, pagelen=10):
    with ix.searcher() as searcher:
        # res = dict()
        # parser = QueryParser('content', schema=ix.schema)
        hf = HtmlFormatter(tagname="code", classname="match", termclass="term")
        fragmenter = WholeFragmenter(charlimit=None)
        parser = MultifieldParser(["title", "content", 'createAt'], schema=ix.schema)
        parser.add_plugin(DateParserPlugin())
        q = parser.parse(keyword)
        page = searcher.search_page(q, curPage, pagelen)  # , terms=True
        page.results.fragmenter = fragmenter
        # page.results.fragmenter.charlimit = None
        page.results.formatter = hf
        # terms = page.results.matched_terms()
        # key = [e for e in terms][0][1].decode('UTF-8')
        resPage = dict(pagenum=page.pagenum,
                       pagecount=page.pagecount,
                       total=page.total,
                       posts=[])
        for hint in page:
            tmp = dict()
            tmp['title'] = hint.highlights("title", minscore=0)
            tmp['author'] = hint["author"]
            tmp['location'] = hint["location"].replace(os.sep, '/').replace('//', '/')
            if tmp['location'].startswith('/'):
                tmp['location'] = tmp['location'][1:]
            tmp['summary'] = hint.highlights("summary", minscore=0)  # hint["content"].replace(key, "<code>%s</code>" % key)
            resPage['posts'].append(tmp)
        return resPage

def search():
    q = qp.parse(request.args.get('query'))
    try:
        limit = int(request.args.get('max_docs'))
    except (TypeError, ValueError):
        limit = None
    if limit is None or limit < 1:
        limit = 10
    results_list = []
    with ix.searcher() as searcher:
        results = searcher.search(q, limit=limit)
        results.formatter = HtmlFormatter(tagname="span")
        n_results = len(results)
        for hit in results:
            result = dict()
            result["filename"] = hit["title"]
            with open(os.path.join(path, "data", hit["filename"]), encoding="utf-8") as file:
                filecontents = file.read()
            result["text"] = hit.highlights("content", text=filecontents)
            result["path"] = "static/" + hit["filename"]
            results_list.append(result)
    response = make_response(json.dumps({
        'results': results_list,
        'n_results': n_results
    }))
    response.headers["Access-Control-Allow-Origin"] = "*"
    response.headers['Content-Type'] = 'text/json'
    return response

def __getitem__(self, item):
    if item.start in self.saved_results:
        return self.saved_results[item.start]
    q, mask = self._query
    sortedby, reverse = self._query_sortedby
    page: ResultsPage = self.searcher.search_page(
        q,
        mask=mask,
        filter=self._query_filter,
        pagenum=math.floor(item.start / self.page_size) + 1,
        pagelen=self.page_size,
        sortedby=sortedby,
        reverse=reverse)
    page.results.fragmenter = highlight.ContextFragmenter(surround=50)
    page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")
    if not self.first_score and len(page.results) > 0:
        self.first_score = page.results[0].score
    if self.first_score:
        page.results.top_n = list(
            map(lambda hit: (hit[0] / self.first_score, hit[1]),
                page.results.top_n))
    self.saved_results[item.start] = page
    return page

def get_object(self):
    indexer = Indexer('file')
    query = self.request.resolver_match.kwargs['query']
    docs = indexer.get_doc(url=self.request.resolver_match.kwargs['url'])
    if not len(docs):
        return {}
    query_list = query.split(' ')
    excerpts = highlight(docs[0]['body'], set(query_list), StandardAnalyzer(),
                         WholeFragmenter(), HtmlFormatter())
    return {'body': excerpts, 'title': docs[0]['title']}

def query_index(_query, no_results=20, htmlFormat=False, dir=''):
    ix = open_dir(dir)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse('"|| ' + _query + '"')
        results = searcher.search(query, limit=no_results)
        # Materialise the hits before the searcher is closed
        if htmlFormat:
            results.formatter = HtmlFormatter()
            return [{'index': y['index'], 'text': y.highlights('content')}
                    for y in sorted(results, key=lambda k: k['index'])]
        else:
            return [{'index': y['index'], 'text': y['content']}
                    for y in sorted(results, key=lambda k: k['index'])]

def resultExcerpt(self, results, i, ki=None):
    # FIXME: this should not be implementation specific
    if not ki:
        r = results[i]
        name = r['kitab']
        v = r['vrr'].split('-')[0]
        m = self.th.getMeta().getLatestKitabV(name, v)
        ki = self.th.getCachedKitab(m['uri'])
    num = int(results[i]['nodeIdNum'])
    node = ki.getNodeByIdNum(num)
    n = ki.toc.next(node)
    if n:
        ub = n.globalOrder
    else:
        ub = -1
    txt = node.toText(ub)
    s = set()
    # results.query.all_terms(s)  # return (field, term) pairs
    # self.self.__ix_searcher.reader()
    s = results.q.existing_terms(self.indexer.reader(), phrases=True)
    # s = set([i.decode('utf_8') for i in s])
    terms = dict(
        map(lambda i: (i[1], i[0]),
            filter(lambda j: j[0] == 'content' or j[0] == 'title', s))).keys()
    # print("txt = [%s]" % len(txt))
    terms = [i.decode('utf_8') for i in terms]
    snippet_dummy = txt[:min(len(txt), 512)]  # dummy summary
    snippet = highlight(txt, terms, analyzer,
                        SentenceFragmenter(sentencechars=".!?؟\n"),
                        HtmlFormatter(between=u"\u2026\n"),
                        top=3, scorer=BasicFragmentScorer, minscore=1,
                        order=FIRST)
    # snippet = highlight(txt, terms, analyzer,
    #                     SentenceFragmenter(sentencechars=".!?"), ExcerptFormatter(between=u"\u2026\n"), top=3,
    #                     scorer=BasicFragmentScorer, minscore=1,
    #                     order=FIRST)
    print(snippet)
    if len(snippet) > 1:
        return snippet
    else:
        return snippet_dummy

def sear_paper(self):
    if not self.key_words or len(self.key_words) == 0:
        return '0'
    q = [Term("textdata", k) for k in self.key_words]
    index = open_dir(self.path_index)  # open_dir('paper-index')
    searcher = index.searcher()
    results = searcher.search(Or(q))
    results.fragmenter.maxchars = 30000
    results.fragmenter.surround = 150
    print('Number of hits:', len(results))
    hf = HtmlFormatter(tagname="span", classname="match", termclass="term")
    results.formatter = hf
    hl_results = [hit.highlights("textdata") for hit in results]
    # for hit in results:
    #     print(hit.highlights("textdata"))
    return hl_results

def search_query_page(ix, query_string, index_name, page=0, limits=None):
    result = {"result": [], "totalcount": 0}
    try:
        LOG.debug("Query_string: %s", query_string)
        hf = HtmlFormatter(tagname="em", classname="match", termclass="term")
        results = yield search_index_page(ix, query_string, index_name, page, limits)
        results.results.formatter = hf
        results.results.fragmenter.charlimit = 100 * 1024
        results.results.fragmenter.maxchars = 20
        # results.results.fragmenter.surround = 5
        results_len = 0
        if results.results.has_exact_length():
            results_len = len(results)
            LOG.debug("Have %s results:", results_len)
        results_len = len(results)
        result["totalcount"] = results_len
        LOG.debug("Have %s results:", results_len)
        results_num = 0
        for hit in results:
            item = ThumbnailItem()
            results_num += 1
            LOG.debug("Result: %s", results_num)
            fields = hit.fields()
            LOG.debug("Doc_id: %s", fields["doc_id"])
            html = sqlite.get_html_by_id(fields["doc_id"], conn=DB.conn_html)
            title = hit.highlights("file_name", text=html.file_name[0:-5])
            item.title = title if title.strip() != "" else html.file_name[0:-5]
            item.title = html.file_name
            item.excerpts = hit.highlights("file_content", top=5, text=html.file_content)
            item.url = "/view/html/%s" % html.sha1
            item.date_time = html.updated_at
            item.description = html.updated_at[0:19]
            result["result"].append(item)
            yield gen.moment
    except Exception as e:
        LOG.exception(e)
        result = False

def __init__(self, whoosh_index_dir='', stopwords_file='', cache_host='localhost', cache_port=6379, **kwargs):
    Engine.__init__(self, **kwargs)

    self.whoosh_index_dir = whoosh_index_dir
    if not self.whoosh_index_dir:
        raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

    self.stopwords_file = stopwords_file
    if self.stopwords_file:
        self.stopwords = ListReader(self.stopwords_file)  # Open the stopwords file, read into a ListReader
    else:
        raise EngineConnectionException(self.name, "'stopwords_file=' keyword argument not specified")

    self.scoring_model_identifier = 1
    self.scoring_model = scoring.PL2(c=10.0)

    self.__verbose = False

    try:
        self.doc_index = open_dir(self.whoosh_index_dir)
        self.reader = self.doc_index.reader()
        self.parser = QueryParser('content', self.doc_index.schema)
        # By default, we use AND grouping.
        # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

        # Objects required for document snippet generation
        self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
        self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
        self.formatter = HtmlFormatter()
    except EmptyIndexError:
        message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)
    except OSError:
        message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)

    # Attempt to connect to the specified Redis cache.
    self.cache = RedisConn(host=cache_host, port=cache_port)
    self.cache.connect()

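# A minimal sketch, not part of the original engine: assuming a document's stored text in
# `document_text` and a list of query `terms`, it shows how the analyzer, fragmenter and
# formatter prepared in the constructor above are typically combined with
# whoosh.highlight.highlight() to produce a result snippet.
from whoosh.highlight import highlight

def build_snippet(engine, document_text, terms):
    # `engine` is an instance of the class whose constructor is shown above
    return highlight(document_text, frozenset(terms), engine.analyzer,
                     engine.fragmenter, engine.formatter, top=3)
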
if n > max_limit:
    n = max_limit

ranked_results = sorted(doc_scores.items(), key=operator.itemgetter(1), reverse=True)
print("{0} {1} {2}".format(query_num, len(ranked_results), n))

if n > 0:
    for rank in range(n):
        trec_docid = reader.stored_fields(ranked_results[rank][0])['docid']
        score_formatted = "{0:.6f}".format(ranked_results[rank][1])
        output_file.write("{0} Q0 {1} {2} {3} Exp{4}".format(query_num, trec_docid, (rank + 1), score_formatted, os.linesep))

        content = reader.stored_fields(ranked_results[rank][0])['content']
        if isinstance(whoosh_query, str):
            termz = [str(whoosh_query)]
        else:
            termz = [text for fieldname, text in whoosh_query.all_terms() if fieldname == 'content']

        from whoosh.highlight import highlight
        analyzer = ix.schema['content'].analyzer
        fragmenter = ContextFragmenter()
        formatter = HtmlFormatter()
        # print(highlight(content, termz, analyzer, fragmenter, formatter))

ix.close()
input_file.close()
output_file.close()

def search(self, collector, query_str1=None, query_str2=None, itemtypes=(), highlight=False):
    # rejects '*' and '?'
    if query_str1:
        for kw in (s.strip() for s in query_str1.split()):
            if not kw.replace("*", "").replace("?", "").strip():
                return []
    wildcard = (query_str1 and any(c in query_str1 for c in "*?"))
    parser = self._parser_wild if wildcard else self._parser
    asf_parser = self._asf_parser
    with self._index.searcher() as searcher:
        andlist = []
        try:
            if query_str1:
                andlist.append(parser.parse(query_str1))
            if query_str2:
                andlist.append(asf_parser.parse(query_str2))
        except:
            return []
        if itemtypes:
            if len(itemtypes) > 1:
                andlist.append(Or([Term('itemtype', t) for t in itemtypes]))
            else:
                andlist.append(Term('itemtype', itemtypes[0]))
        query = And(andlist)

        searcher.search_with_collector(query, collector)
        hits = collector.results()
        if highlight:
            hits.fragmenter = WholeFragmenter()
            hits.formatter = HtmlFormatter(tagname='span', classname='s_match', termclass='s_term')
        if wildcard and query_str1:
            pat = query_str1.replace("-", "").replace(" ", "")
            wildmatch = re.compile(fnmatch.translate(pat))

        # Construct a result list
        results = []
        for hit in hits:
            if collector.aborted:
                return []
            (label, path, prio, sortkey) = hit['data']
            if wildcard and query_str1:
                if not wildmatch.match(sortkey):
                    continue
            if highlight:
                if query_str1:
                    text = hit.highlights('content')
                else:
                    text = hit['content']
            else:
                text = None
            results.append((label, path, sortkey, prio, text))

        sortkey_prio_getter = itemgetter(2, 3)
        results.sort(key=sortkey_prio_getter)

        # Return
        return results

def run_query(query, index):
    """
    Queries the index for data with the given text query

    @param query The text query to perform on the indexed data
    @return A list of HTML string snippets to return
    """
    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {'content': 1.0, 'title': 3.0}
    query_parser = MultifieldParser(['content', 'title'], schema=index_schema,
                                    fieldboosts=field_boosts, group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker in this index and add the "content" field to the spell checker
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short (filter into a new list rather than mutating while iterating)
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts. This fragmenter will split up excerpts
    # by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which will dictate how to highlight the excerpts. In this case, we want to use HTML to
    # highlight the results
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content': highlight(search_result['content'], search_terms, analyzer, fragmenter, formatter),
            'url': search_result['url'],
            'title': search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count

ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

# INDEX SCHEMA DEFINITION
SCHEMA = Schema(fileid=ID(unique=True),
                owner=TEXT(),
                repository=TEXT(stored=True),
                path=TEXT(stored=True),
                content=FieldType(format=Characters(), analyzer=ANALYZER,
                                  scorable=True, stored=True),
                modtime=STORED(),
                extension=TEXT(stored=True))

IDX_NAME = 'HG_INDEX'
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
FRAGMENTER = ContextFragmenter(200)

CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),
    author=TEXT(stored=True),
    message=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    parents=TEXT(),
    added=TEXT(),

def highlight(self, text, words):
    fragmenter = ContextFragmenter()
    formatter = HtmlFormatter()
    analyzer = self.project_schema['text'].analyzer
    return highlight(text, words, analyzer, fragmenter, formatter, top=1)

def make_html_formatter():
    # the html formatter cannot be shared between searches easily, thus
    # we create it in a factory.
    return HtmlFormatter(tagname='strong',
                         between=u' <span class="elipsis">…</span> ')

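# A small usage sketch, assumed rather than taken from the original module: it expects a
# Whoosh index directory whose schema has a stored "content" field, and shows the intended
# pattern of creating a fresh formatter per search and attaching it to the Results object.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def highlighted_search(index_dir, query_string):
    ix = open_dir(index_dir)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(query_string)
        results = searcher.search(query, limit=10)
        results.formatter = make_html_formatter()  # one formatter per search, as the factory suggests
        return [hit.highlights("content") for hit in results]
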
def __init__(self, whoosh_index_dir='', use_cache=True, cache_host='localhost', cache_port=6379, **kwargs):
    """
    Constructor for the engine.
    """
    Engine.__init__(self, **kwargs)

    self.whoosh_index_dir = whoosh_index_dir
    if not self.whoosh_index_dir:
        raise EngineConnectionException(
            self.name, "'whoosh_index_dir=' keyword argument not specified")

    # Only put PL2 in for now (for more, add the model parameter to the constructor to specify!)
    self.scoring_model_identifier = 1
    self.scoring_model = scoring.PL2(c=10.0)

    try:
        self.doc_index = open_dir(self.whoosh_index_dir)
        self.reader = self.doc_index.reader()
        self.parser = QueryParser('content', self.doc_index.schema)
        # By default, we use AND grouping.
        # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

        # Objects required for document snippet generation
        self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
        self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
        self.formatter = HtmlFormatter()
    except EmptyIndexError:
        message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)
    except OSError:
        message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)

    self.use_cache = use_cache

    if self.use_cache:
        self.cache = RedisConn(host=cache_host, port=cache_port)
        self.cache.connect()

        self.page_cache_forward_look = 40  # How many additional pages to cache when required.
        self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

        self.page_cache_controller = PageCacheController(
            cache_host=self.cache.host,
            cache_port=self.cache.port,
            whoosh_index=self.doc_index,
            scoring_model_identifier=self.scoring_model_identifier,
            parser=self.parser,
            analyzer=self.analyzer,
            fragmenter=self.fragmenter,
            formatter=self.formatter,
            cache_forward_look=self.page_cache_forward_look)