def testHighlighting(self):
    """A single-term query wraps the matched word in <B>…</B> tags."""
    sample = "The quick brown fox jumps over the lazy dog"
    term_query = TermQuery(Term("field", "fox"))
    highlighter = Highlighter(QueryScorer(term_query))
    stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
        "field", StringReader(sample))
    expected = "The quick brown <B>fox</B> jumps over the lazy dog"
    self.assertEqual(expected, highlighter.getBestFragment(stream, sample))
def testHighlighting(self):
    """Highlight the term "fox" in a short sentence and check the markup."""
    text = "The quick brown fox jumps over the lazy dog"
    scorer = QueryScorer(TermQuery(Term("field", "fox")))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    tokenStream = analyzer.tokenStream("field", StringReader(text))
    fragment = Highlighter(scorer).getBestFragment(tokenStream, text)
    self.assertEqual(
        "The quick brown <B>fox</B> jumps over the lazy dog", fragment)
def testHits(self): searcher = self.getSearcher() query = TermQuery(Term("title", "action")) scoreDocs = searcher.search(query, 50).scoreDocs scorer = QueryScorer(query) highlighter = Highlighter(scorer) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) title = doc["title"] stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream("title", StringReader(title)) fragment = highlighter.getBestFragment(stream, title) print fragment
def testHits(self): searcher = IndexSearcher(self.directory, True) query = TermQuery(Term("title", "action")) scoreDocs = searcher.search(query, 50).scoreDocs scorer = QueryScorer(query) highlighter = Highlighter(scorer) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) title = doc["title"] stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream( "title", StringReader(title)) fragment = highlighter.getBestFragment(stream, title) print fragment
def run(command,pageindex=1,pagesize=15): global searcher, analyzer,old_command,old_res_list global STORE_DIR,directory,searcher,analyzer if command == '': return print "Searching for:", command querys = BooleanQuery() command_dict = parseCommand(command) for k,v in command_dict.iteritems(): if(k=='site'): t = Term('url','*'+v.strip()+'*') query = WildcardQuery(t) else: query = QueryParser(Version.LUCENE_CURRENT, k,analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 4000).scoreDocs print "%s total matching documents." % len(scoreDocs) res_list = [] simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>", "</font>") queryToHigh = QueryParser(Version.LUCENE_CURRENT,"lrc",analyzer).parse(command_dict['content']) hlter = Highlighter(simpleHTMLFormatter,QueryScorer(queryToHigh)) hlter.setTextFragmenter(SimpleFragmenter(200)) start = (pageindex-1)*pagesize end = start+pagesize print start,end for scoreDoc in scoreDocs[start:end+10]: doc = searcher.doc(scoreDoc.doc) res = [] res.append(doc.get('url')) res.append(doc.get('music_name')) res.append(doc.get('artist')) res.append(doc.get('album_name')) res.append(doc.get('lrc')) output = hlter.getBestFragment(analyzer,"lrc",clear(doc.get('lrc'))) res.append(output) res.append(doc.get('musicID')) if(res[5]!=None): res_list.append(res) if(len(res_list)==8): break return res_list,len(scoreDocs)
def run(command, pageindex=1, pagesize=15): global searcher, analyzer, old_command, old_res_list global STORE_DIR, directory, searcher, analyzer if command == '': return print "Searching for:", command #朱莉与茱莉娅 # final = jieba.cut(command) # query = QueryParser(Version.LUCENE_CURRENT, "contents", # analyzer).parse(' '.join(final)) querys = BooleanQuery() command_dict = parseCommand(command) for k, v in command_dict.iteritems(): if (k == 'site'): t = Term('url', '*' + v.strip() + '*') query = WildcardQuery(t) else: query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 4000).scoreDocs print "%s total matching documents." % len(scoreDocs) res_list = [] simpleHTMLFormatter = SimpleHTMLFormatter("<font_forblank_color='red'>", "</font>") queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command_dict['contents']) hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh)) hlter.setTextFragmenter(SimpleFragmenter(200)) start = (pageindex - 1) * pagesize end = start + pagesize for scoreDoc in scoreDocs[start:end + 1]: doc = searcher.doc(scoreDoc.doc) res = [] res.append(doc.get('title')) res.append(doc.get('url')) output = hlter.getBestFragment(analyzer, "contents", clear(doc.get('contents'))) res.append(output) res_list.append(res) return res_list, len(scoreDocs)
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'UTF-8') if command == '': return print print "Searching for:", command #朱莉与茱莉娅 # final = jieba.cut(command) # query = QueryParser(Version.LUCENE_CURRENT, "contents", # analyzer).parse(' '.join(final)) querys = BooleanQuery() command_dict = parseCommand(command) for k, v in command_dict.iteritems(): if (k == 'site'): t = Term('url', '*' + v.strip() + '*') query = WildcardQuery(t) else: query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 10).scoreDocs print "%s total matching documents." % len(scoreDocs) simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>") queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command_dict['contents']) hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh)) hlter.setTextFragmenter(SimpleFragmenter(500)) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print '------------------------------------------' #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site') print 'title:', doc.get('title'), print 'url:', doc.get('url') ori_text = clear(doc.get('contents')) output = hlter.getBestFragment(analyzer, "contents", ori_text) print output
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") command = unicode(command, 'UTF-8') if command == '': return print print "Searching for:", command #朱莉与茱莉娅 # final = jieba.cut(command) # query = QueryParser(Version.LUCENE_CURRENT, "contents", # analyzer).parse(' '.join(final)) querys = BooleanQuery() command_dict = parseCommand(command) for k,v in command_dict.iteritems(): if(k=='site'): t = Term('url','*'+v.strip()+'*') query = WildcardQuery(t) else: query = QueryParser(Version.LUCENE_CURRENT, k,analyzer).parse(v) querys.add(query, BooleanClause.Occur.MUST) scoreDocs = searcher.search(querys, 10).scoreDocs print "%s total matching documents." % len(scoreDocs) simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>") queryToHigh = QueryParser(Version.LUCENE_CURRENT,"contents",analyzer).parse(command_dict['contents']) hlter = Highlighter(simpleHTMLFormatter,QueryScorer(queryToHigh)) hlter.setTextFragmenter(SimpleFragmenter(500)) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print '------------------------------------------' #print 'path:', doc.get("path"), 'name:', doc.get("name"),'site:', doc.get('site') print 'title:',doc.get('title'), print 'url:',doc.get('url') ori_text = clear(doc.get('contents')) output = hlter.getBestFragment(analyzer,"contents",ori_text) print output
def search(request): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() ret = {} maxLength = 38 search_content = request.GET.get('content') if len(search_content) > maxLength: pass query = QueryParser(Version.LUCENE_CURRENT, "contentKeyword", analyzer).parse(search_content) scoreDocs = searcher.search(query, 50).scoreDocs scorer = QueryScorer(query) formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>") highlighter = Highlighter(formatter, scorer) fragmenter = SimpleFragmenter(50) highlighter.setTextFragmenter(fragmenter) ret['NumOfDocs'] = str(len(scoreDocs)) + "total matching documents." print ret['NumOfDocs'] conn = pymysql.connect(host='localhost', user=user, password=password, db=db_name, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) rst = '' ret['search_list'] = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) _id = str(doc.get("id")) print _id sql = 'select * from webpage where id=%s' with conn.cursor() as cursor: cursor.execute(sql, (_id)) rst = cursor.fetchone() titleStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream( "title", StringReader(rst['title'])) titleFragment = highlighter.getBestFragment(titleStream, rst['title']) if titleFragment is None: titleFragment = rst['title'] contentStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream( "content", StringReader(rst['content'])) contentFragment = highlighter.getBestFragments(contentStream, rst['content'], 5, '...') ret['search_list'].append({ 'title': titleFragment, 'url': rst['url'], 'content': contentFragment }) #searcher.close() conn.close() return render(request, 'tjut/result.html', { 'search_list': ret['search_list'], 'search_content': search_content })