def main(cls, argv):
    # Highlight occurrences of "ipsum" in field "f" of cls.text and emit a
    # small standalone HTML page to stdout.
    query = TermQuery(Term("f", "ipsum"))
    scorer = QueryScorer(query)
    formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
    highlighter = Highlighter(formatter, scorer)
    fragmenter = SimpleFragmenter(50)   # break text into ~50-char fragments
    highlighter.setTextFragmenter(fragmenter)

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    tokenStream = analyzer.tokenStream("f", StringReader(cls.text))
    # Up to 5 best fragments, joined with "..."
    result = highlighter.getBestFragments(tokenStream, cls.text, 5, "...")

    stdout.write("<html>")
    stdout.write("<style>\n")
    stdout.write(".highlight {\n")
    stdout.write(" background: yellow\n")
    stdout.write("}\n")
    stdout.write("</style>")
    stdout.write("<body>")
    stdout.write(result)
    stdout.write("</body></html>\n")
    stdout.flush()
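# A minimal, hypothetical driver for main() above, assuming PyLucene's flat
# 3.x-era `lucene` module and that main() is bound to a class defining the
# sample `text` (class name and sample text are illustrative, not from the
# original):
import sys
import lucene
from sys import stdout   # main() writes its HTML page to stdout

class HighlightDemo(object):
    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
    main = classmethod(main)   # bind the function above as a classmethod

if __name__ == '__main__':
    lucene.initVM()   # the JVM must be started before any Lucene call
    HighlightDemo.main(sys.argv)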
def search_image(command):
    # Search the image index; returns a list of dicts describing each hit.
    if command.strip() == '':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()   # this thread must be attached to the JVM

    STORE_DIR = "index_img"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    # Split the query into per-field terms: "site:..." selects the site
    # field; everything else is segmented with jieba under 'contents'.
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt, '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # tokenStream() takes the *field name*, not the field value
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)

    searcher.close()
    return Docs
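# A hypothetical call, showing the shape of the returned list (the query
# string is illustrative; lucene.initVM() must already have been called):
results = search_image(u'site:example.com landscape')
for d in results:
    print d['urltitle'], d['imgurl']
    print d['contents']   # highlighted <font>...</font> fragments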
def run(command, pageindex=1, pagesize=15):
    global searcher, analyzer, old_command, old_res_list
    global STORE_DIR, directory
    if command == '':
        return

    print "Searching for:", command   # e.g. 朱莉与茱莉娅 ("Julie & Julia")
    # final = jieba.cut(command)
    # query = QueryParser(Version.LUCENE_CURRENT, "contents",
    #                     analyzer).parse(' '.join(final))

    querys = BooleanQuery()
    command_dict = parseCommand(command)
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term('url', '*' + v.strip() + '*')
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 4000).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    res_list = []
    simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>")
    queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                              analyzer).parse(command_dict['contents'])
    hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
    hlter.setTextFragmenter(SimpleFragmenter(200))

    # Page through the hits: pagesize results starting at pageindex (1-based).
    start = (pageindex - 1) * pagesize
    end = start + pagesize
    for scoreDoc in scoreDocs[start:end]:
        doc = searcher.doc(scoreDoc.doc)
        res = []
        res.append(doc.get('title'))
        res.append(doc.get('url'))
        output = hlter.getBestFragment(analyzer, "contents",
                                       clear(doc.get('contents')))
        res.append(output)
        res_list.append(res)
    return res_list, len(scoreDocs)
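# run() depends on a parseCommand() helper that is not shown here. A minimal
# sketch, modeled on the inline option parsing in search_image() above
# ("site:" options plus jieba segmentation under 'contents'); the real
# helper may differ:
def parseCommand(command):
    allowed_opt = ['site']
    command_dict = {}
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            # "site:value" switches the target field
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            # segment Chinese text into space-separated tokens
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt, '') + ' ' + ' '.join(seg_list)
    return command_dict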
def run(searcher, analyzer):
    # Interactive console search loop; empty input quits.
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'UTF-8')
        if command == '':
            return

        print
        print "Searching for:", command   # e.g. 朱莉与茱莉娅 ("Julie & Julia")
        # final = jieba.cut(command)
        # query = QueryParser(Version.LUCENE_CURRENT, "contents",
        #                     analyzer).parse(' '.join(final))

        querys = BooleanQuery()
        command_dict = parseCommand(command)
        for k, v in command_dict.iteritems():
            if k == 'site':
                t = Term('url', '*' + v.strip() + '*')
                query = WildcardQuery(t)
            else:
                query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
            querys.add(query, BooleanClause.Occur.MUST)

        scoreDocs = searcher.search(querys, 10).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        simpleHTMLFormatter = SimpleHTMLFormatter("<font color='red'>", "</font>")
        queryToHigh = QueryParser(Version.LUCENE_CURRENT, "contents",
                                  analyzer).parse(command_dict['contents'])
        hlter = Highlighter(simpleHTMLFormatter, QueryScorer(queryToHigh))
        hlter.setTextFragmenter(SimpleFragmenter(500))

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print '------------------------------------------'
            # print 'path:', doc.get("path"), 'name:', doc.get("name"), 'site:', doc.get('site')
            print 'title:', doc.get('title'),
            print 'url:', doc.get('url')
            ori_text = clear(doc.get('contents'))
            output = hlter.getBestFragment(analyzer, "contents", ori_text)
            print output
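# Both run() variants also call a clear() helper that is not shown. A
# plausible minimal sketch, assuming it merely collapses whitespace in the
# stored contents before fragmenting (the original may do more):
import re

def clear(text):
    if text is None:
        return ''
    return re.sub(r'\s+', ' ', text).strip()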
# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

# Construct a query parser and parse the query.
queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
query = queryParser.parse(QUERY_STRING)

# Get the top 50 hits.
topDocs = searcher.search(query, 50)
scoreDocs = topDocs.scoreDocs
print "%s total matching documents." % len(scoreDocs)

highlightFormatter = SimpleHTMLFormatter()
queryScorer = QueryScorer(query)
highlighter = Highlighter(highlightFormatter, queryScorer)
# Set the fragmenter: break text into fragments of ~64 characters,
# sized to span whole query terms.
fragmenter = SimpleSpanFragmenter(queryScorer, 64)
highlighter.setTextFragmenter(fragmenter)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    text = doc.get(FIELD_CONTENTS)
    ts = analyzer.tokenStream(FIELD_CONTENTS, StringReader(text))
    print doc.get(FIELD_PATH)
    print highlighter.getBestFragments(ts, text, 3, "...")
    print ""
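# The fragment above assumes searcher, FIELD_CONTENTS, FIELD_PATH, and
# QUERY_STRING are defined earlier. A minimal sketch of that setup
# (INDEX_DIR and the query string are illustrative placeholders; PyLucene
# 3.x-style flat imports):
import lucene
from lucene import SimpleFSDirectory, File, IndexSearcher

lucene.initVM()

FIELD_CONTENTS = "contents"
FIELD_PATH = "path"
QUERY_STRING = "lorem ipsum"          # illustrative query
INDEX_DIR = "index"                   # illustrative index location

directory = SimpleFSDirectory(File(INDEX_DIR))
searcher = IndexSearcher(directory, True)   # read-only searcher (3.x API)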
def search(request):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()   # Django worker threads must attach to the JVM
    ret = {}
    maxLength = 38
    search_content = request.GET.get('content')
    if len(search_content) > maxLength:
        # Over-long queries are currently accepted unchanged.
        pass

    query = QueryParser(Version.LUCENE_CURRENT, "contentKeyword",
                        analyzer).parse(search_content)
    scoreDocs = searcher.search(query, 50).scoreDocs

    scorer = QueryScorer(query)
    formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
    highlighter = Highlighter(formatter, scorer)
    fragmenter = SimpleFragmenter(50)
    highlighter.setTextFragmenter(fragmenter)

    ret['NumOfDocs'] = str(len(scoreDocs)) + " total matching documents."
    print ret['NumOfDocs']

    conn = pymysql.connect(host='localhost', user=user, password=password,
                           db=db_name, charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    ret['search_list'] = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        _id = str(doc.get("id"))
        print _id
        # Fetch the full page record from MySQL by document id.
        sql = 'select * from webpage where id=%s'
        with conn.cursor() as cursor:
            cursor.execute(sql, (_id,))
            rst = cursor.fetchone()
        titleStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "title", StringReader(rst['title']))
        titleFragment = highlighter.getBestFragment(titleStream, rst['title'])
        if titleFragment is None:
            # No query term in the title: fall back to the plain title.
            titleFragment = rst['title']
        contentStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "content", StringReader(rst['content']))
        contentFragment = highlighter.getBestFragments(contentStream,
                                                       rst['content'], 5, '...')
        ret['search_list'].append({
            'title': titleFragment,
            'url': rst['url'],
            'content': contentFragment
        })
    # searcher.close()
    conn.close()
    return render(request, 'tjut/result.html', {
        'search_list': ret['search_list'],
        'search_content': search_content
    })