def indexReader(self, indexWriter, reader, uItem, uAttr, uValue, version):

    STORED = Field.Store.YES
    UN_INDEXED = Field.Index.NO
    UN_TOKENIZED = Field.Index.UN_TOKENIZED

    doc = Document()
    doc.add(Field("item", uItem.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("attribute", uAttr.str64(), STORED, UN_TOKENIZED))
    doc.add(Field("value", uValue.str64(), STORED, UN_INDEXED))
    doc.add(Field("version", str(version), STORED, UN_INDEXED))

    reader = StringReader(reader.read())
    doc.add(Field("contents", reader, Field.TermVector.YES))

    indexWriter.addDocument(doc)

def main(cls, argv):

    query = TermQuery(Term("f", "ipsum"))
    scorer = QueryScorer(query)
    formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
    highlighter = Highlighter(formatter, scorer)
    fragmenter = SimpleFragmenter(50)
    highlighter.setTextFragmenter(fragmenter)

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    tokenStream = analyzer.tokenStream("f", StringReader(cls.text))

    result = highlighter.getBestFragments(tokenStream, cls.text, 5, "...")

    stdout.write("<html>")
    stdout.write("<style>\n")
    stdout.write(".highlight {\n")
    stdout.write(" background: yellow\n")
    stdout.write("}\n")
    stdout.write("</style>")
    stdout.write("<body>")
    stdout.write(result)
    stdout.write("</body></html>\n")
    stdout.flush()

def displayTokens(cls, analyzer, text):

    tokenStream = analyzer.tokenStream("contents", StringReader(text))
    term = tokenStream.addAttribute(TermAttribute.class_)

    while tokenStream.incrementToken():
        print "[%s]" % (term.term()),

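# A minimal usage sketch for the displayTokens helper above (not part of the
# original snippet). "AnalyzerUtils" is a hypothetical holder class for that
# classmethod, and the import assumes the monolithic PyLucene 3.x `lucene`
# module these examples target. WhitespaceAnalyzer splits only on whitespace
# and preserves case.
from lucene import initVM, Version, WhitespaceAnalyzer

initVM()
AnalyzerUtils.displayTokens(WhitespaceAnalyzer(Version.LUCENE_CURRENT),
                            "The quick brown fox jumps")
# prints: [The] [quick] [brown] [fox] [jumps]
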
def testHighlighting(self):

    text = "The quick brown fox jumps over the lazy dog"

    query = TermQuery(Term("field", "fox"))
    scorer = QueryScorer(query)
    highlighter = Highlighter(scorer)

    tokenStream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
        "field", StringReader(text))

    self.assertEqual("The quick brown <B>fox</B> jumps over the lazy dog",
                     highlighter.getBestFragment(tokenStream, text))

def search_image(command):
    if command == ' ':
        return []

    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"

    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt, '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # pass the field name (not the field value) to tokenStream
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)

    searcher.close()
    return Docs

def assertAnalyzesTo(cls, analyzer, input, outputs):

    stream = analyzer.tokenStream("field", StringReader(input))
    termAttr = stream.addAttribute(TermAttribute.class_)

    for output in outputs:
        if not stream.incrementToken():
            raise AssertionError, 'stream.incrementToken()'
        if output != termAttr.term():
            raise AssertionError, 'output == termAttr.term()'

    if stream.incrementToken():
        raise AssertionError, 'not stream.incrementToken()'

    stream.close()

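# A minimal usage sketch for the assertAnalyzesTo helper above (again with a
# hypothetical AnalyzerUtils holder class and PyLucene 3.x imports).
# SimpleAnalyzer lower-cases and splits on non-letter characters, so this
# call should complete without raising AssertionError.
from lucene import initVM, Version, SimpleAnalyzer

initVM()
AnalyzerUtils.assertAnalyzesTo(SimpleAnalyzer(Version.LUCENE_CURRENT),
                               "The quick brown fox....",
                               ["the", "quick", "brown", "fox"])
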
def dumpSpans(self, query):

    spans = query.getSpans(self.reader)
    print "%s:" % query
    numSpans = 0

    scoreDocs = self.searcher.search(query, 50).scoreDocs
    # scores indexed by doc id; this example assumes a tiny two-document index
    scores = [0, 0]
    for scoreDoc in scoreDocs:
        scores[scoreDoc.doc] = scoreDoc.score

    while spans.next():
        numSpans += 1

        id = spans.doc()
        doc = self.reader.document(id)

        # for simplicity - assume tokens are in sequential positions,
        # starting from 0
        stream = self.analyzer.tokenStream("contents",
                                           StringReader(doc.get("f")))
        term = stream.addAttribute(TermAttribute.class_)

        buffer = StringIO()
        buffer.write(" ")

        i = 0
        while stream.incrementToken():
            if i == spans.start():
                buffer.write("<")
            buffer.write(term.term())
            if i + 1 == spans.end():
                buffer.write(">")
            buffer.write(" ")
            i += 1

        buffer.write("(")
        buffer.write(str(scores[id]))
        buffer.write(") ")

        print buffer.getvalue()
        # print self.searcher.explain(query, id)

    if numSpans == 0:
        print " No spans"

    print ''

def displayTokensWithPositions(cls, analyzer, text):

    stream = analyzer.tokenStream("contents", StringReader(text))
    term = stream.addAttribute(TermAttribute.class_)
    posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

    position = 0
    while stream.incrementToken():
        increment = posIncr.getPositionIncrement()
        if increment > 0:
            position = position + increment
            print "\n%d:" % (position),

        print "[%s]" % (term.term()),
    print

def indexFile(self, writer, path):

    try:
        file = open(path)
        string = HTMLReader(InputStreamReader(file, 'utf-8')).read()
        file.close()
    except:
        raise
    else:
        doc = Document()
        doc.add(Field("contents", StringReader(string)))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        return doc

def testHits(self):

    searcher = IndexSearcher(self.directory, True)
    query = TermQuery(Term("title", "action"))
    scoreDocs = searcher.search(query, 50).scoreDocs

    scorer = QueryScorer(query)
    highlighter = Highlighter(scorer)

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        title = doc["title"]

        stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "title", StringReader(title))
        fragment = highlighter.getBestFragment(stream, title)

        print fragment

def displayTokensWithFullDetails(cls, analyzer, text):

    stream = analyzer.tokenStream("contents", StringReader(text))

    term = stream.addAttribute(TermAttribute.class_)
    posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
    offset = stream.addAttribute(OffsetAttribute.class_)
    type = stream.addAttribute(TypeAttribute.class_)

    position = 0
    while stream.incrementToken():
        increment = posIncr.getPositionIncrement()
        if increment > 0:
            position = position + increment
            print "\n%d:" % (position),

        print "[%s:%d->%d:%s]" % (term.term(),
                                  offset.startOffset(),
                                  offset.endOffset(),
                                  type.type()),
    print

def indexFile(self, writer, path):

    doc = Document()

    try:
        process = popen2.Popen4(["antiword", "-m", "UTF-8", path])
        string = InputStreamReader(process.fromchild, 'utf-8').read()
    except:
        raise
    else:
        doc.add(Field("contents", StringReader(string)))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

        exitCode = process.wait()
        if exitCode != 0:
            raise RuntimeError, "antiword exit code %d" % (exitCode)

    return doc

def testJumps(self):

    stream = self.synonymAnalyzer.tokenStream("contents",
                                              StringReader("jumps"))
    term = stream.addAttribute(TermAttribute.class_)
    posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

    i = 0
    expected = ["jumps", "hops", "leaps"]
    while stream.incrementToken():
        self.assertEqual(expected[i], term.term())
        if i == 0:
            expectedPos = 1
        else:
            expectedPos = 0
        self.assertEqual(expectedPos, posIncr.getPositionIncrement())
        i += 1

    self.assertEqual(3, i)

def indexFile(self, writer, path):

    doc = Document()

    try:
        process = popen2.Popen4(["pdfinfo", "-enc", "UTF-8", path])
    except:
        raise
    else:
        while True:
            line = process.fromchild.readline().strip()
            if not line:
                break
            name, value = line.split(':', 1)
            doc.add(Field(name.strip(), value.strip(),
                          Field.Store.YES, Field.Index.NOT_ANALYZED))

        exitCode = process.wait()
        if exitCode != 0:
            raise RuntimeError, "pdfinfo exit code %d" % (exitCode)

    try:
        process = popen2.Popen4(["pdftotext", "-enc", "UTF-8", path, "-"])
        string = InputStreamReader(process.fromchild, 'utf-8').read()
    except:
        raise
    else:
        doc.add(Field("contents", StringReader(string)))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

        exitCode = process.wait()
        if exitCode != 0:
            raise RuntimeError, "pdftotext exit code %d" % (exitCode)

    return doc

def search(request):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    ret = {}
    maxLength = 38
    search_content = request.GET.get('content')
    if len(search_content) > maxLength:
        pass

    query = QueryParser(Version.LUCENE_CURRENT, "contentKeyword",
                        analyzer).parse(search_content)
    scoreDocs = searcher.search(query, 50).scoreDocs

    scorer = QueryScorer(query)
    formatter = SimpleHTMLFormatter("<span class=\"highlight\">", "</span>")
    highlighter = Highlighter(formatter, scorer)
    fragmenter = SimpleFragmenter(50)
    highlighter.setTextFragmenter(fragmenter)

    ret['NumOfDocs'] = str(len(scoreDocs)) + " total matching documents."
    print ret['NumOfDocs']

    conn = pymysql.connect(host='localhost',
                           user=user,
                           password=password,
                           db=db_name,
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    rst = ''
    ret['search_list'] = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        _id = str(doc.get("id"))
        print _id

        sql = 'select * from webpage where id=%s'
        with conn.cursor() as cursor:
            cursor.execute(sql, (_id,))
            rst = cursor.fetchone()

        titleStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "title", StringReader(rst['title']))
        titleFragment = highlighter.getBestFragment(titleStream, rst['title'])
        if titleFragment is None:
            titleFragment = rst['title']

        contentStream = ChineseAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "content", StringReader(rst['content']))
        contentFragment = highlighter.getBestFragments(contentStream,
                                                       rst['content'], 5, '...')

        ret['search_list'].append({
            'title': titleFragment,
            'url': rst['url'],
            'content': contentFragment
        })

    # searcher.close()
    conn.close()
    return render(request, 'tjut/result.html', {
        'search_list': ret['search_list'],
        'search_content': search_content
    })

# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

# Construct a query parser.
queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

# Create a query
query = queryParser.parse(QUERY_STRING)

topDocs = searcher.search(query, 50)

# Get top hits
scoreDocs = topDocs.scoreDocs
print "%s total matching documents." % len(scoreDocs)

HighlightFormatter = SimpleHTMLFormatter()
query_score = QueryScorer(query)
highlighter = Highlighter(HighlightFormatter, query_score)

# Set the fragment size: break the text into fragments of 64 characters.
fragmenter = SimpleSpanFragmenter(query_score, 64)
highlighter.setTextFragmenter(fragmenter)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    text = doc.get(FIELD_CONTENTS)
    ts = analyzer.tokenStream(FIELD_CONTENTS, StringReader(text))

    print doc.get(FIELD_PATH)
    print highlighter.getBestFragments(ts, text, 3, "...")
    print ""