def testQueryParser(self):
    searcher = IndexSearcher(self.directory, True)

    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        SimpleAnalyzer()).parse("+JUNIT +ANT -MOCK")
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
    d = searcher.doc(scoreDocs[0].doc)
    self.assertEqual("Java Development with Ant", d.get("title"))

    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        SimpleAnalyzer()).parse("mock OR junit")
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(2, len(scoreDocs), "JDwA and JIA")
def main(cls, argv):
    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return

    docsInIndex = int(argv[1])

    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                            'index-dir')
    dir = FSDirectory.getDirectory(indexDir, True)
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True)

    # set variables that affect speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    # writer.infoStream = System.out

    print "Merge factor:  ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()

    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus",
                      Field.Store.YES, Field.Index.TOKENIZED))
        writer.addDocument(doc)

    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
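PyLucene requires the embedded JVM to be started before any Lucene class is touched. A minimal driver sketch for the demo above, assuming the enclosing class is named IndexTuningDemo (taken from the usage string) and that main is bound as a classmethod, as in the other demos in this section:

if __name__ == '__main__':
    import sys
    import lucene
    lucene.initVM()  # PyLucene: start the JVM before using any Lucene class
    # 'IndexTuningDemo' is assumed from the usage string above
    IndexTuningDemo.main(sys.argv)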
def addDocuments(self, dir):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    #
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    #

    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def testWriteLock(self):
    writer1 = IndexWriter(self.dir, SimpleAnalyzer(),
                          IndexWriter.MaxFieldLength.UNLIMITED)
    writer2 = None

    try:
        try:
            # opening a second writer on a locked index must fail;
            # catch only PyLucene's JavaError so that the AssertionError
            # raised by self.fail is not swallowed by a bare except
            writer2 = IndexWriter(self.dir, SimpleAnalyzer(),
                                  IndexWriter.MaxFieldLength.UNLIMITED)
            self.fail("We should never reach this point")
        except JavaError:
            pass
    finally:
        writer1.close()

    self.assert_(writer2 is None)
def testBasicQueryParser(self):
    analyzer = SimpleAnalyzer()
    query = QueryParser(Version.LUCENE_CURRENT, "description",
                        analyzer).parse("partnum:Q36 AND SPACE")
    scoreDocs = self.searcher.search(query, 50).scoreDocs

    self.assertEqual("+partnum:q +space", query.toString("description"),
                     "note Q36 -> q")
    self.assertEqual(0, len(scoreDocs), "doc not found :(")
def main(cls, argv):
    print "SimpleAnalyzer"
    cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                     "The quick brown fox....")

    print "\n----"
    print "StandardAnalyzer"
    cls.displayTokensWithFullDetails(
        StandardAnalyzer(Version.LUCENE_CURRENT),
        "I'll e-mail you at [email protected]")
def testPerFieldAnalyzer(self):
    analyzer = PerFieldAnalyzerWrapper(SimpleAnalyzer())
    analyzer.addAnalyzer("partnum", KeywordAnalyzer())

    query = QueryParser(Version.LUCENE_CURRENT, "description",
                        analyzer).parse("partnum:Q36 AND SPACE")
    scoreDocs = self.searcher.search(query, 50).scoreDocs

    # self.assertEqual("+partnum:Q36 +space", query.toString("description"))
    self.assertEqual(1, len(scoreDocs), "doc found!")
def testHighlighting(self):
    text = "The quick brown fox jumps over the lazy dog"
    query = TermQuery(Term("field", "fox"))
    scorer = QueryScorer(query)
    highlighter = Highlighter(scorer)

    tokenStream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
        "field", StringReader(text))

    self.assertEqual("The quick brown <B>fox</B> jumps over the lazy dog",
                     highlighter.getBestFragment(tokenStream, text))
def testSpecifiedOperator(self):
    MUST = BooleanClause.Occur.MUST
    query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                        "development",
                                        ["title", "subject"],
                                        [MUST, MUST],
                                        SimpleAnalyzer())

    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(query, 50).scoreDocs

    self.assertHitsIncludeTitle(searcher, scoreDocs,
                                "Java Development with Ant")
    self.assertEqual(1, len(scoreDocs), "one and only one")
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("partnum", "Q36",
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("description", "Illidium Space Modulator",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
def testDefaultOperator(self):
    SHOULD = BooleanClause.Occur.SHOULD
    query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                        "development",
                                        ["title", "subject"],
                                        [SHOULD, SHOULD],
                                        SimpleAnalyzer())

    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(query, 50).scoreDocs

    self.assertHitsIncludeTitle(searcher, scoreDocs,
                                "Java Development with Ant")
    # has "development" in the subject field
    self.assertHitsIncludeTitle(searcher, scoreDocs,
                                "Extreme Programming Explained")
def index(self):
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    dir = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)

    writer.setInfoStream(System.out)

    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
def testHits(self):
    searcher = IndexSearcher(self.directory, True)
    query = TermQuery(Term("title", "action"))
    scoreDocs = searcher.search(query, 50).scoreDocs

    scorer = QueryScorer(query)
    highlighter = Highlighter(scorer)

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        title = doc["title"]
        stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "title", StringReader(title))
        fragment = highlighter.getBestFragment(stream, title)
        print fragment
def addDocuments(self, dir, maxFieldLength):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength(maxFieldLength))

    for keyword, unindexed, unstored, text in \
            izip(self.keywords, self.unindexed, self.unstored, self.text):
        doc = Document()
        doc.add(Field("id", keyword,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", unindexed,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", unstored,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", text,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
class AnalyzerDemo(object):

    examples = [
        "The quick brown fox jumped over the lazy dogs",
        "XY&Z Corporation - [email protected]"
    ]

    analyzers = [
        WhitespaceAnalyzer(),
        SimpleAnalyzer(),
        StopAnalyzer(Version.LUCENE_CURRENT),
        StandardAnalyzer(Version.LUCENE_CURRENT)
    ]

    def main(cls, argv):
        # Use the embedded example strings, unless
        # command line arguments are specified, then use those.
        strings = cls.examples
        if len(argv) > 1:
            strings = argv[1:]

        for string in strings:
            cls.analyze(string)

    def analyze(cls, text):
        print 'Analyzing "%s"' % (text)

        for analyzer in cls.analyzers:
            name = type(analyzer).__name__
            print " %s:" % (name),
            AnalyzerUtils.displayTokens(analyzer, text)
            print
        print

    main = classmethod(main)
    analyze = classmethod(analyze)
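AnalyzerUtils itself is not defined in this section. A minimal sketch of a displayTokens helper that would satisfy the calls above, assuming the PyLucene 3.x TermAttribute API; the helper body is an assumption, not the original implementation:

from lucene import StringReader, TermAttribute

class AnalyzerUtils(object):

    def displayTokens(cls, analyzer, text):
        # assumption: print each emitted term in brackets, e.g. [quick] [brown]
        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        while stream.incrementToken():
            print "[%s]" % term.term(),

    displayTokens = classmethod(displayTokens)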
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: Explainer <index dir> <query>"
    else:
        indexDir = argv[1]
        queryExpression = argv[2]

        directory = SimpleFSDirectory(File(indexDir))
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            SimpleAnalyzer()).parse(queryExpression)

        print "Query:", queryExpression

        searcher = IndexSearcher(directory)
        scoreDocs = searcher.search(query, 50).scoreDocs

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            explanation = searcher.explain(query, scoreDoc.doc)
            print "----------"
            print doc["title"].encode('utf-8')
            print explanation
class AnalyzerDemo(object):

    examples = [
        "http://www.baidu.com/ www.baidu.com",
        # pre-segmented Chinese sample text, roughly: "contact / this site /
        # copyright / all (rights) / Shanghai Jiao Tong University BBS /
        # yinshuisiyuan station / Hu ICP license 020861"
        "联系 本站 版权 所有 上海 交通 大学BBS 饮水思源 站 沪ICP备020861".decode('gbk')
    ]

    analyzers = [
        WhitespaceAnalyzer(),
        SimpleAnalyzer(),
        StopAnalyzer(Version.LUCENE_CURRENT),
        StandardAnalyzer(Version.LUCENE_CURRENT),
        CJKAnalyzer(Version.LUCENE_CURRENT)
    ]

    def main(cls, argv):
        # Use the embedded example strings, unless
        # command line arguments are specified, then use those.
        strings = cls.examples
        if len(argv) > 1:
            strings = argv[1:]

        for string in strings:
            cls.analyze(string)

    def analyze(cls, text):
        print 'Analyzing "%s"' % (text)

        for analyzer in cls.analyzers:
            name = type(analyzer).__name__
            print " %s:" % (name),
            AnalyzerUtils.displayTokens(analyzer, text)
            print
        print

    main = classmethod(main)
    analyze = classmethod(analyze)
def getAnalyzer(self):
    return SimpleAnalyzer()