def setUp(self):
    """Build a two-document in-memory index over field "f" and expose one
    SpanTermQuery attribute per word the span tests combine."""
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Two sample sentences, one document each.
    for text in ("the quick brown fox jumps over the lazy dog",
                 "the quick red fox jumps over the sleepy cat"):
        doc = Document()
        doc.add(Field("f", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)
    # self.quick, self.brown, ... each hold a SpanTermQuery on field "f".
    for word in ("quick", "brown", "red", "fox",
                 "lazy", "sleepy", "dog", "cat"):
        setattr(self, word, SpanTermQuery(Term("f", word)))
def search_image(command):
    """Search the image index for `command` and return a list of result dicts.

    Each dict carries url / imgurl / urltitle plus a highlighted `contents`
    snippet.  Tokens of the form "site:value" restrict by site; all other
    tokens are segmented with jieba and matched against "contents".
    """
    # Robustness: treat any whitespace-only query as empty (the original
    # only special-cased the single string ' ').
    if not command.strip():
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    # Accumulate per-field query text; free text goes to 'contents'.
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for token in command.split(' '):
        if ':' in token:
            opt, value = token.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(token)
            command_dict[opt] = (command_dict.get(opt, '') + ' '
                                 + " ".join(seg_list))
    # AND together one sub-query per field.
    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            query = WildcardQuery(Term(k, '*' + v))
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # BUG FIX: Analyzer.tokenStream(fieldName, reader) takes the field
        # *name* first; the original passed the field's text instead.
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs
def setUp(self):
    """Index one document per user: an exact-match "owner" key plus
    analyzed "keywords" text."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    people = [("elwood", "elwoods sensitive info"),
              ("jake", "jakes sensitive info")]
    for owner, keywords in people:
        document = Document()
        document.add(Field("owner", owner,
                           Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(Field("keywords", keywords,
                           Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(document)
    writer.close()
def init():
    """Start the JVM and bind the module-level searcher/analyzer globals
    for the question index."""
    global searcher, analyzer, vm
    vm = initVM()
    store_dir = "index_qst"
    index_directory = SimpleFSDirectory(File(store_dir))
    searcher = IndexSearcher(index_directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
def setUp(self):
    """Create the shared span-test fixture: two documents in field "f",
    a read-only searcher/reader, and one SpanTermQuery per keyword."""
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sentences = ["the quick brown fox jumps over the lazy dog",
                 "the quick red fox jumps over the sleepy cat"]
    for sentence in sentences:
        doc = Document()
        doc.add(Field("f", sentence, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)
    # Attribute name doubles as the indexed term.
    for term_text in ("quick", "brown", "red", "fox",
                      "lazy", "sleepy", "dog", "cat"):
        setattr(self, term_text, SpanTermQuery(Term("f", term_text)))
def init(): global STORE_DIR, directory, searcher, analyzer, vm_env STORE_DIR = "index_lucene_v3_highlight" if (vm_env == None): vm_env = initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', VERSION directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(directory, True) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
def Searchfile(command, prior, page, RPP):
    """Run `command` against the answer index and return one page of hits.

    Returns (results-for-page, total-hit-count); `page` is 1-based and
    `RPP` is the number of results per page.
    """
    directory = SimpleFSDirectory(File("index_ans"))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    store = run(searcher, analyzer, command, prior)
    searcher.close()
    first = (page - 1) * RPP
    return store[first:first + RPP], len(store)
def indexSingleFieldDocs(self, fields):
    """Write one single-field document per entry of `fields` into
    self.directory, then optimize and close the writer."""
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for single_field in fields:
        document = Document()
        document.add(single_field)
        writer.addDocument(document)
    writer.optimize()
    writer.close()
def setUp(self):
    """Index a single sample sentence and open a searcher over it."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(
        Field("field", "the quick brown fox jumped over the lazy dog",
              Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
    # Consistency fix: every other fixture in this file opens the searcher
    # read-only via IndexSearcher(dir, True); match that here.
    self.searcher = IndexSearcher(directory, True)
def setUp(self):
    """Index four restaurants at (x, y) grid points and prepare the
    type:restaurant query the distance tests reuse."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    restaurants = [("El Charro", 1, 2),
                   ("Cafe Poca Cosa", 5, 9),
                   ("Los Betos", 9, 6),
                   ("Nico's Taco Shop", 3, 8)]
    for name, x, y in restaurants:
        self.addPoint(writer, name, "restaurant", x, y)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.query = TermQuery(Term("type", "restaurant"))
def setUp(self):
    """Index 500 documents whose "id" field is the zero-padded number
    1..500 (exact-match, not analyzed)."""
    self.analyzer = WhitespaceAnalyzer()
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)
    for n in xrange(1, 501):
        document = Document()
        document.add(Field("id", NumberUtils.pad(n),
                           Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(document)
    writer.close()
def main(cls, argv): if len(argv) != 3: print "Usage: T9er <WordNet index dir> <t9 index>" return for key in cls.keys: c = key[0] k = key[1:] for kc in k: cls.keyMap[kc] = c print kc, "=", c indexDir = argv[1] t9dir = argv[2] reader = IndexReader.open(indexDir) numDocs = reader.maxDoc() print "Processing", numDocs, "words" writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True) for id in xrange(reader.maxDoc()): origDoc = reader.document(id) word = origDoc.get("word") if word is None or len(word) == 0: continue newDoc = Document() newDoc.add( Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED)) newDoc.add( Field("t9", cls.t9(word), Field.Store.YES, Field.Index.UN_TOKENIZED)) newDoc.add( Field("length", str(len(word)), Field.Store.NO, Field.Index.UN_TOKENIZED)) writer.addDocument(newDoc) if id % 100 == 0: print "Document", id writer.optimize() writer.close() reader.close()
def _createIndex(self, inputDF, colname):
    """Build and return an in-memory Lucene index over one DataFrame column.

    Iterates `inputDF` row by row and indexes the value in `colname` via
    self._addDoc.  WhitespaceAnalyzer is used by default; other analyzers
    are also available.
    """
    index_dir = RAMDirectory()
    writer = IndexWriter(index_dir, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    # One document per row, built inline from the requested column.
    inputDF.apply(lambda row: self._addDoc(row[colname], writer), axis=1)
    writer.optimize()
    writer.close()
    return index_dir
def setUp(self):
    """Index two short sentences in field "field" and open a read-only
    searcher over them."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sentences = ("the quick brown fox jumped over the lazy dog",
                 "the fast fox hopped over the hound")
    for sentence in sentences:
        doc = Document()
        doc.add(Field("field", sentence,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(directory, True)
def testAnalyzer(self):
    """A path-like term survives only with a per-field whitespace analyzer."""
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    queryString = "category:/philosophy/eastern"
    # StandardAnalyzer treats '/' as a token boundary, so the path is
    # split and auto-generated into a phrase query.
    parser = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer)
    parser.setAutoGeneratePhraseQueries(True)
    query = parser.parse(queryString)
    self.assertEqual("category:\"philosophy eastern\"",
                     query.toString("contents"),
                     "path got split, yikes!")
    # Whitespace analysis for just the "category" field keeps the path whole.
    perFieldAnalyzer = PerFieldAnalyzerWrapper(analyzer)
    perFieldAnalyzer.addAnalyzer("category", WhitespaceAnalyzer())
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        perFieldAnalyzer).parse(queryString)
    self.assertEqual("category:/philosophy/eastern",
                     query.toString("contents"),
                     "leave category field alone")
class AnalyzerDemo(object): examples = [ "The quick brown fox jumped over the lazy dogs", "XY&Z Corporation - [email protected]" ] analyzers = [ WhitespaceAnalyzer(), SimpleAnalyzer(), StopAnalyzer(Version.LUCENE_CURRENT), StandardAnalyzer(Version.LUCENE_CURRENT) ] def main(cls, argv): # Use the embedded example strings, unless # command line arguments are specified, then use those. strings = cls.examples if len(argv) > 1: strings = argv[1:] for string in strings: cls.analyze(string) def analyze(cls, text): print 'Analyzing "%s"' % (text) for analyzer in cls.analyzers: name = type(analyzer).__name__ print " %s:" % (name), AnalyzerUtils.displayTokens(analyzer, text) print print main = classmethod(main) analyze = classmethod(analyze)
def setUp(self):
    """Split an a-z list of animals across two indexes (a-m and n-z) and
    open one searcher per index for the multi-searcher tests."""
    animals = [
        "aardvark", "beaver", "coati", "dog", "elephant", "frog",
        "gila monster", "horse", "iguana", "javelina", "kangaroo",
        "lemur", "moose", "nematode", "orca", "python", "quokka",
        "rat", "scorpion", "tarantula", "uromastyx", "vicuna",
        "walrus", "xiphias", "yak", "zebra"
    ]
    analyzer = WhitespaceAnalyzer()
    aTOmDirectory = RAMDirectory()
    nTOzDirectory = RAMDirectory()
    aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    for animal in animals:
        doc = Document()
        doc.add(Field("animal", animal,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        # First letter decides which half-index the animal lands in.
        if animal[0].lower() < "n":
            aTOmWriter.addDocument(doc)
        else:
            nTOzWriter.addDocument(doc)
    aTOmWriter.close()
    nTOzWriter.close()
    # Consistency fix: open read-only searchers, matching the
    # IndexSearcher(dir, True) convention used by the other fixtures.
    self.searchers = [IndexSearcher(aTOmDirectory, True),
                      IndexSearcher(nTOzDirectory, True)]
class AnalyzerDemo(object): examples = ["http://www.baidu.com/ www.baidu.com", "联系 本站 版权 所有 上海 交通 大学BBS 饮水思源 站 沪ICP备020861".decode('gbk')] analyzers = [WhitespaceAnalyzer(), SimpleAnalyzer(), StopAnalyzer(Version.LUCENE_CURRENT), StandardAnalyzer(Version.LUCENE_CURRENT), CJKAnalyzer(Version.LUCENE_CURRENT)] def main(cls, argv): # Use the embedded example strings, unless # command line arguments are specified, then use those. strings = cls.examples if len(argv) > 1: strings = argv[1:] for string in strings: cls.analyze(string) def analyze(cls, text): print 'Analyzing "%s"' %(text) for analyzer in cls.analyzers: name = type(analyzer).__name__ print " %s:" %(name), AnalyzerUtils.displayTokens(analyzer, text) print print main = classmethod(main) analyze = classmethod(analyze)
class SpanQueryTest(TestCase):
    """Span-query demos: two one-sentence documents in field "f", plus a
    dumpSpans() helper that prints where each span matches."""

    def setUp(self):
        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()
        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(
            Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        doc = Document()
        doc.add(
            Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()
        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)
        # One SpanTermQuery per word the tests combine.
        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))

    def assertOnlyBrownFox(self, query):
        # Exactly one hit, and it is document 0 (the "brown fox" sentence).
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(0, topDocs.scoreDocs[0].doc, "wrong doc")

    def assertBothFoxes(self, query):
        # Both documents match.
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(2, topDocs.totalHits)

    def assertNoMatches(self, query):
        topDocs = self.searcher.search(query, 50)
        self.assertEquals(0, topDocs.totalHits)

    def testSpanTermQuery(self):
        self.assertOnlyBrownFox(self.brown)
        self.dumpSpans(self.brown)

    def testSpanFirstQuery(self):
        # "brown" is the third token: an end bound of 2 misses it,
        # a bound of 3 finds it.
        sfq = SpanFirstQuery(self.brown, 2)
        self.assertNoMatches(sfq)
        self.dumpSpans(sfq)
        sfq = SpanFirstQuery(self.brown, 3)
        self.dumpSpans(sfq)
        self.assertOnlyBrownFox(sfq)

    def testSpanNearQuery(self):
        # In-order near query over quick..brown..dog at varying slop.
        quick_brown_dog = [self.quick, self.brown, self.dog]
        snq = SpanNearQuery(quick_brown_dog, 0, True)
        self.assertNoMatches(snq)
        self.dumpSpans(snq)
        snq = SpanNearQuery(quick_brown_dog, 4, True)
        self.assertNoMatches(snq)
        self.dumpSpans(snq)
        snq = SpanNearQuery(quick_brown_dog, 5, True)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)
        # interesting - even a sloppy phrase query would require
        # more slop to match
        snq = SpanNearQuery([self.lazy, self.fox], 3, False)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)
        pq = PhraseQuery()
        pq.add(Term("f", "lazy"))
        pq.add(Term("f", "fox"))
        pq.setSlop(4)
        self.assertNoMatches(pq)
        pq.setSlop(5)
        self.assertOnlyBrownFox(pq)

    def testSpanNotQuery(self):
        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        self.assertBothFoxes(quick_fox)
        self.dumpSpans(quick_fox)
        # Excluding "dog" removes nothing: "dog" is not inside the span.
        quick_fox_dog = SpanNotQuery(quick_fox, self.dog)
        self.assertBothFoxes(quick_fox_dog)
        self.dumpSpans(quick_fox_dog)
        # Excluding "red" drops the red-fox document.
        no_quick_red_fox = SpanNotQuery(quick_fox, self.red)
        self.assertOnlyBrownFox(no_quick_red_fox)
        self.dumpSpans(no_quick_red_fox)

    def testSpanOrQuery(self):
        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        lazy_dog = SpanNearQuery([self.lazy, self.dog], 0, True)
        sleepy_cat = SpanNearQuery([self.sleepy, self.cat], 0, True)
        qf_near_ld = SpanNearQuery([quick_fox, lazy_dog], 3, True)
        self.assertOnlyBrownFox(qf_near_ld)
        self.dumpSpans(qf_near_ld)
        qf_near_sc = SpanNearQuery([quick_fox, sleepy_cat], 3, True)
        self.dumpSpans(qf_near_sc)
        # OR of the two near-queries matches both documents.
        orQ = SpanOrQuery([qf_near_ld, qf_near_sc])
        self.assertBothFoxes(orQ)
        self.dumpSpans(orQ)

    def testPlay(self):
        # Scratch test: just dump spans for a few query shapes.
        orQ = SpanOrQuery([self.quick, self.fox])
        self.dumpSpans(orQ)
        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        sfq = SpanFirstQuery(quick_fox, 4)
        self.dumpSpans(sfq)
        self.dumpSpans(SpanTermQuery(Term("f", "the")))
        quick_brown = SpanNearQuery([self.quick, self.brown], 0, False)
        self.dumpSpans(quick_brown)

    def dumpSpans(self, query):
        # Debug helper: re-tokenise each matching document and print it
        # with the matched span wrapped in <...>, followed by the score.
        spans = query.getSpans(self.reader)
        print "%s:" % query
        numSpans = 0
        scoreDocs = self.searcher.search(query, 50).scoreDocs
        scores = [0, 0]
        for scoreDoc in scoreDocs:
            scores[scoreDoc.doc] = scoreDoc.score
        while spans.next():
            numSpans += 1
            id = spans.doc()
            doc = self.reader.document(id)
            # for simplicity - assume tokens are in sequential,
            # positions, starting from 0
            # NOTE(review): the field name passed here ("contents") does not
            # match the indexed field "f"; appears harmless for this
            # analyzer, but confirm.
            stream = self.analyzer.tokenStream("contents",
                                               StringReader(doc.get("f")))
            term = stream.addAttribute(TermAttribute.class_)
            buffer = StringIO()
            buffer.write(" ")
            i = 0
            while stream.incrementToken():
                if i == spans.start():
                    buffer.write("<")
                buffer.write(term.term())
                if i + 1 == spans.end():
                    buffer.write(">")
                buffer.write(" ")
                i += 1
            buffer.write("(")
            buffer.write(str(scores[id]))
            buffer.write(") ")
            print buffer.getvalue()
            # print self.searcher.explain(query, id)
        if numSpans == 0:
            print " No spans"
        print ''
def init():
    """Bind the module-level globals used by the image searcher."""
    global STORE_DIR, directory, searcher, analyzer
    STORE_DIR = "image_index_v3"
    # Analyzer has no dependency on the directory, so build it first.
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
def getAnalyzer(self):
    """Analyzer used by this fixture: whitespace-only tokenisation."""
    analyzer = WhitespaceAnalyzer()
    return analyzer
def setUp(self):
    """Extend the inherited fixture with a read-only searcher and a
    whitespace analyzer."""
    super(QueryParserTest, self).setUp()
    # self.directory is created by the base class's setUp.
    self.searcher = IndexSearcher(self.directory, True)
    self.analyzer = WhitespaceAnalyzer()
class SpanQueryTest(TestCase):
    """Span-query demos over two one-sentence documents in field "f";
    dumpSpans() prints each match with the span marked inline."""

    def setUp(self):
        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()
        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        doc = Document()
        doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.close()
        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)
        # One SpanTermQuery per word the tests combine.
        self.quick = SpanTermQuery(Term("f", "quick"))
        self.brown = SpanTermQuery(Term("f", "brown"))
        self.red = SpanTermQuery(Term("f", "red"))
        self.fox = SpanTermQuery(Term("f", "fox"))
        self.lazy = SpanTermQuery(Term("f", "lazy"))
        self.sleepy = SpanTermQuery(Term("f", "sleepy"))
        self.dog = SpanTermQuery(Term("f", "dog"))
        self.cat = SpanTermQuery(Term("f", "cat"))

    def assertOnlyBrownFox(self, query):
        # One hit, and it is document 0 (the "brown fox" sentence).
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits)
        self.assertEqual(0, topDocs.scoreDocs[0].doc, "wrong doc")

    def assertBothFoxes(self, query):
        # Both documents match.
        topDocs = self.searcher.search(query, 50)
        self.assertEqual(2, topDocs.totalHits)

    def assertNoMatches(self, query):
        topDocs = self.searcher.search(query, 50)
        self.assertEquals(0, topDocs.totalHits)

    def testSpanTermQuery(self):
        self.assertOnlyBrownFox(self.brown)
        self.dumpSpans(self.brown)

    def testSpanFirstQuery(self):
        # "brown" is the third token: an end bound of 2 misses it,
        # a bound of 3 finds it.
        sfq = SpanFirstQuery(self.brown, 2)
        self.assertNoMatches(sfq)
        self.dumpSpans(sfq)
        sfq = SpanFirstQuery(self.brown, 3)
        self.dumpSpans(sfq)
        self.assertOnlyBrownFox(sfq)

    def testSpanNearQuery(self):
        # In-order near query over quick..brown..dog at varying slop.
        quick_brown_dog = [self.quick, self.brown, self.dog]
        snq = SpanNearQuery(quick_brown_dog, 0, True)
        self.assertNoMatches(snq)
        self.dumpSpans(snq)
        snq = SpanNearQuery(quick_brown_dog, 4, True)
        self.assertNoMatches(snq)
        self.dumpSpans(snq)
        snq = SpanNearQuery(quick_brown_dog, 5, True)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)
        # interesting - even a sloppy phrase query would require
        # more slop to match
        snq = SpanNearQuery([self.lazy, self.fox], 3, False)
        self.assertOnlyBrownFox(snq)
        self.dumpSpans(snq)
        pq = PhraseQuery()
        pq.add(Term("f", "lazy"))
        pq.add(Term("f", "fox"))
        pq.setSlop(4)
        self.assertNoMatches(pq)
        pq.setSlop(5)
        self.assertOnlyBrownFox(pq)

    def testSpanNotQuery(self):
        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        self.assertBothFoxes(quick_fox)
        self.dumpSpans(quick_fox)
        # Excluding "dog" removes nothing: it lies outside the span.
        quick_fox_dog = SpanNotQuery(quick_fox, self.dog)
        self.assertBothFoxes(quick_fox_dog)
        self.dumpSpans(quick_fox_dog)
        # Excluding "red" drops the red-fox document.
        no_quick_red_fox = SpanNotQuery(quick_fox, self.red)
        self.assertOnlyBrownFox(no_quick_red_fox)
        self.dumpSpans(no_quick_red_fox)

    def testSpanOrQuery(self):
        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        lazy_dog = SpanNearQuery([self.lazy, self.dog], 0, True)
        sleepy_cat = SpanNearQuery([self.sleepy, self.cat], 0, True)
        qf_near_ld = SpanNearQuery([quick_fox, lazy_dog], 3, True)
        self.assertOnlyBrownFox(qf_near_ld)
        self.dumpSpans(qf_near_ld)
        qf_near_sc = SpanNearQuery([quick_fox, sleepy_cat], 3, True)
        self.dumpSpans(qf_near_sc)
        # OR of the two near-queries matches both documents.
        orQ = SpanOrQuery([qf_near_ld, qf_near_sc])
        self.assertBothFoxes(orQ)
        self.dumpSpans(orQ)

    def testPlay(self):
        # Scratch test: just dump spans for a few query shapes.
        orQ = SpanOrQuery([self.quick, self.fox])
        self.dumpSpans(orQ)
        quick_fox = SpanNearQuery([self.quick, self.fox], 1, True)
        sfq = SpanFirstQuery(quick_fox, 4)
        self.dumpSpans(sfq)
        self.dumpSpans(SpanTermQuery(Term("f", "the")))
        quick_brown = SpanNearQuery([self.quick, self.brown], 0, False)
        self.dumpSpans(quick_brown)

    def dumpSpans(self, query):
        # Debug helper: re-tokenise each matching document and print it
        # with the matched span wrapped in <...>, followed by the score.
        spans = query.getSpans(self.reader)
        print "%s:" % query
        numSpans = 0
        scoreDocs = self.searcher.search(query, 50).scoreDocs
        scores = [0, 0]
        for scoreDoc in scoreDocs:
            scores[scoreDoc.doc] = scoreDoc.score
        while spans.next():
            numSpans += 1
            id = spans.doc()
            doc = self.reader.document(id)
            # for simplicity - assume tokens are in sequential,
            # positions, starting from 0
            # NOTE(review): the field name passed here ("contents") does not
            # match the indexed field "f"; appears harmless for this
            # analyzer, but confirm.
            stream = self.analyzer.tokenStream("contents",
                                               StringReader(doc.get("f")))
            term = stream.addAttribute(TermAttribute.class_)
            buffer = StringIO()
            buffer.write(" ")
            i = 0
            while stream.incrementToken():
                if i == spans.start():
                    buffer.write("<")
                buffer.write(term.term())
                if i + 1 == spans.end():
                    buffer.write(">")
                buffer.write(" ")
                i += 1
            buffer.write("(")
            buffer.write(str(scores[id]))
            buffer.write(") ")
            print buffer.getvalue()
            # print self.searcher.explain(query, id)
        if numSpans == 0:
            print " No spans"
        print ""
Field.Index.NOT_ANALYZED)) domin = urlparse.urlsplit(url)[1].split(':')[0] doc.add( Field("site", domin, Field.Store.YES, Field.Index.NOT_ANALYZED)) title = files[filename][1] doc.add( Field("title", title, Field.Store.YES, Field.Index.NOT_ANALYZED)) print filename, path, url, domin, title if len(contents) > 0: doc.add( Field("contents", contents, Field.Store.YES, Field.Index.ANALYZED)) else: print("warning: no content in %s" % filename) writer.addDocument(doc) # except Exception, e: # print "Failed in indexDocs:", e if __name__ == '__main__': #lucene.initVM() # lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION start = datetime.now() IndexFiles('ori_txt', "index_lucene_v3_highlight", WhitespaceAnalyzer(Version.LUCENE_CURRENT)) end = datetime.now() print end - start
def index(cls, indexDir, taxoDir): """Create an index, and adds to it sample documents and facets. indexDir Directory in which the index should be created. taxoDir Directory in which the taxonomy index should be created. """ # create and open an index writer ver = lucene.Version.LUCENE_35 config = IndexWriterConfig(ver, WhitespaceAnalyzer(ver)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # create and open a taxonomy writer taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE) # loop over sample documents nDocsAdded = 0 nFacetsAdded = 0 for docNum in range(len(docTexts)): # obtain the sample facets for current document facets = categories[docNum] facetList = [createCategoryPath(f) for f in facets] # NOTE: setCategoryPaths() requires an Iterable, so need to convert the # Python list in order to to pass a proper argument to setCategoryPaths. # We use java.util.Arrays (via JCC) to create a Java List. # see http://docs.oracle.com/javase/1.5.0/docs/api/java/util/Arrays.html#asList(T...) 
facetList = lucene.Arrays.asList(facetList) # NOTE: we could use lucene.collections here as well in order to convert our # Python list to a Java based list using the JavaList class (JavaList implements # java.util.List around a Python list instance it wraps): # from lucene.collections import JavaList # facetList = JavaList(facetList) # we do not alter indexing parameters # a category document builder will add the categories to a document once build() is called categoryDocBuilder = CategoryDocumentBuilder( taxo).setCategoryPaths(facetList) # create a plain Lucene document and add some regular Lucene fields to it doc = Document() doc.add( Field(TITLE, docTitles[docNum], Field.Store.YES, Field.Index.ANALYZED)) doc.add( Field(TEXT, docTexts[docNum], Field.Store.NO, Field.Index.ANALYZED)) # invoke the category document builder for adding categories to the document and, # as required, to the taxonomy index categoryDocBuilder.build(doc) # finally add the document to the index iw.addDocument(doc) nDocsAdded += 1 nFacetsAdded += facetList.size() # end for # commit changes. # we commit changes to the taxonomy index prior to committing them to the search index. # this is important, so that all facets referred to by documents in the search index # will indeed exist in the taxonomy index. taxo.commit() iw.commit() # close the taxonomy index and the index - all modifications are # now safely in the provided directories: indexDir and taxoDir. taxo.close() iw.close() print "Indexed %d documents with overall %d facets." % (nDocsAdded, nFacetsAdded)
ls = line.split('seg^*') url = ls[0] title = ls[1] src = ls[2] alt = ls[3] picDict[src] = [url,title,alt] f.close() for src in picDict: doc = Document() doc.add(Field("src", src, Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("url", picDict[src][0], Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("title", picDict[src][1], Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("alt", picDict[src][2], Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) if __name__ == '__main__': lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION start = datetime.now() IndexFiles('txt', "image_index_v3", WhitespaceAnalyzer(Version.LUCENE_CURRENT)) end = datetime.now() print end - start