def __init__(self, indexDir):
    self.directory = SimpleFSDirectory(Paths.get(indexDir))
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
    self.nameQueryParser = QueryParser('name', StandardAnalyzer())
    self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
    self.idQueryParser = QueryParser('id', StandardAnalyzer())
    self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)
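# A minimal usage sketch for the two parsers configured above; `findByName`
# and the top-10 limit are illustrative assumptions, not part of the
# original class.
def findByName(self, name):
    query = self.nameQueryParser.parse(name)
    scoreDocs = self.searcher.search(query, 10).scoreDocs
    return [self.searcher.doc(sd.doc) for sd in scoreDocs]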
def GET(self):
    user_data = web.input()
    message = user_data.keyword
    # a long keyword ending in an image extension is treated as a picture URL
    if len(message) > 10 and message[-3:] in ('png', 'jpg'):
        urlretrieve(message, 'target.jpg')
        lis1 = shit.LSH('target.jpg')
        lis = []
        vm_env.attachCurrentThread()
        STORE_DIR = 'index'
        directory = SimpleFSDirectory(File(STORE_DIR))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        for i in range(len(lis1)):
            lis.append(run(searcher, analyzer, lis1[i])[0])
    else:
        a = func(user_data.keyword)
        STORE_DIR = 'index'
        vm_env.attachCurrentThread()
        directory = SimpleFSDirectory(File(STORE_DIR))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        lis = run(searcher, analyzer, a)
    f = login
    return render.movies(f, lis)
def search(self, field: str):
    sear = self._search
    if len(self._commandInfo.getKey()) == 0 or self._commandInfo.getKey()[0] in ['-', '~']:
        query = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
    elif self._commandInfo.getKey()[0] == '#':
        query1 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[0]))
        query2 = QueryParser(field, StandardAnalyzer()).parse(make_parser(self._commandInfo.getWordList()[1]))
        bc1 = BooleanClause(query1, BooleanClause.Occur.MUST)
        bc2 = BooleanClause(query2, BooleanClause.Occur.MUST)
        query = BooleanQuery.Builder().add(bc1).add(bc2).build()
    elif self._commandInfo.getKey()[0] in ['$', '+']:
        bq = BooleanQuery.Builder()
        for w in self._commandInfo.getWordList():
            queryx = QueryParser(field, StandardAnalyzer()).parse(make_parser(w))
            bq.add(BooleanClause(queryx, BooleanClause.Occur.MUST))
        query = bq.build()
    else:
        query = ''
    hits = sear.search(query, 999999)
    for hit in hits.scoreDocs:
        doc = sear.doc(hit.doc)
        res = doc.get(field)
        id = doc.get(field + '_id')
        if doc_hit(res, self._commandInfo):
            # drop empty fragments; the original lazy map(...pop...) never ran
            sentences = [s for s in re.split('[!?!?。]', res) if s]
            for sentence in sentences:
                if key_filter(self._commandInfo, sentence):
                    self._doc[id] = res
                    self._resultSentencesList.append((id, sentence))
    return self
def __init__(self, fs_directory):
    directory = SimpleFSDirectory(Paths.get(fs_directory))
    self.index_reader = DirectoryReader.open(directory)
    # reuse the reader instead of opening the directory a second time
    self.searcher = IndexSearcher(self.index_reader)
    self.analyzer = StandardAnalyzer()
    self.query = None
    self.lucene_dictionary = LuceneDictionary(self.index_reader, 'contents')
    self.formatter = SimpleHTMLFormatter()
    self.hits = None
def retrival_answer(MAX):
    lucene.initVM()
    directory = RAMDirectory()
    indexDir = SimpleFSDirectory(Paths.get('index'))  # unused: the index below lives in RAM
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            candidate_doc.append(doc.get("text"))

        # a query counts as answered if any candidate fuzzy-matches the
        # ground-truth line with a score of at least 89
        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        if any(choice[1] >= 89 for choice in choices):
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))
    print "the final accuracy is:", final_accuracy
def testOverrideBooleanQuery(self):

    class TestQueryParser(BooleanTestMixin, PythonMultiFieldQueryParser):
        def getFieldQuery_quoted(_self, field, queryText, quoted):
            return super(TestQueryParser, _self).getFieldQuery_quoted_super(
                field, queryText, quoted)

    qp = TestQueryParser(['one', 'two'], StandardAnalyzer())
    q = qp.parse("foo bar", ['one', 'two'],
                 [BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD],
                 StandardAnalyzer())
    self.assertEqual(str(q), "(one:foo one:bar) (two:foo two:bar)")
def retrieve(command):
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except ValueError:
        print "JVM running."
    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()

    # convert to an AND query: "foo bar" -> "+foo +bar"
    command = re.sub(r' ', r' +', command)
    command = "+" + command
    print "Searching for:", command
    query = QueryParser("contents", analyzer).parse(command)
    print query
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    retrieved_docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        retrieved_docs.append(os.path.join(doc.get("path"), doc.get("name")))
    del searcher
    return retrieved_docs
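# A standalone sketch of the AND-rewrite used in retrieve(): each space is
# replaced by ' +' and the first term gets a leading '+', so QueryParser
# treats every term as a mandatory clause. (Naive: it assumes single spaces.)
import re
command = re.sub(r' ', r' +', 'foo bar baz')
command = "+" + command
assert command == '+foo +bar +baz'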
def create_index(self, index_folder, docs_path, add_terms=False):
    os.mkdir(index_folder)

    self.t1 = FieldType()  # stored, indexed without frequencies
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)

    self.t2 = FieldType()  # indexed-only, with term frequencies
    self.t2.setStored(False)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    self.t3 = FieldType()  # stored-only, not searchable
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)

    fsDir = MMapDirectory(Paths.get(index_folder))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    self.writer = IndexWriter(fsDir, writerConfig)
    print "%d docs in index" % self.writer.numDocs()
    print "Indexing documents..."

    doc_id = 0
    import corpus_hdf5
    corpus = corpus_hdf5.CorpusHDF5(docs_path)
    for txt in corpus.get_text_iter():
        title = corpus.get_article_title(doc_id)
        self.add_doc(doc_id, title, txt, add_terms)
        if doc_id % 1000 == 0:
            print 'indexing doc', doc_id
        doc_id += 1
    print "Index of %d docs..." % self.writer.numDocs()
    self.writer.close()
def main():
    INDEX_DIR = "indexes"
    try:
        print "Indexing..."
        indexDir = File("/Users/Raphael/Downloads/stackoverflow1107")
        # writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)

        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "methods_called": KeywordAnalyzer(),
            "view_count": KeywordAnalyzer(),
            "code_hints": JavaCodeAnalyzer()
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indexDir), config)

        index_code_snippet(writer)
        writer.commit()
        writer.close()
        print "Done"
    except CorruptIndexException as e:  # index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # another writer holds the index lock
        e.printStackTrace()
    except IOException as e:  # directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  # database error
        e.printStackTrace()
def __init__(self, index_store_path):
    store = NIOFSDirectory(Paths.get(index_store_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    self.writer = IndexWriter(store, config)
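# A minimal sketch of feeding the writer built above; the method name and
# the 'content'/'path' field names are illustrative assumptions.
def add_document(self, content, path):
    doc = Document()
    doc.add(Field("content", content, TextField.TYPE_STORED))
    doc.add(Field("path", path, StringField.TYPE_STORED))
    self.writer.addDocument(doc)
    self.writer.commit()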
def setUp(self):
    super(PyLuceneThreadTestCase, self).setUp()

    self.classLoader = Thread.currentThread().getContextClassLoader()

    writer = self.getWriter(analyzer=StandardAnalyzer())
    doc1 = Document()
    doc2 = Document()
    doc3 = Document()
    doc4 = Document()
    doc1.add(Field("field", "one", TextField.TYPE_STORED))
    doc2.add(Field("field", "two", TextField.TYPE_STORED))
    doc3.add(Field("field", "three", TextField.TYPE_STORED))
    doc4.add(Field("field", "one", TextField.TYPE_STORED))
    writer.addDocument(doc1)
    writer.addDocument(doc2)
    writer.addDocument(doc3)
    writer.addDocument(doc4)
    writer.commit()
    writer.close()

    # ('term', expected hit count) pairs, repeated to stress the searcher
    self.testData = [('one', 2), ('two', 1), ('three', 1), ('five', 0)] * 500
    self.lock = threading.Lock()
    self.totalQueries = 0
def testGiga(self):
    w = self.getWriter(analyzer=StandardAnalyzer())

    self._addDoc("Lucene in Action", w)
    self._addDoc("Lucene for Dummies", w)
    self._addDoc("Giga byte", w)
    self._addDoc("ManagingGigabytesManagingGigabyte", w)
    self._addDoc("ManagingGigabytesManagingGigabytes", w)
    self._addDoc("The Art of Computer Science", w)
    self._addDoc("J. K. Rowling", w)
    self._addDoc("JK Rowling", w)
    self._addDoc("Joanne K Roling", w)
    self._addDoc("Bruce Willis", w)
    self._addDoc("Willis bruce", w)
    self._addDoc("Brute willis", w)
    self._addDoc("B. willis", w)

    r = w.getReader()
    w.close()

    # maxEdits=0 turns the fuzzy query into an exact term match
    q = FuzzyQuery(Term("field", "giga"), 0)

    searcher = self.getSearcher(reader=r)
    hits = searcher.search(q, 10).scoreDocs
    self.assertEqual(1, len(hits))
    self.assertEqual("Giga byte", searcher.doc(hits[0].doc).get("field"))
def __init__(self, index_path, query=None):
    self.index_path = index_path
    self.reader = None
    self.query = query
    self.porter_analyzer = PorterAnalyzer(
        StandardAnalyzer(Version.LUCENE_CURRENT))
    self.load_index()
def __init__(self, index_path, method, logger=None, use_default_similarity=False):
    self.index_path = index_path
    directory = SimpleFSDirectory(File(self.index_path))
    self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
    self.reader = DirectoryReader.open(directory)
    self.searcher = IndexSearcher(self.reader)

    # select the parser, similarity and weighting behaviour
    if use_default_similarity:
        self.query_parser = QueryParser
        similarity = DefaultSimilarity()
        self.useExplainQuery = False
    else:
        self.query_parser = FieldAgnosticQueryParser
        similarity = FieldAgnosticSimilarity()
        self.useExplainQuery = True
        # by default, FieldAgnosticSimilarity uses the coord factor; it can
        # be disabled with: similarity.useCoord = False
    self.searcher.setSimilarity(similarity)
    self.method = method  # never used?
    self.logger = logger
def run_music(ID):
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "id", analyzer).parse(ID)
    scoreDocs = searcher.search(query, 1).scoreDocs
    try:
        scoreDoc = scoreDocs[0]
    except IndexError:  # no document matches this id
        return None

    doc = searcher.doc(scoreDoc.doc)
    item = []
    item.append(doc.get("song_title").encode('utf-8'))
    item.append(doc.get('song_url'))
    item.append(doc.get("singer").encode('utf-8'))
    item.append(doc.get("album").encode('utf-8'))
    item.append(doc.get("album_pic"))
    item.append(doc.get("album_genre").encode('utf-8'))
    item.append(doc.get("lyrics").encode('utf-8'))

    sim_str = doc.get("similar").encode('utf-8')
    sim_list = sim_str.split('+')
    for i in range(3):
        sim_list[i] = sim_list[i].split('*')
    item.append(sim_list)

    del searcher
    return item
def testCompressionTools(self):
    bytes = JArray('byte')(self.binaryValCompressed)
    binaryFldCompressed = StoredField("binaryCompressed",
                                      CompressionTools.compress(bytes))
    stringFldCompressed = StoredField(
        "stringCompressed",
        CompressionTools.compressString(self.binaryValCompressed))

    doc = Document()
    doc.add(binaryFldCompressed)
    doc.add(stringFldCompressed)

    # add the doc to a ram index
    writer = self.getWriter(analyzer=StandardAnalyzer())
    writer.addDocument(doc)
    writer.close()

    # open a reader and fetch the document
    reader = self.getReader()
    docFromReader = reader.document(0)
    self.assertTrue(docFromReader is not None)

    # fetch the binary compressed field and compare its content with
    # the original one
    bytes = CompressionTools.decompress(
        docFromReader.getBinaryValue("binaryCompressed"))
    binaryFldCompressedTest = bytes.string_
    self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)

    self.assertEqual(
        CompressionTools.decompressString(
            docFromReader.getBinaryValue("stringCompressed")),
        self.binaryValCompressed)

    reader.close()
def __init__(self, db_path):
    directory = SimpleFSDirectory(File(db_path))
    reader = DirectoryReader.open(directory)
    self.searcher = IndexSearcher(reader)
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    logger.info("Loaded DB from %s with %d documents",
                db_path, reader.numDocs())
def build_index():
    lucene.initVM()
    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print post_dir
    print index_store_dir

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    store = SimpleFSDirectory(File(index_store_dir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    indexDocs(post_dir, writer)
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def main():
    try:
        indicesDestination = File(dest_path)
        analyzer = KeywordAnalyzer()
        porter_analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        a = {
            "code": porter_analyzer,
            "description": porter_analyzer,
            "typed_method_call": KeywordAnalyzer(),
            "extends": KeywordAnalyzer(),
            "used_classes": KeywordAnalyzer(),
            "methods": KeywordAnalyzer(),
            "class_instance_creation": KeywordAnalyzer(),
            "id": KeywordAnalyzer(),
            "literals": porter_analyzer
        }
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)

        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        counter = Counter()
        generate_indices_from_benchmark(writer, counter)
        writer.close()

        print "All jobs are done.."
        print str(counter)
    except CorruptIndexException as e:  # index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # another writer holds the index lock
        e.printStackTrace()
    except IOException as e:  # directory can't be read/written
        e.printStackTrace()
def shourcut_retriever(keyword):
    '''Retriever: searches for the keyword in the shortcut (summary) field.'''
    global flag
    if flag:
        lucene.initVM()
        flag = False
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_4_10_1, "shortcut", analyzer).parse(keyword)
    MAX = 20
    hits = searcher.search(query, MAX)

    print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    results = []
    for hit in hits.scoreDocs:
        print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        result = [doc.get('shortcut'), doc.get('url'), doc.get('name')]
        print(doc.get('url'))
        results.append(result)
    return results
def __init__(self, indexDir):
    f = Paths.get(indexDir)
    self._dir = SimpleFSDirectory(f)
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self._writer = IndexWriter(self._dir, config)
def testCachingWorks(self):
    writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
    writer.close()

    reader = SlowCompositeReaderWrapper.wrap(self.getReader())
    context = AtomicReaderContext.cast_(reader.getContext())

    class mockFilter(PythonFilter):
        def __init__(self):
            super(mockFilter, self).__init__()
            self._wasCalled = False
        def getDocIdSet(self, context, acceptDocs):
            self._wasCalled = True
            return FixedBitSet(context.reader().maxDoc())
        def clear(self):
            self._wasCalled = False
        def wasCalled(self):
            return self._wasCalled

    filter = mockFilter()
    cacher = CachingWrapperFilter(filter)

    # first time, the nested filter is called
    strongRef = cacher.getDocIdSet(context, context.reader().getLiveDocs())
    self.assert_(filter.wasCalled(), "first time")

    # second time, the nested filter should not be called
    filter.clear()
    cacher.getDocIdSet(context, context.reader().getLiveDocs())
    self.assert_(not filter.wasCalled(), "second time")

    reader.close()
def run(command):
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command)
    scoreDocs = searcher.search(query, 10).scoreDocs
    # print "%s total matching documents." % len(scoreDocs)

    res = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        tmp = []
        tmp.append([doc.get('name1'), doc.get('name2')])
        tmp.append(doc.get("homepage"))
        tmp.append(doc.get("intro"))
        tmp.append(doc.get('logo'))
        a = doc.get('goods')
        a = a.split('\n')
        for i in a:
            tmp.append(i)
        res.append(tmp)
    return command, res
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            # index overlapping windows of sentences, stepping three at a time
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES,
                              Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))

    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def setUp(self):
    super(Test_Bug1763, self).setUp()

    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.d1 = RAMDirectory()
    self.d2 = RAMDirectory()

    w1, w2 = [self.getWriter(directory=d, analyzer=self.analyzer)
              for d in [self.d1, self.d2]]
    doc1 = Document()
    doc2 = Document()
    doc1.add(Field("all", "blah blah double blah Gesundheit",
                   TextField.TYPE_NOT_STORED))
    doc1.add(Field('id', '1', StoredField.TYPE))
    doc2.add(Field("all", "a quick brown test ran over the lazy data",
                   TextField.TYPE_NOT_STORED))
    doc2.add(Field('id', '2', StoredField.TYPE))
    w1.addDocument(doc1)
    w2.addDocument(doc2)
    for w in [w1, w2]:
        w.close()
def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
    """Constructor of Indexer.

    Parameters
    ----------
    index_dir : string
        The location of the lucene index.
    mode : string
        The mode used when opening the lucene index. Available values:
        'create', open a new index, overwriting any existing one;
        'append', open an existing index and append to it;
        'create_or_append', 'append' if `index_dir` exists, else 'create'.
    date_format : string
        Datetime fields are stored as strings; `date_format` specifies how
        a datetime is formatted into a string.
    """
    # self.store = FSDirectory.open(File(index_dir))
    self.store = FSDirectory.open(Paths.get(index_dir))
    # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.analyzer = StandardAnalyzer()
    # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    self.config = IndexWriterConfig(self.analyzer)
    self.mode = mode
    self.date_format = date_format

    if mode == 'create_or_append':
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    elif mode == 'create':
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    elif mode == 'append':
        self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    else:
        raise ValueError('Invalid mode %s' % mode)
    self.writer = IndexWriter(self.store, self.config)
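# A minimal usage sketch for the Indexer constructor above; the index path
# and document content are illustrative assumptions.
indexer = Indexer('data/lucene_index', mode='create_or_append')
doc = Document()
doc.add(Field("content", "hello lucene", TextField.TYPE_STORED))
indexer.writer.addDocument(doc)
indexer.writer.commit()
indexer.writer.close()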
def main():
    LUCENE_INDEX_DIR = 'mmapDirectory/trec_v15_wikipedia_stemmed_v2'
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except ValueError:
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    # config = config.setRAMBufferSizeMB(1024.0)  # experimental setting !!

    # write data to index
    # if True:
    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            os.system('robocopy %s %s\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR))
        else:
            os.system('mkdir %s/code_files' % (LUCENE_INDEX_DIR))
            os.system('cp *.py %s/code_files' % (LUCENE_INDEX_DIR))

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
def build_index(document_path, dir_path):
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    t1 = FieldType()  # tokenized field for the tag text
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()  # untokenized field for the path/url
    t2.setStored(True)
    t2.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")

            document = Document()
            document.add(Field("content", " ".join(music_tags), t1))
            document.add(Field("url", music_path, t2))
            index_writer.addDocument(document)

    index_writer.close()
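# A usage sketch for build_index() under its inferred input format: one
# "path tag1,tag2,..." pair per line. The file names and contents here are
# illustrative assumptions.
with open('music_tags.txt', 'w') as f:
    f.write('/music/songs/track01.mp3 rock,guitar,90s\n')
    f.write('/music/songs/track02.mp3 jazz,piano\n')
build_index('music_tags.txt', 'music_index')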
def retrieve_sents(self):
    indexDir = self.indexDir
    query = self.query

    sent_ind_list = []
    # template = CustomTemplate(format)
    fsDir = SimpleFSDirectory(Paths.get(indexDir))
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    analyzer = StandardAnalyzer()
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.OR)
    query = parser.parse(query)

    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    if self.stats:
        print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (
            len(scoreDocs), duration, query)

    for scoreDoc in scoreDocs:
        sent_ind_list.append(scoreDoc.doc)
    return sent_ind_list
def getWriter(self, store, analyzer=None, create=False):
    if analyzer is None:
        analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    if create:
        # honour the `create` flag; otherwise keep the default CREATE_OR_APPEND
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    return writer
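# A minimal usage sketch for the helper above, as it might be called from a
# test method; the RAMDirectory store and field name are illustrative
# assumptions.
def testAddOneDoc(self):
    writer = self.getWriter(RAMDirectory(), create=True)
    doc = Document()
    doc.add(Field("field", "hello", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()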