def __init__(self, index_store_path):
    """Attach a Lucene IndexWriter to the index at *index_store_path*.

    The index is opened in CREATE_OR_APPEND mode: an existing index is
    extended, a missing one is created from scratch.
    """
    directory = NIOFSDirectory(Paths.get(index_store_path))
    writer_config = IndexWriterConfig(StandardAnalyzer())
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    self.writer = IndexWriter(directory, writer_config)
def run(self): print "Booting lucene driver worker...." lucene.initVM() self.fieldType1 = FieldType() self.fieldType1.setIndexed(True) self.fieldType1.setStored(False) self.fieldType1.setTokenized(True) self.fieldType2 = FieldType() self.fieldType2.setIndexed(True) self.fieldType2.setStored(True) self.fieldType2.setTokenized(False) while(True): data = self.queue.get() da = data[1] response = None try: self.fil = File(da['data']['indexdir']) self.d = NIOFSDirectory(self.fil) self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) self.conf = IndexWriterConfig( Version.LUCENE_CURRENT, self.analyzer) response = getattr(self, da['action'])(da['data']) self.d.close() except Exception as e: print e if response is None: response = {} self.ret[data[0]] = response
def __init__(self, corpusPath, storeDir):
    """Build a fresh Lucene index for *corpusPath* under *storeDir*.

    The store directory is created if missing, the index is opened in
    CREATE mode (overwriting any previous index), and a Ticker thread
    prints progress while the final commit runs.
    """
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    directory = NIOFSDirectory(Paths.get(storeDir))
    writer_config = IndexWriterConfig(StandardAnalyzer())
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(directory, writer_config)
    self.indexDocs(corpusPath, index_writer)

    progress = Ticker()
    print('commit index')
    threading.Thread(target=progress.run).start()
    index_writer.commit()
    index_writer.close()
    # Stop the progress ticker once the writer has been committed and closed.
    progress.tick = False
    print('done')
# NOTE(review): the next three statements are the tail of a method whose `def`
# lies before this chunk; they unpack the stored 'bestbets' field of a doc.
i = doc.getValues('bestbets')
self.bets = []
if i:
    self.bets = i[0].split()


def do_query(property, qstring, limit = 10):
    """Search the index field *property* for the terms of *qstring*.

    The query string is run through the module-level analyzer's token
    stream; each produced term becomes a SHOULD-clause TermQuery, so a hit
    needs to match at least one term.  Returns at most *limit* Documents.

    NOTE(review): the parameter name `property` shadows the builtin.
    """
    query = BooleanQuery()
    stream = analyzer.tokenStream(property, StringReader(qstring))
    stream.reset()
    attr = stream.getAttribute(CharTermAttribute)
    while stream.incrementToken():
        term = attr.toString()
        termQuery = TermQuery(Term(property, term))
        query.add(termQuery, Occur.SHOULD)
    hits = searcher.search(query, None, limit).scoreDocs
    return [Document(searcher.doc(hit.doc)) for hit in hits]


# Module-level search setup shared by do_query: open the on-disk index
# read-only and build a searcher over it.
path = 'lucene-ix'
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
directory = NIOFSDirectory.open(File(path))
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(reader)

if __name__ == '__main__':
    # Treat all command-line arguments as one query string.
    value = ' '.join(sys.argv[1 : ])
    for doc in do_query('indicators', value):
        print doc.title
Must be run with Jython.
'''
import movielib
from java.io import File
from org.apache.lucene.util import Version
from org.apache.lucene.store import NIOFSDirectory
from org.apache.lucene.index import IndexWriterConfig, IndexWriter
from org.apache.lucene.document import Field, Document
from org.apache.lucene.analysis.standard import StandardAnalyzer

# --- OPEN SEARCH INDEX
# CREATE mode: any existing index at `path` is overwritten.
path = "lucene-ix"
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
directory = NIOFSDirectory.open(File(path))
cfg = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iwriter = IndexWriter(directory, cfg)

# --- START INDEXING
print 'Loading metadata'
# movieid -> (title, categories), as provided by the movielib helper module.
movies = {}
for (movieid, title, cats) in movielib.get_movies():
    movies[movieid] = (title, cats)

print 'Loading best bets'
bets = {}
for line in open('best-bets.txt'):
    row = line.split()
    # NOTE(review): `id` shadows the builtin; the loop body continues past
    # this chunk (presumably filling `bets` keyed by this id — confirm).
    id = row[0]
class Worker(Process):
    """This class represents the lucene worker object, which will run
    concurrently with other workers, and contains the actual lucene logic."""

    def __init__(self, config, queue, ret):
        # queue: inbound (request_id, payload) pairs; ret: shared mapping
        # that run() fills with one response per request id.
        Process.__init__(self)
        self.config = config
        self.connection = None
        self.channel = None
        self.queue = queue
        self.ret = ret

    def run(self):
        # Process entry point: boot the JVM, build the shared field types,
        # then serve requests from the queue forever.
        print "Booting lucene driver worker...."
        lucene.initVM()
        # fieldType1: indexed + tokenized, not stored -> searchable content fields.
        self.fieldType1 = FieldType()
        self.fieldType1.setIndexed(True)
        self.fieldType1.setStored(False)
        self.fieldType1.setTokenized(True)
        # fieldType2: indexed + stored, not tokenized -> exact-match/id fields.
        self.fieldType2 = FieldType()
        self.fieldType2.setIndexed(True)
        self.fieldType2.setStored(True)
        self.fieldType2.setTokenized(False)
        while(True):
            data = self.queue.get()
            da = data[1]
            response = None
            try:
                self.fil = File(da['data']['indexdir'])
                self.d = NIOFSDirectory(self.fil)
                self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
                self.conf = IndexWriterConfig(
                    Version.LUCENE_CURRENT, self.analyzer)
                # Dispatch to the method named by the request, e.g.
                # da['action'] == 'index' calls self.index(da['data']).
                response = getattr(self, da['action'])(da['data'])
                # NOTE(review): self.d is closed only on success — the
                # directory handle leaks whenever the action raises.
                self.d.close()
            except Exception as e:
                print e
            if response is None:
                response = {}
            self.ret[data[0]] = response

    def rebuildIndex(self, data):
        """Recreate the index from scratch (CREATE mode) from data['records']."""
        writer = IndexWriter(
            self.d,
            self.conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE))
        for record in data['records']:
            doc = self.buildDocument(data['fields'], record)
            writer.addDocument(doc)
        writer.commit()
        writer.close()

    def buildDocument(self, fields, record):
        """Turn *record* (a dict) into a Lucene Document.

        record['_id'] is stored under the field name "id" (stored,
        untokenized); every listed field becomes a tokenized, unstored
        field, with nested dicts flattened via dictToFields.
        """
        doc = Document()
        doc.add(
            Field("id", record["_id"], self.fieldType2))
        for field in fields:
            if isinstance(record[field], dict):
                self.dictToFields(doc, record[field])
            else:
                doc.add(
                    Field(field, record[field], self.fieldType1))
        return doc

    def dictToFields(self, doc, record):
        """Recursively flatten nested dicts into content fields on *doc*.

        NOTE(review): only the innermost key is kept as the field name, so
        sibling dicts with identical keys collide.
        """
        for key in record:
            if isinstance(record[key], dict):
                self.dictToFields(doc, record[key])
            else:
                doc.add(
                    Field(key, record[key], self.fieldType1))

    def index(self, data):
        """Append one document built from data['record'] to the index."""
        writer = IndexWriter(
            self.d, self.conf)
        doc = self.buildDocument(data['fields'], data['record'])
        writer.addDocument(doc)
        writer.commit()
        writer.close()

    def updateindex(self, data):
        """Replace the document whose _id matches data['record']['_id'].

        NOTE(review): buildDocument stores the id under field name "id",
        but the Term here targets "_id" — the updateDocument match likely
        never fires; confirm against the indexed schema.
        NOTE(review): writer.optimize() only exists in older Lucene
        releases (removed in 4.x) — confirm the deployed version.
        """
        writer = IndexWriter(
            self.d, self.conf)
        doc = self.buildDocument(data['fields'], data['record'])
        writer.updateDocument(lucene.Term("_id", data['record']['_id']), doc)
        writer.optimize()
        writer.close()

    def removeindex(self, data):
        """Delete the document whose _id matches data['record']['_id'].

        NOTE(review): same "id" vs "_id" field-name mismatch as
        updateindex — the delete may match nothing.
        """
        writer = IndexWriter(
            self.d, self.conf)
        writer.deleteDocuments(lucene.Term("_id", data['record']['_id']))
        writer.optimize()
        writer.close()

    def query(self, data):
        """Run data['query'] against the index and collect all hits.

        Returns {'totalHits': n, 'hits': {doc_id: {field: value, ...,
        'score': s}}}, or None (normalized to {} by run()) when the index
        directory does not exist.

        NOTE(review): parses with Version.LUCENE_30 while the rest of the
        class uses Version.LUCENE_CURRENT — confirm this is intentional.
        """
        if self.fil.exists():
            searcher = IndexSearcher(DirectoryReader.open(self.d))
            query = QueryParser(
                Version.LUCENE_30, "id", self.analyzer).parse(
                    data['query'])
            hits = searcher.search(query, 100000)
            results = {}
            results['totalHits'] = hits.totalHits
            results['hits'] = {}
            for hit in hits.scoreDocs:
                record = {}
                doc = searcher.doc(hit.doc)
                fields = doc.getFields()
                record['score'] = hit.score
                for field in fields:
                    # The "id" field becomes the result key, not a value.
                    if field.name() != "id":
                        record[field.name()] = field.stringValue()
                results['hits'][doc.get('id')] = record
            searcher.getIndexReader().close()
            return results
help='qa data for evaluation', default='/home/xwhan/data/nq/nq-dev.txt') parser.add_argument('--topk', type=int, default=500) args = parser.parse_args() qas = [json.loads(line) for line in open(args.qa_data).readlines()][:1000] questions = [ _["question"][:-1] if _["question"].endswith("?") else _["question"] for _ in qas ] answers = [item["answer"] for item in qas] print("Loading Lucene Index ...") lucene.initVM(vmargs=['-Djava.aws.headless=true']) analyzer = StandardAnalyzer() searchDir = NIOFSDirectory(Paths.get(args.index_path)) searcher = IndexSearcher(DirectoryReader.open(searchDir)) # try tuning the hyperparameters of bm25 for k1 in [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]: for b in [0.5, 0.6, 0.7, 0.8, 0.9]: print(f"Grid search.... k1: {k1}; b: {b}") searcher.setSimilarity(BM25Similarity(k1, b)) parser = QueryParser('Context', analyzer) retrieved = [] print("Searching ...") for q in tqdm(questions):