def removeindex(self, data):
    writer = IndexWriter(self.d, self.conf)
    writer.deleteDocuments(lucene.Term("_id", data['record']['_id']))
    writer.optimize()
    writer.close()
def testDelete(self, fieldName, searchString):
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(self.dir, config)

    writer.deleteDocuments(Term(fieldName, searchString))

    writer.close()
def deleteRec(self, pid):
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    writer = IndexWriter(self.indexDir, config)
    writer.deleteDocuments(Term('uid', pid))
    writer.commit()
    writer.close()
    self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
    return
def delete(indexDir: str, id: str):
    index_dir = SimpleFSDirectory(Paths.get(indexDir))
    config = IndexWriterConfig(StandardAnalyzer())
    index_writer = IndexWriter(index_dir, config)

    # Delete the document whose 'id' term matches exactly, plus any documents
    # whose id starts with "<id>." (use a raw string so the backslash reaches
    # the regexp engine unescaped).
    delete_term_query = RegexpQuery(Term('id', id))
    delete_reg_query = RegexpQuery(Term('id', id + r'\..*'))

    index_writer.deleteDocuments(delete_term_query)
    index_writer.deleteDocuments(delete_reg_query)
    index_writer.commit()
    index_writer.close()
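# A minimal call sketch (not from the original source): assumes the PyLucene VM
# is already initialized and the imports used above are in scope. The index
# path and document id are made up for illustration; this removes the document
# whose 'id' is exactly "doc42" plus any ids like "doc42.1" or "doc42.abstract".
delete("/tmp/example-index", "doc42")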
def delete(collection_name, todelete, commit=False):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # set up writer configuration
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    # As of now, deletion of documents is supported only on indexed (primary) keys.
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separate out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # build a query that must match every primary key/value pair
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    writer.deleteDocuments(query)

    if commit:
        writer.commit()
    writer.close()
    return 0
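# A minimal call sketch (not from the original source): assumes the module-level
# INDEX_DIR_DEFAULT and primary_keys_map globals referenced above are already
# populated; the collection name and key below are made up for illustration.
result = delete("products", json.dumps({"product_id": "42"}), commit=True)
if result != 0:
    print("delete failed with error code", result)  # 100 = bad JSON input, 105 = index could not be opened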
def updateDeleteRec(self, pid1, pid2, personDB, familyDB, relationDB):
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    writer = IndexWriter(self.indexDir, config)
    mt = matchtext()

    writer.deleteDocuments(Term('uid', pid1))
    writer.deleteDocuments(Term('uid', pid2))

    p = personDB.find_one({'_id': pid1})
    matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
    doc = Document()
    doc.add(Field('uid', str(pid1), StringField.TYPE_STORED))
    doc.add(Field('sex', str(p['sex']), StringField.TYPE_STORED))
    doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
    doc.add(Field("text", mt.luceneFix(self.personText(p)), TextField.TYPE_NOT_STORED))
    writer.addDocument(doc)

    writer.commit()
    writer.close()
    self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
    return
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm, config)

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # search
    docDup = set()
    finalDup = {}

    for i in xrange(len(queries)):
        print 'process query %d' % (i)
        query = queries[i]
        querystr = stemSentence(query[3])

        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # find candidate results after 1st round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            if d['title'] in docDup:
                finalDup[d['title']] = d
                continue
            docDup.add(d['title'])

        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            title = d['title']
            if d['title'] in docDup:
                continue
            docDup.add(title)

            item = (mongoObj.conn_me).find_one({'title': title})
            if item is None:
                continue

            entitylist = item['entitylist'].split('|')
            for en_title in entitylist:
                if title == en_title:
                    continue
                t = Term('title', en_title)
                q = TermQuery(t)
                docs = searcher2.search(q, 2)
                if docs.totalHits <= 1:
                    continue
                docID2 = (docs.scoreDocs)[0].doc
                doc = searcher2.doc(docID2)
                finalDup[doc['title']] = doc

    print 'begin to clean index, there are %d dup records' % (len(finalDup))
    for title in finalDup:
        doc = finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract
        name = doc['name']
        value = doc['value']
        category = doc['category']
        skos_category = doc['skos_category']
        all_text = doc['all_text']
        raw_name = doc['raw_name']
        raw_value = doc['raw_value']
        abstract = doc['abstract']

        print 'process ' + title
        t = Term('title', title)
        q = TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w, title, name, value, category, skos_category, all_text, raw_name, raw_value, abstract)

    # process remaining records
    #global batch,cnt_batch
    #if cnt_batch>0:
    #    w.addDocuments(batch)
    #    cnt_batch=0
    #    del batch[:]
    w.close()
        continue

    # Delete existing abstracts. This is useful when adding
    # update files from Medline.
    try:
        assert 'pmid' in json_doc
        pmid_query = TermQuery(Term('pmid', json_doc['pmid']))
        pmcid_query = TermQuery(Term('pmcid', json_doc['pmcid']))
        id_query = IntPoint.newRangeQuery("id", json_doc['id'], json_doc['id'])
        bq = BooleanQuery.Builder()
        bq.add(pmid_query, BooleanClause.Occur.MUST)
        bq.add(pmcid_query, BooleanClause.Occur.MUST)
        bq.add(id_query, BooleanClause.Occur.MUST)
        q = bq.build()
        writer.deleteDocuments(q)

        # Add whole abstract.
        doc = Document()
        # Store field.
        doc.add(IntPoint('id', json_doc['id']))  # index
        doc.add(StoredField('id', json_doc['id']))  # store
        doc.add(StringField('pmid', json_doc['pmid'], Field.Store.YES))
        doc.add(StringField('pmcid', json_doc['pmcid'], Field.Store.YES))
        # Index only.
        doc.add(StringField('article_type', json_doc['article_type'], Field.Store.NO))
        doc.add(StringField('type', json_doc['type'], Field.Store.NO))
        doc.add(
class LuceneManager(object):

    def __init__(self, index_root_loc, index_subdir_name='.siftindex/index'):
        self.index_root_loc = index_root_loc
        self.index_subdir_name = index_subdir_name

    def __enter__(self):
        """
        Used by "with" statement. Like an "open" / "init" method.
        """
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        index_path = Path(self.index_root_loc).joinpath('%s/' % self.index_subdir_name)
        index_path.mkdir(parents=True, exist_ok=True)
        store = SimpleFSDirectory(Paths.get(str(index_path)))
        self.analyzer = StandardAnalyzer()
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # IndexWriter
        self.writer = IndexWriter(store, config)
        # IndexReader
        self.reader = DirectoryReader.open(self.writer)
        # IndexSearcher
        self.searcher = IndexSearcher(self.reader)
        return self

    def insert(self, document):
        self.writer.addDocument(document)
        return document['key']

    def delete(self, key):
        self.writer.deleteDocuments(Term('key', key))
        return key

    def delete_all(self):
        self.writer.deleteAll()

    def num_docs(self):
        return self.reader.numDocs()

    def update(self, key, document):
        # atomic delete and add
        self.writer.updateDocument(Term('key', key), document)
        return key

    def exists(self, key):
        boolean_query = BooleanQuery.Builder()
        boolean_query.add(TermQuery(Term('key', key)), BooleanClause.Occur.MUST)
        results = self.searcher.search(boolean_query.build(), 1)
        return results.totalHits > 0

    def commit(self):
        self.writer.commit()
        # make IndexReader reflect index updates
        # TODO: try IndexReader.isCurrent()
        new_reader = DirectoryReader.openIfChanged(self.reader)
        if new_reader is not None:
            self.reader.close()  # note: not thread safe, may need to revisit
            self.reader = new_reader
            self.searcher = IndexSearcher(self.reader)

    def _process_search_result(self, result, highlighter=None):
        docid = result.doc  # this is not a stable identifier
        # obtain document through an IndexReader
        doc = self.searcher.doc(docid)
        # doc.getFields() -> field.name(), field.stringValue()
        # use highlighter to extract relevant part of body
        highlighted_text = ''
        if highlighter:
            contents = doc['body']
            token_stream = self.analyzer.tokenStream('body', contents)
            n_fragments = 3
            fragment_separator = '...'
            highlighted_text = highlighter.getBestFragments(
                token_stream, contents, n_fragments, fragment_separator)
        return {
            'fullpath': doc['fullpath'],
            'last_modified_time': doc['last_modified_time'],
            'score': result.score,
            'excerpt': highlighted_text
        }

    def search(self, terms, n_hits=5):
        """
        Run search query.
        """
        # TODO: support date range queries
        # build query
        parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
        # parser.setDefaultOperator(QueryParser.Operator.AND)  # defaults to OR unless terms have modifier
        query = MultiFieldQueryParser.parse(parser, terms)  # https://stackoverflow.com/a/26853987/130164
        # create a highlighter
        highlighter = Highlighter(SimpleHTMLFormatter('*', '*'), QueryScorer(query))
        # execute search for top N hits
        return [
            self._process_search_result(result, highlighter)
            for result in self.searcher.search(query, n_hits).scoreDocs
        ]

    def get_all_docs(self, n_hits=1000):
        # debug method
        return [
            self._process_search_result(result)
            for result in self.searcher.search(MatchAllDocsQuery(), n_hits).scoreDocs
        ]

    def __exit__(self, type, value, traceback):
        """
        Used by the "with" statement. Handles close.
        TODO: error handling
        """
        self.writer.close()
        self.reader.close()

    def debug_analyzer(self, text):
        """
        Debug what StandardAnalyzer will give on this text.
        Ref: https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/analysis/package-summary.html
        Ref: pylucene tests --> test_Analyzers.py, BaseTokenStreamTestCase.py
        """
        token_stream = self.analyzer.tokenStream('field', text)
        termAtt = token_stream.getAttribute(CharTermAttribute.class_)
        token_stream.reset()
        tokens = []
        while token_stream.incrementToken():
            # tokens.append(token_stream.reflectAsString(True))
            tokens.append(termAtt.toString())
        token_stream.end()
        token_stream.close()
        return tokens
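# A minimal usage sketch (not from the original source): the index path and the
# example document values are made up, but the field names ('key', 'body',
# 'fullpath', 'last_modified_time') are the ones insert(), delete() and
# _process_search_result() above rely on.
import lucene
if lucene.getVMEnv() is None:
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
from org.apache.lucene.document import Document, Field, StringField, TextField

with LuceneManager('/tmp/siftindex-demo') as index:
    doc = Document()
    doc.add(StringField('key', 'doc-1', Field.Store.YES))
    doc.add(StringField('fullpath', '/tmp/notes.txt', Field.Store.YES))
    doc.add(StringField('last_modified_time', '2020-01-01T00:00:00', Field.Store.YES))
    doc.add(TextField('body', 'searchable text about lucene deletes', Field.Store.YES))
    index.insert(doc)
    index.commit()
    print(index.num_docs(), index.exists('doc-1'))

    index.delete('doc-1')   # queues the delete on the IndexWriter
    index.commit()          # makes it visible to the refreshed reader/searcher
    print(index.exists('doc-1'))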
def testDelete(self, fieldName, searchString):
    config = IndexWriterConfig(Version.LUCENE_CURRENT, self.getAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    writer = IndexWriter(self.dir, config)

    writer.deleteDocuments(Term(fieldName, searchString))

    writer.close()
class Index(object):

    def __init__(self, path, settings):
        self._settings = settings
        self._multithreaded = settings.multithreaded
        self._checker = DirectSpellChecker()
        indexDirectory = MMapDirectory(File(join(path, 'index')))
        indexDirectory.setUseUnmap(False)
        taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
        taxoDirectory.setUseUnmap(False)
        conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
        conf.setSimilarity(settings.similarity)
        mergePolicy = TieredMergePolicy()
        mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
        mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
        conf.setMergePolicy(mergePolicy)

        if not settings.readonly:
            self._indexWriter = IndexWriter(indexDirectory, conf)
            self._indexWriter.commit()
            self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
            self._taxoWriter.commit()

        self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
        self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper
        self._facetsConfig = settings.fieldRegistry.facetsConfig
        self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())

    def addDocument(self, term, document):
        document = self._facetsConfig.build(self._taxoWriter, document)
        self._indexWriter.updateDocument(term, document)

    def deleteDocument(self, term):
        self._indexWriter.deleteDocuments(term)

    def search(self, query, filter, collector):
        self._indexAndTaxonomy.searcher.search(query, filter, collector)

    def suggest(self, query, count, field):
        suggestions = {}
        for token, startOffset, endOffset in self._analyzeToken(query):
            suggestWords = self._checker.suggestSimilar(Term(field, token), count, self._indexAndTaxonomy.searcher.getIndexReader())
            if suggestWords:
                suggestions[token] = (startOffset, endOffset, [suggestWord.string for suggestWord in suggestWords])
        return suggestions

    def termsForField(self, field, prefix=None, limit=10, **kwargs):
        convert = lambda term: term.utf8ToString()
        terms = []
        termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
        if termsEnum is None:
            return terms
        iterator = termsEnum.iterator(None)
        if prefix:
            iterator.seekCeil(BytesRef(prefix))
            terms.append((iterator.docFreq(), convert(iterator.term())))
        bytesIterator = BytesRefIterator.cast_(iterator)
        try:
            while len(terms) < limit:
                term = convert(bytesIterator.next())
                if prefix and not term.startswith(prefix):
                    break
                terms.append((iterator.docFreq(), term))
        except StopIteration:
            pass
        return terms

    def fieldnames(self):
        indexAndTaxonomy = self._indexAndTaxonomy
        fieldnames = []
        fields = MultiFields.getFields(indexAndTaxonomy.searcher.getIndexReader())
        if fields is None:
            return fieldnames
        iterator = fields.iterator()
        while iterator.hasNext():
            fieldnames.append(iterator.next())
        return fieldnames

    def drilldownFieldnames(self, path=None, limit=50):
        taxoReader = self._indexAndTaxonomy.taxoReader
        parentOrdinal = TaxonomyReader.ROOT_ORDINAL if path is None else taxoReader.getOrdinal(path[0], path[1:])
        childrenIter = taxoReader.getChildren(parentOrdinal)
        names = []
        while True:
            ordinal = childrenIter.next()
            if ordinal == TaxonomyReader.INVALID_ORDINAL:
                break
            names.append(taxoReader.getPath(ordinal).components[-1])
            if len(names) >= limit:
                break
        return names

    def numDocs(self):
        return self._indexAndTaxonomy.searcher.getIndexReader().numDocs()

    def commit(self):
        if not self._settings.readonly:
            self._taxoWriter.commit()
            self._indexWriter.commit()
        self._indexAndTaxonomy.reopen()

    def getDocument(self, docId):
        return self._indexAndTaxonomy.searcher.doc(docId)

    def createFacetCollector(self):
        if not self._multithreaded:
            return FacetsCollector()
        return FacetSuperCollector(self._indexAndTaxonomy.taxoReader, self._facetsConfig, self._ordinalsReader)

    def facetResult(self, facetCollector):
        facetResult = TaxonomyFacetCounts(self._ordinalsReader, self._indexAndTaxonomy.taxoReader, self._facetsConfig, facetCollector)
        return Facets.cast_(facetResult)

    def close(self):
        self._indexAndTaxonomy.close()
        if not self._settings.readonly:
            self._taxoWriter.close()
            self._indexWriter.close()

    def _analyzeToken(self, token):
        result = []
        reader = StringReader(unicode(token))
        stda = self._settings.analyzer
        ts = stda.tokenStream("dummy field name", reader)
        termAtt = ts.addAttribute(CharTermAttribute.class_)
        offsetAtt = ts.addAttribute(OffsetAttribute.class_)
        try:
            ts.reset()
            while ts.incrementToken():
                result.append((termAtt.toString(), offsetAtt.startOffset(), offsetAtt.endOffset()))
            ts.end()
        finally:
            ts.close()
        return result
class LuceneHelper:

    def __init__(self, index_dir):
        self.index_dir = index_dir
        self.indexDir = SimpleFSDirectory(File(self.index_dir).toPath())
        self.q_parser = QueryParser("", WhitespaceAnalyzer())
        self.commit_max = 500000
        self.__get_writer_searcher()

    def __get_writer_searcher(self):
        writerConfig = IndexWriterConfig()
        print(f"Codec : {writerConfig.getCodec()}")
        self.writer = IndexWriter(self.indexDir, writerConfig)
        self.reader = DirectoryReader.open(self.writer)
        self.searcher = IndexSearcher(self.reader)

    def __query(self, query_str, _max=10):
        if self.searcher is None:
            return None
        query_cmd = self.q_parser.parse(query_str)
        hits = self.searcher.search(query_cmd, _max)
        print(f"Found {hits.totalHits} document(s) that matched query '{query_cmd}'")
        return hits

    def __count_docs(self, query_str):
        if self.searcher is None:
            return None
        query_cmd = self.q_parser.parse(query_str)
        total = self.searcher.count(query_cmd)
        print(f"Found {total} document(s) that matched query '{query_cmd}'")
        return total

    def refresh_searcher(self):
        self.reader.close()
        self.reader = DirectoryReader.open(self.indexDir)
        self.searcher = IndexSearcher(self.reader)

    def index_stats(self):
        query_str = "*:*"
        total_docs = self.__count_docs(query_str)
        if total_docs:
            print(f"There are at least [{total_docs}] docs in total.")
        else:
            print("There is no index right now.")

    def delete_old_ttl(self):
        now_time = int(time.time())
        # check how many docs have expired
        ttl_query = LongPoint.newRangeQuery("ttl", 0, now_time - 1)
        total_docs = self.searcher.count(ttl_query)
        print(f"Found at least {total_docs} expired document(s).")
        # delete expired docs
        self.writer.deleteDocuments(ttl_query)
        self.writer.commit()

    def add_doc(self, item_data):
        item_id = item_data['item_id']
        ttl = item_data['ttl']
        version = item_data.get('version', 'default')
        view_similar = json.dumps(item_data.get('view_similar', {}))
        view_prospective = json.dumps(item_data.get('view_prospective', {}))

        doc = Document()
        _id = hashlib.md5(f"{item_id}_{version}".encode('utf-8')).hexdigest()
        doc.add(StringField("id", _id, Field.Store.NO))
        doc.add(LongPoint("ttl", ttl))
        doc.add(StringField("version", version, Field.Store.YES))
        doc.add(StringField("item_id", item_id, Field.Store.YES))
        doc.add(StoredField("view_similar", view_similar))
        doc.add(StoredField("view_prospective", view_prospective))
        self.writer.updateDocument(Term("id", _id), doc)

    def commit(self):
        self.writer.commit()

    def close(self):
        self.writer.commit()
        self.reader.close()
        self.writer.close()
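# A minimal usage sketch (not from the original source): the index path and item
# payload are made up for illustration. 'item_id' and 'ttl' are the keys
# add_doc() above requires; the other keys are optional.
import time

helper = LuceneHelper("/tmp/ttl-index-demo")
helper.add_doc({
    "item_id": "item-001",
    "ttl": int(time.time()) - 60,   # already expired, so delete_old_ttl() will remove it
    "view_similar": {"item-002": 0.87},
})
helper.commit()
helper.refresh_searcher()
helper.index_stats()
helper.delete_old_ttl()             # deletes every doc whose ttl lies in the past
helper.close()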