def updateindex(self, data):
    """Replace (or insert) the indexed document identified by its "_id".

    :param data: dict with keys 'fields' (field spec passed to
        buildDocument) and 'record' (the record itself; must contain '_id').

    Fix: the IndexWriter is now closed in a ``finally`` block so the index
    write lock is released even when buildDocument/updateDocument raises.
    """
    writer = IndexWriter(self.d, self.conf)
    try:
        doc = self.buildDocument(data['fields'], data['record'])
        # updateDocument = atomic delete-by-term + add, keyed on "_id".
        writer.updateDocument(lucene.Term("_id", data['record']['_id']), doc)
        # NOTE(review): optimize() is deprecated in modern Lucene and is an
        # expensive full merge — kept because this code targets the old API.
        writer.optimize()
    finally:
        writer.close()
class IndexBuilder(object):
    """Builds a Lucene index over structured documents carrying ".I", ".U",
    ".T" and optional ".M" / ".W" fields (OHSUMED-style records)."""

    def __init__(self, index_path, update=False):
        """Open an IndexWriter on `index_path`.

        :param index_path: filesystem path of the index directory.
        :param update: if True, append to / update an existing index
            (CREATE_OR_APPEND); otherwise overwrite it (CREATE).
        """
        # renamed from `dir`/`analyzer` locals: avoid shadowing builtin `dir`
        directory = FSDirectory.open(Paths.get(index_path))
        analyzer = StandardAnalyzer()
        iwc = IndexWriterConfig(analyzer)
        if update:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        else:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(directory, iwc)

    def index_docs(self, input_documents):
        """Index every document in `input_documents` and close the writer.

        Each document is a dict; ".I" and ".U" are stored verbatim (lowercased)
        and the concatenated ".M"/".T"/".W" text is tokenized into "text".
        """
        # Field type for the searchable "text" field — hoisted out of the
        # loop (fix: it was rebuilt for every document) since the same
        # configuration applies to all of them.
        text_type = FieldType()
        text_type.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
        text_type.setStored(True)
        text_type.setStoreTermVectors(True)
        text_type.setTokenized(True)
        for document in tqdm(input_documents, total=len(input_documents)):
            doc = Document()
            doc.add(StringField(".I", document[".I"].lower(), Field.Store.YES))
            doc.add(StringField(".U", document[".U"].lower(), Field.Store.YES))
            # Build the raw text: optional ".M" prefix, title ".T", optional
            # ".W" suffix.  This replaces four duplicated branches with two
            # independent conditions producing byte-identical strings.
            # NOTE(review): the original joined ".T" and ".W" with no space
            # between them; preserved for index compatibility — confirm it
            # is intentional.
            raw_text = document[".T"].lower()
            if ".M" in document:
                raw_text = document[".M"].lower() + " " + raw_text
            if ".W" in document:
                raw_text = raw_text + document[".W"].lower()
            doc.add(Field("text", " ".join(tokenizer.tokenize(raw_text)),
                          text_type))
            if self.writer.getConfig().getOpenMode(
            ) == IndexWriterConfig.OpenMode.CREATE:
                self.writer.addDocument(doc)
            else:
                # CREATE_OR_APPEND: replace any older version keyed by ".U".
                self.writer.updateDocument(Term(".U", document[".U"]), doc)
        self.writer.close()
def IndexDocs(self, documents):
    """
    Index documents under the directory

    :Parameters:
    - `documents`: Documents to be indexed (List)
    """
    # Fresh writer over the indexer's directory and analyzer.
    config = IndexWriterConfig(self.__analyzer)
    writer = IndexWriter(self.__indexDir, config)

    for entry in documents:
        # Assemble the Lucene document for this entry, one field at a time.
        doc = Document()
        doc.add(TextField(Indexer.NAME, entry['name'], Field.Store.YES))
        doc.add(Field(Indexer.CONTENT, entry['content'], self.__contentType))
        doc.add(StringField(Indexer.DATE, entry['date'], Field.Store.YES))
        doc.add(StringField(Indexer.URL, entry['url'], Field.Store.YES))
        doc.add(TextField(Indexer.TAGS, self.__qualifyTags(entry['tags']),
                          Field.Store.YES))
        doc.add(LongPoint(Indexer.TIMESTAMP,
                          self.__getTimestamp(entry['date'])))

        if self.__boAppend:
            # Existing index: an older copy of this document may already be
            # present, so replace the one matching the exact name.
            if self.__verbose:
                print("Updating " + entry['name'])
            writer.updateDocument(Term(Indexer.NAME, entry['name']), doc)
        else:
            # New index: nothing to replace, a plain add is sufficient.
            if self.__verbose:
                print("Adding " + entry['name'])
            writer.addDocument(doc)

    # Report what was written, then release the writer.
    print("Indexed %d documents (%d docs in index)" % (len(documents),
                                                       writer.numDocs()))
    writer.close()
def survey(request):
    """Django view: record a user's classification vote for an image.

    Reads the selected class ('survey') and document id ('imageID') from the
    POST, appends "<client-ip>:<class>" to the document's "Classification"
    field, and rewrites the document in the Lucene index (keyed by "fid").
    Redirects back to the index page on success; re-renders the form with an
    error message when no choice was made.
    """
    ipAddr = get_client_ip(request)
    instances = (Classes.objects.values_list('image_class_desc'))
    instances = [i[0] for i in instances]
    #cnt = len(instances)
    #lets get out choice
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    # The JVM is already started elsewhere; this thread only needs attaching.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    try:
        #image_class = image.objects.get(pk=request.POST['survey'])
        s = request.POST['survey']#get from post
    except (KeyError, Classes.DoesNotExist):
        return render(request, 'web/index.html',{
            'error_message': "You didn't select a choice.",
        })
    else:
        image_class = instances[int(s)]
        docNum = request.POST['imageID']#get document id
        doc = reader.document(int(docNum))
        fname = doc.get("filename")
        print(fname)
        #SimpleFSDirectory(File(location)).clearLock(IndexWriter.WRITE_LOCK_NAME);
        fileClassField = doc.get("Classification")
        if str(fileClassField) == "None":#check if the field exists####NEED TO CHECK THIS
            fileClassField = str(ipAddr + ":" + image_class)#I think we must add an ip address to this
        else:
            # Existing votes: prefix the current IP and append the new class.
            fileClassField = str(ipAddr + ":" + fileClassField) + ", " + image_class
        #doc.removeField("Classification")
        #doc.add(StringField("Classification", fileClassField, Field.Store.YES))
        #t = doc.get("Classification")
        #reader.close()
        indexDir = SimpleFSDirectory(File(location))
        writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
        writer = IndexWriter(indexDir, writerConfig)
        # Stored documents cannot be edited in place: copy every field into a
        # new Document, swapping in the updated Classification value.
        fields = doc.getFields()#get all fields
        doc2 = Document()
        classificationFieldFlag = False
        for f in fields:
            field = Field.cast_(f)
            (k, v) = field.name(), field.stringValue()
            if k == "Classification":
                classificationFieldFlag = True
                field = StringField("Classification", fileClassField, Field.Store.YES)
                doc2.add(field)
            else:
                doc2.add(field)
        if classificationFieldFlag == False:#this does not exist in the document must add
            doc2.add(StringField("Classification", fileClassField, Field.Store.YES))
            # doc2.add(StringField("Classification", fileClassField, Field.Store.YES))
            # doc2.add(StringField("fid", doc.get("fid"), Field.Store.YES))
            # doc2.add(StringField("articleid", doc.get("articleid"), Field.Store.YES))
            # doc2.add(StringField("caption", doc.get("caption"), Field.Store.YES))
            # doc2.add(StringField("figureid", doc.get("figureid"), Field.Store.YES))
            # doc2.add(StringField("filename", doc.get("filename"), Field.Store.YES))
            # doc2.add(StringField("filepath", doc.get("filepath"), Field.Store.YES))
            # doc2.add(StringField("label", doc.get("label"), Field.Store.YES))
        #writer.updateDocument(Term("fid","f000000000023"), doc2)#If field exists update
        writer.updateDocument(Term("fid", doc.get("fid")), doc2)#If field exists update
        writer.commit();
        #writer.optimize()
        writer.close()
        #writer.unlock(SimpleFSDirectory(File(location)))
        return HttpResponseRedirect(reverse('web:index', args=()))
Berlin - "Gentrifizierung, die: Aufwertung eines Stadtteils durch dessen Sanierung oder Umbau mit der Folge, dass die dort ansässige Bevölkerung durch wohlhabendere Bevölkerungsschichten verdrängt wird." So beschreibt der Duden ein Phänomen, das vor allem in Großstädten zu beobachten ist: Viele Menschen ziehen in günstige Wohnviertel, durch die Nachfrage steigen die Preise, bis die Mieten schließlich nur noch für die Wohlhabenderen bezahlbar sind - und die Künstler, Studenten, Geringverdiener weichen müssen. In der Weihnachtszeit hat nun ein Fall aus Berlin für Aufsehen gesorgt, der von einigen als Beispiel für Gentrifizierung genannt wird. Auf Facebook hat ein Nutzer die aktuelle Infobroschüre einer evangelischen Kirchengemeinde in Berlin-Mitte gepostet, Seite 25, Rubrik "Taufen". 29 Namen von Kindern und Erwachsenen sind dort zu lesen, darunter die folgenden: Viktor Paul Theodor, Ada Mai Helene, Rufus Oliver Friedrich, Cäcilie Helene, Edvard Neo, Freya Luise Apollonia, Frederick Theodor Heinrich, Leonore Anna Maria Chiara Helena. Viele der Nachnamen lassen zudem auf einen adligen Hintergrund schließen. "Das Comeback alter Adelsgeschlechter in Berlin-Mitte = Gentrifizierung im eigentlichen Sinne", lautet ein Kommentar unter dem Facebook-Foto. "In Dresden Gorbitz sähe die Liste anders aus", schreibt ein anderer Nutzer. Ein dritter fasst zusammen: "So heißt also die Gentrifizierung." Im "Gentrification Blog", betrieben von einem wissenschaftlichen Mitarbeiter der Berliner Humboldt-Universität, ist vor kurzem ein Beitrag zu der Taufliste erschienen: "Berlin: Am Taufbecken der Gentrification - Kirche im Aufwertungsgebiet" heißt er. Die Liste lese sich "wie eine Mischung aus FDP-Wahlliste für das Europaparlament und dem Verzeichnis der höheren Beamten des Diplomatischen Dienstes", heißt es in dem Artikel. 
Und: "Der Wortsinn der Gentrification - der ja auf die Wiederkehr des niederen Landadels (der Gentry) in den Städten anspielt - bekommt hier jedenfalls einen unerwarteten Realitätsgehalt." Zu dem Artikel stellte der Autor eine Taufliste aus dem Jahr 2007 aus einer Gemeinde im benachbarten Stadtteil Prenzlauer Berg. Darauf sind unter anderem diese Namen zu finden: Ruby, Matteo, Iwan, Lennart, Emilia, Annabelle, Andreas, Anke. Jene Kirchgemeinde, in der die mondänen Namen zur Taufe aufgeführt sind, listet auch die Verstorbenen auf. Zwei sind es in der aktuellen Infobroschüre. Nzitu. Und: Herbert."""
# Test driver: index the sample article above into a throwaway index.
# Start the JVM for PyLucene (must happen before any Lucene call).
lucene.initVM()
# language processor and storage
analyzer = PorterStemmerAnalyzer(Version.LUCENE_CURRENT)
store = SimpleFSDirectory(File('./data-test/'))
# writes data to the index
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer, overwrite=True)
writer = IndexWriter(store, config)
# add Document
doc = Document()
doc.add(Field('content', test_text, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field('url', "http://test.com", Field.Store.YES, Field.Index.NOT_ANALYZED))
# updateDocument keyed by url: re-running the script replaces, not duplicates.
writer.updateDocument(Term("url", "http://test.com"), doc)
writer.commit()
writer.close()
class LuceneManager(object):
    """Context-managed wrapper around one Lucene index.

    Owns an IndexWriter plus a near-real-time DirectoryReader/IndexSearcher
    opened on that writer.  Use as ``with LuceneManager(root) as mgr: ...``;
    the JVM, writer, reader and searcher are created in __enter__ and
    released in __exit__.
    """

    def __init__(self, index_root_loc, index_subdir_name='.siftindex/index'):
        # Only record the paths here; all heavy setup is deferred to __enter__.
        self.index_root_loc = index_root_loc
        self.index_subdir_name = index_subdir_name

    def __enter__(self):
        """
        Used by "with" statement. Like an "open" / "init" method.
        """
        # Start the JVM exactly once per process (initVM must not run twice).
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        index_path = Path(self.index_root_loc).joinpath('%s/' % self.index_subdir_name)
        index_path.mkdir(parents=True, exist_ok=True)
        store = SimpleFSDirectory(Paths.get(str(index_path)))
        self.analyzer = StandardAnalyzer()
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # IndexWriter
        self.writer = IndexWriter(store, config)
        # IndexReader -- opened on the writer, i.e. a near-real-time reader.
        self.reader = DirectoryReader.open(self.writer)
        # IndexSearcher
        self.searcher = IndexSearcher(self.reader)
        return self

    def insert(self, document):
        """Add `document` to the index; returns the document's 'key' value."""
        self.writer.addDocument(document)
        return document['key']

    def delete(self, key):
        """Delete every document whose 'key' term equals `key`."""
        self.writer.deleteDocuments(Term('key', key))
        return key

    def delete_all(self):
        # Removes all documents (still requires a commit to become durable).
        self.writer.deleteAll()

    def num_docs(self):
        # Count as seen by the current (possibly stale) reader snapshot.
        return self.reader.numDocs()

    def update(self, key, document):
        # atomic delete and add
        self.writer.updateDocument(Term('key', key), document)
        return key

    def exists(self, key):
        """True when at least one document with this exact 'key' is visible
        to the current searcher (i.e. after the relevant commit())."""
        boolean_query = BooleanQuery.Builder()
        boolean_query.add(TermQuery(Term('key', key)), BooleanClause.Occur.MUST)
        results = self.searcher.search(boolean_query.build(), 1)
        # NOTE(review): assumes totalHits is int-like; on newer Lucene it is
        # a TotalHits object (needs .value) — confirm the Lucene version.
        return results.totalHits > 0

    def commit(self):
        """Commit writer changes and refresh the reader/searcher to see them."""
        self.writer.commit()
        # make IndexReader reflect index updates
        # TODO: try IndexReader.isCurrent()
        new_reader = DirectoryReader.openIfChanged(self.reader)
        if new_reader is not None:
            self.reader.close()
            # note: not thread safe, may need to revisit
            self.reader = new_reader
            self.searcher = IndexSearcher(self.reader)

    def _process_search_result(self, result, highlighter=None):
        """Convert one ScoreDoc into a plain dict; optionally add an excerpt."""
        docid = result.doc  # this is not a stable identifier
        # obtain document through an IndexReader
        doc = self.searcher.doc(docid)
        # doc.getFields() -> field.name(), field.stringValue()
        # use highlighter to extract relevant part of body
        highlighted_text = ''
        if highlighter:
            contents = doc['body']
            token_stream = self.analyzer.tokenStream('body', contents)
            n_fragments = 3
            fragment_separator = '...'
            highlighted_text = highlighter.getBestFragments(
                token_stream, contents, n_fragments, fragment_separator)
        return {
            'fullpath': doc['fullpath'],
            'last_modified_time': doc['last_modified_time'],
            'score': result.score,
            'excerpt': highlighted_text
        }

    def search(self, terms, n_hits=5):
        """ Run search query over 'fullpath' and 'body'; returns result dicts. """
        # TODO: support date range queries
        # build query
        parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
        #parser.setDefaultOperator(QueryParser.Operator.AND) # defaults to OR unless terms have modifier
        query = MultiFieldQueryParser.parse(
            parser, terms)  # https://stackoverflow.com/a/26853987/130164
        # create a highlighter
        highlighter = Highlighter(SimpleHTMLFormatter('*', '*'),
                                  QueryScorer(query))
        # execute search for top N hits
        return [
            self._process_search_result(result, highlighter)
            for result in self.searcher.search(query, n_hits).scoreDocs
        ]

    def get_all_docs(self, n_hits=1000):
        # debug method: dump up to n_hits documents without highlighting
        return [
            self._process_search_result(result)
            for result in self.searcher.search(MatchAllDocsQuery(),
                                               n_hits).scoreDocs
        ]

    def __exit__(self, type, value, traceback):
        """ Used by the "with" statement. Handles close. TODO: error handling """
        self.writer.close()
        self.reader.close()

    def debug_analyzer(self, text):
        """ Debug what StandardAnalyzer will give on this text.

        Returns the list of token strings produced for `text`.
        Ref: https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/analysis/package-summary.html
        Ref: pylucene tests --> test_Analyzers.py, BaseTokenStreamTestCase.py
        """
        token_stream = self.analyzer.tokenStream('field', text)
        termAtt = token_stream.getAttribute(CharTermAttribute.class_)
        token_stream.reset()
        tokens = []
        while token_stream.incrementToken():
            #tokens.append(token_stream.reflectAsString(True))
            tokens.append(termAtt.toString())
        token_stream.end()
        token_stream.close()
        return tokens
class LuceneIndexer(object):
    """Indexes news articles into a Lucene index (Python 2 / old Field.Index
    API) and runs simple content searches against it."""

    def __init__(self):
        lucene.initVM()
        # language processor and storage
        self.analyzer = GermanAnalyzer(Version.LUCENE_CURRENT)
        self.store = SimpleFSDirectory(File('./../Lucene/data/'))
        # writes data to the index
        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer, overwrite=True)
        self.writer = IndexWriter(self.store, config)

    def add_article(self, article):
        """Add (or replace, keyed by url) one article object; expects
        .title/.description/.keywords/.content/.url plus optional
        .date/.last_modified/.images attributes."""
        # constructing a document
        doc = Document()
        # Title and description are boosted so matches there rank higher.
        title = Field('title', article.title, Field.Store.YES, Field.Index.ANALYZED)
        title.setBoost(10.0)
        doc.add(title)
        description = Field('description', article.description, Field.Store.YES, Field.Index.ANALYZED)
        description.setBoost(5.0)
        doc.add(description)
        doc.add(Field('keywords', article.keywords, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article.content, Field.Store.YES, Field.Index.ANALYZED))
        if article.date:
            doc.add(Field('date', article.date, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if article.last_modified:
            doc.add(Field('last_modified', article.last_modified, Field.Store.YES, Field.Index.NOT_ANALYZED))
        if article.images:
            # Only the first image (url, text) pair is indexed.
            doc.add(Field('image_url', article.images[0][0], Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field('image_text', article.images[0][1], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field('url', article.url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        # creates document or updates if already exists
        self.writer.updateDocument(Term("url", article.url), doc)

    def write_to_file(self):
        # making changes permanent
        self.writer.commit()
        self.writer.close()

    def perform_search(self, searchterm):
        """Search 'content' with AND semantics and print the top-50 hits
        plus the elapsed query time."""
        # processing a query
        parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
        parser.setDefaultOperator(QueryParser.Operator.AND)
        query = parser.parse(searchterm)
        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))
        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start
        print scoreDocs
        print duration
class Index(object):
    """Lucene 4.10 index paired with a taxonomy (facet) index.

    Wraps an IndexWriter + DirectoryTaxonomyWriter (unless settings.readonly)
    and a combined reader/searcher (IndexAndTaxonomy).  Provides document
    CRUD, search, spell-suggestions, term/field enumeration and facet
    collection.
    """

    def __init__(self, path, settings):
        self._settings = settings
        self._multithreaded = settings.multithreaded
        self._checker = DirectSpellChecker()
        # Two memory-mapped directories: main index and taxonomy.
        # NOTE(review): unmap is disabled on both — presumably to avoid
        # crashes from buffers being unmapped while still referenced; confirm.
        indexDirectory = MMapDirectory(File(join(path, 'index')))
        indexDirectory.setUseUnmap(False)
        taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
        taxoDirectory.setUseUnmap(False)
        conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
        conf.setSimilarity(settings.similarity)
        mergePolicy = TieredMergePolicy()
        mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
        mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
        conf.setMergePolicy(mergePolicy)
        if not settings.readonly:
            # Immediate commits ensure both directories exist on disk before
            # the readers below open them.
            self._indexWriter = IndexWriter(indexDirectory, conf)
            self._indexWriter.commit()
            self._taxoWriter = DirectoryTaxonomyWriter(taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
            self._taxoWriter.commit()
        self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
        self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper
        self._facetsConfig = settings.fieldRegistry.facetsConfig
        self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())

    def addDocument(self, term, document):
        # Facet fields are rewritten into taxonomy ordinals before writing;
        # updateDocument makes this an upsert keyed by `term`.
        document = self._facetsConfig.build(self._taxoWriter, document)
        self._indexWriter.updateDocument(term, document)

    def deleteDocument(self, term):
        self._indexWriter.deleteDocuments(term)

    def search(self, query, filter, collector):
        self._indexAndTaxonomy.searcher.search(query, filter, collector)

    def suggest(self, query, count, field):
        """Spell-suggestions per analyzed token of `query`; returns
        {token: (startOffset, endOffset, [suggestions...])}."""
        suggestions = {}
        for token, startOffset, endOffset in self._analyzeToken(query):
            suggestWords = self._checker.suggestSimilar(Term(field, token), count, self._indexAndTaxonomy.searcher.getIndexReader())
            if suggestWords:
                suggestions[token] = (startOffset, endOffset, [suggestWord.string for suggestWord in suggestWords])
        return suggestions

    def termsForField(self, field, prefix=None, limit=10, **kwargs):
        """Enumerate up to `limit` (docFreq, term) pairs for `field`,
        optionally restricted to terms starting with `prefix`."""
        convert = lambda term: term.utf8ToString()
        terms = []
        termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
        if termsEnum is None:
            return terms
        iterator = termsEnum.iterator(None)
        if prefix:
            # Position the enum at the first term >= prefix and record it.
            iterator.seekCeil(BytesRef(prefix))
            terms.append((iterator.docFreq(), convert(iterator.term())))
        bytesIterator = BytesRefIterator.cast_(iterator)
        try:
            while len(terms) < limit:
                term = convert(bytesIterator.next())
                if prefix and not term.startswith(prefix):
                    break
                terms.append((iterator.docFreq(), term))
        except StopIteration:
            # BytesRefIterator signals exhaustion via StopIteration in JCC.
            pass
        return terms

    def fieldnames(self):
        """Return the names of all indexed fields."""
        indexAndTaxonomy = self._indexAndTaxonomy
        fieldnames = []
        fields = MultiFields.getFields(indexAndTaxonomy.searcher.getIndexReader())
        if fields is None:
            return fieldnames
        iterator = fields.iterator()
        while iterator.hasNext():
            fieldnames.append(iterator.next())
        return fieldnames

    def drilldownFieldnames(self, path=None, limit=50):
        """Return up to `limit` child category names under the taxonomy
        `path` (or under the root when path is None)."""
        taxoReader = self._indexAndTaxonomy.taxoReader
        parentOrdinal = TaxonomyReader.ROOT_ORDINAL if path is None else taxoReader.getOrdinal(path[0], path[1:])
        childrenIter = taxoReader.getChildren(parentOrdinal)
        names = []
        while True:
            ordinal = childrenIter.next()
            if ordinal == TaxonomyReader.INVALID_ORDINAL:
                break
            names.append(taxoReader.getPath(ordinal).components[-1])
            if len(names) >= limit:
                break
        return names

    def numDocs(self):
        return self._indexAndTaxonomy.searcher.getIndexReader().numDocs()

    def commit(self):
        # Writers exist only in read-write mode; reopen refreshes the readers.
        if not self._settings.readonly:
            self._taxoWriter.commit()
            self._indexWriter.commit()
        self._indexAndTaxonomy.reopen()

    def getDocument(self, docId):
        return self._indexAndTaxonomy.searcher.doc(docId)

    def createFacetCollector(self):
        if not self._multithreaded:
            return FacetsCollector()
        return FacetSuperCollector(self._indexAndTaxonomy.taxoReader, self._facetsConfig, self._ordinalsReader)

    def facetResult(self, facetCollector):
        facetResult = TaxonomyFacetCounts(self._ordinalsReader, self._indexAndTaxonomy.taxoReader, self._facetsConfig, facetCollector)
        return Facets.cast_(facetResult)

    def close(self):
        self._indexAndTaxonomy.close()
        if not self._settings.readonly:
            self._taxoWriter.close()
            self._indexWriter.close()

    def _analyzeToken(self, token):
        """Run `token` through the configured analyzer; returns a list of
        (termText, startOffset, endOffset) tuples."""
        result = []
        reader = StringReader(unicode(token))
        stda = self._settings.analyzer
        ts = stda.tokenStream("dummy field name", reader)
        termAtt = ts.addAttribute(CharTermAttribute.class_)
        offsetAtt = ts.addAttribute(OffsetAttribute.class_)
        try:
            ts.reset()
            while ts.incrementToken():
                result.append((termAtt.toString(), offsetAtt.startOffset(), offsetAtt.endOffset()))
            ts.end()
        finally:
            ts.close()
        return result
class ImageIndexer(object):
    """Given an image details the indexer will get all text files, lucene them
    for search and retrieval."""
    # sha1 key field: stored, untokenized, docs+freqs only
    hash_field = FieldType()
    hash_field.setStored(True)
    hash_field.setTokenized(False)
    hash_field.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # extracted full-text field: tokenized with positions, not stored
    text_field = FieldType()
    text_field.setStored(False)
    text_field.setTokenized(True)
    text_field.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    # shared MIME -> extension map, loaded once at class definition time
    mime_map = MimeMapper("/var/www/bcaw/conf/mimemap.conf")

    def __init__(self, store_dir):
        # Create the on-disk index directory if needed (Python 2 octal mode).
        self.store_dir = store_dir
        if not os.path.exists(store_dir):
            os.mkdir(store_dir, 0777)
        self.store = SimpleFSDirectory(Paths.get(store_dir))
        self.analyzer = StandardAnalyzer()
        # Cap tokens per field so a single huge file cannot blow up the index.
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(self.analyzer)
        self.writer = IndexWriter(self.store, self.config)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Release the writer (and its lock) when leaving the "with" block.
        self.writer.close()

    def index_text(self, sha1, full_text):
        """Index the full text and map it to the source sha1."""
        document = Document()
        document.add(Field("sha1", sha1, ImageIndexer.hash_field))
        if full_text:
            document.add(Field("full_text", full_text, ImageIndexer.text_field))
            # upsert keyed on sha1: re-indexing the same file replaces it
            self.writer.updateDocument(Term("sha1", sha1), document)
        else:
            logging.info("No text for sha1 %s", sha1)

    @classmethod
    def get_path_details(cls, temp_path, image_path):
        """Return the byte sequence and the full text for a given path.

        Text extraction failures are logged and mapped to "N/A" rather than
        raised, so indexing of the remaining files can continue.
        """
        byte_sequence = ByteSequence.from_path(temp_path)
        extension = map_mime_to_ext(byte_sequence.mime_type, cls.mime_map)
        logging.debug("Assessing MIME: %s EXTENSION %s SHA1:%s",
                      byte_sequence.mime_type, extension, byte_sequence.sha1)
        full_text = ""
        if extension is not None:
            try:
                logging.debug("Textract for SHA1 %s, extension map val %s",
                              byte_sequence.sha1, extension)
                full_text = process(temp_path, extension=extension, encoding='ascii',
                                    preserveLineBreaks=True)
            except ExtensionNotSupported as _:
                logging.exception("Textract extension not supported for ext %s", extension)
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
            except LookupError as _:
                logging.exception("Lookup error for encoding.")
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
            except UnicodeDecodeError as _:
                logging.exception("UnicodeDecodeError, problem with file encoding")
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
            except:
                # Deliberate catch-all: textract failures must never abort the
                # indexing run; they are logged and recorded as "N/A".
                logging.exception("Textract UNEXPECTEDLY failed for temp_file.")
                logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
                full_text = "N/A"
        return byte_sequence, full_text

    def index_path(self, temp_path, image_path):
        """Index the full text of the file and map it to the file's sha1 and
        return the derived ByteStream object and derived full text as a tuple."""
        byte_sequence, full_text = self.get_path_details(temp_path, image_path)
        if full_text:
            self.index_text(byte_sequence.sha1, full_text)
        return byte_sequence, full_text
class LuceneHelper:
    """Helper around one Lucene index: a shared IndexWriter plus a
    reader/searcher opened on that writer (near-real-time reads).

    Call refresh_searcher() to re-open the reader from the directory after
    commits made elsewhere; call close() when done.
    """

    def __init__(self, index_dir):
        """Open (or create) the index stored under `index_dir`."""
        self.index_dir = index_dir
        self.indexDir = SimpleFSDirectory(File(self.index_dir).toPath())
        # Empty default field: queries must use explicit "field:value" syntax.
        self.q_parser = QueryParser("", WhitespaceAnalyzer())
        self.commit_max = 500000
        self.__get_writer_searcher()

    def __get_writer_searcher(self):
        """Create the writer and a reader/searcher layered on top of it."""
        writerConfig = IndexWriterConfig()
        print(f"Codec : {writerConfig.getCodec()}")
        self.writer = IndexWriter(self.indexDir, writerConfig)
        self.reader = DirectoryReader.open(self.writer)
        self.searcher = IndexSearcher(self.reader)

    def __query(self, query_str, _max=10):
        """Parse `query_str` and return up to `_max` hits, or None if there
        is no searcher."""
        if self.searcher is None:
            return None
        query_cmd = self.q_parser.parse(query_str)
        hits = self.searcher.search(query_cmd, _max)
        print(
            f"Found {hits.totalHits} document(s) that matched query :'{query_cmd}'"
        )
        return hits

    def __count_docs(self, query_str):
        """Return the number of documents matching `query_str`, or None if
        there is no searcher."""
        if self.searcher is None:
            return None
        query_cmd = self.q_parser.parse(query_str)
        total = self.searcher.count(query_cmd)
        print(f"Found {total} document(s) that matched query :'{query_cmd}'")
        return total

    def refresh_searcher(self):
        """Re-open the reader from the directory so committed changes become
        visible to subsequent searches."""
        self.reader.close()
        self.reader = DirectoryReader.open(self.indexDir)
        self.searcher = IndexSearcher(self.reader)

    def index_stats(self):
        """Print a rough total-document count for the index."""
        # fix: was f"*:*" — an f-string with no placeholders
        query_str = "*:*"
        total_docs = self.__count_docs(query_str)
        if total_docs:
            print(f"There is at least total [{total_docs}] docs.")
        else:
            print("There is no index right now.")

    def delete_old_ttl(self):
        """Delete every document whose `ttl` value lies in the past, then
        commit the deletions."""
        now_time = int(time.time())
        # check how many docs expired
        ttl_query = LongPoint.newRangeQuery("ttl", 0, now_time - 1)
        total_docs = self.searcher.count(ttl_query)
        print(f"At least found {total_docs} document(s) are expired.")
        # delete expired docs
        self.writer.deleteDocuments(ttl_query)
        self.writer.commit()

    def add_doc(self, item_data):
        """Insert or replace one item.

        `item_data` must contain 'item_id' and 'ttl'; 'version',
        'view_similar' and 'view_prospective' are optional.  The document key
        is md5("<item_id>_<version>"), so re-adding the same item/version
        replaces the previous document instead of duplicating it.
        """
        item_id = item_data['item_id']
        ttl = item_data['ttl']
        version = item_data.get('version', 'default')
        view_similar = json.dumps(item_data.get('view_similar', {}))
        view_prospective = json.dumps(item_data.get('view_prospective', {}))
        doc = Document()
        _id = hashlib.md5(f"{item_id}_{version}".encode('utf-8')).hexdigest()
        doc.add(StringField("id", _id, Field.Store.NO))
        doc.add(LongPoint("ttl", ttl))
        doc.add(StringField("version", version, Field.Store.YES))
        doc.add(StringField("item_id", item_id, Field.Store.YES))
        doc.add(StoredField("view_similar", view_similar))
        doc.add(StoredField("view_prospective", view_prospective))
        self.writer.updateDocument(Term("id", _id), doc)

    def commit(self):
        """Flush pending writer changes to the directory."""
        self.writer.commit()

    def close(self):
        """Commit, then release reader and writer."""
        self.writer.commit()
        self.reader.close()
        self.writer.close()