def create_index(self, index_folder):
    os.mkdir(index_folder)

    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)

    self.t2 = FieldType()
    self.t2.setStored(True)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)

    fsDir = MMapDirectory(Paths.get(index_folder))
    writerConfig = IndexWriterConfig(
        MySimpleAnalyzer(CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
    writerConfig.setSimilarity(MyTFIDFSimilarity())
    writerConfig.setRAMBufferSizeMB(16384.0)  # 16 GB RAM buffer
    self.writer = IndexWriter(fsDir, writerConfig)
    logger.info(f"{self.writer.numDocs()} docs in index")

    logger.info("Indexing documents...")
    doc_ids = self.doc_db.get_doc_ids()
    for doc_id in tqdm(doc_ids, total=len(doc_ids)):
        text = self.doc_db.get_doc_text(doc_id)
        tokens = self.doc_db.get_doc_tokens(doc_id)
        self.add_doc(doc_id, text, tokens)
    logger.info(f"Indexed {self.writer.numDocs()} docs.")

    self.writer.forceMerge(1)  # to increase search performance
    self.writer.close()
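# The add_doc helper called above is not part of this snippet. A minimal
# sketch consistent with the three field types it creates; the field names
# and the mapping of field types to fields are assumptions, not the original code:
def add_doc(self, doc_id, text, tokens):
    from org.apache.lucene.document import Document, Field
    doc = Document()
    doc.add(Field("id", doc_id, self.t1))                 # indexed, doc ids only
    doc.add(Field("text", text, self.t2))                 # indexed with freqs and positions
    doc.add(Field("tokens", " ".join(tokens), self.t3))   # stored but not indexed
    self.writer.addDocument(doc)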
def publish_services(self, service_list):
    transformer = WSDLTransformer()
    current_document = 1
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(
        Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
    writerConfig.setSimilarity(BM25Similarity())
    index_writer = IndexWriter(indexDir, writerConfig)
    for wsdl in service_list:
        if self._document_expansion:
            # bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
            bag_of_words = ' '.join(
                self._semantic_transformer.transform(transformer.transform(wsdl)))
        else:
            # bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
            bag_of_words = ' '.join(transformer.transform(wsdl))
        doc = Document()
        doc.add(Field("content", bag_of_words,
                      Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
        index_writer.addDocument(doc)
        current_document += 1
    index_writer.close()
def __init__(self, path, settings):
    self._settings = settings
    self._multithreaded = settings.multithreaded
    self._checker = DirectSpellChecker()
    indexDirectory = MMapDirectory(File(join(path, 'index')))
    indexDirectory.setUseUnmap(False)
    taxoDirectory = MMapDirectory(File(join(path, 'taxo')))
    taxoDirectory.setUseUnmap(False)
    conf = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
    conf.setSimilarity(settings.similarity)
    mergePolicy = TieredMergePolicy()
    mergePolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
    mergePolicy.setSegmentsPerTier(settings.segmentsPerTier)
    conf.setMergePolicy(mergePolicy)

    if not settings.readonly:
        self._indexWriter = IndexWriter(indexDirectory, conf)
        self._indexWriter.commit()
        self._taxoWriter = DirectoryTaxonomyWriter(
            taxoDirectory,
            IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
            LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize))
        self._taxoWriter.commit()

    self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
    self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper
    self._facetsConfig = settings.fieldRegistry.facetsConfig
    self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())
def __init__(self, root, storedir, isindexing=False, isBM25=True):
    if not os.path.exists(storedir):
        os.mkdir(storedir)

    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

    if isindexing:
        store = SimpleFSDirectory(Paths.get(storedir))
        config = IndexWriterConfig(self.analyzer)
        # TODO: BM25 parameter tuning
        if isBM25:
            config.setSimilarity(BM25Similarity())
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        self.indexer(root, writer)

        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')

    search_dir = SimpleFSDirectory(Paths.get(storedir))
    self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
    if isBM25:
        self.searcher.setSimilarity(BM25Similarity())
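# Several snippets here rely on a small Ticker helper that prints progress
# dots while writer.commit() runs on the main thread. Its definition is not
# shown in these examples; the following is a minimal sketch modeled on the
# PyLucene IndexFiles sample (the names are inferred from the call sites:
# ticker.run as a thread target, ticker.tick = False to stop it).
import sys
import time

class Ticker(object):

    def __init__(self):
        self.tick = True

    def run(self):
        # Print a dot every second until the main thread clears self.tick.
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)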
def _get_writer(self, analyzer=None, create=False):
    config = IndexWriterConfig(Version.LUCENE_CURRENT, self._analyzer)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    if self._similarity is not None:
        config.setSimilarity(self._similarity)
    writer = IndexWriter(self._store, config)
    return writer
def __init__(self, indexDir, analyzer):
    lucene.initVM()
    logger.info("RAM index")
    writerConfig = IndexWriterConfig(analyzer)
    writerConfig.setSimilarity(mySimilarity())
    logger.debug('writer similarity func: {}'.format(writerConfig.getSimilarity()))
    writer = IndexWriter(indexDir, writerConfig)
    self.writer = writer
def __init__(self):
    indexDir = RAMDirectory()
    analyzer = SmartChineseAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    # create new directory, remove previously indexed documents
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writerConfig.setSimilarity(mySimilarity())
    logger.debug('search similarity: {}'.format(writerConfig.getSimilarity()))
    self.indexDir = indexDir
    self.writer = IndexWriter(indexDir, writerConfig)
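# Once documents are written, a searcher can be opened over the same
# RAMDirectory. A sketch of the read side (assumes an instance `idx` of the
# class above, and reuses its custom mySimilarity so indexing and search
# score consistently):
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher

idx.writer.commit()
reader = DirectoryReader.open(idx.indexDir)
searcher = IndexSearcher(reader)
searcher.setSimilarity(mySimilarity())  # must match the index-time similarity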
def __init__(self, lang):
    lucene.initVM()
    if lang == 'zh':
        logger.info("index directory: {}".format(config.IDX_SSQA))
        indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_SSQA)))
        analyzer = SmartChineseAnalyzer()
    else:
        raise ValueError('lang should be "zh" or "en", {} is invalid!'.format(lang))
    writerConfig = IndexWriterConfig(analyzer)
    writerConfig.setSimilarity(mySimilarity())
    logger.debug('writer similarity func: {}'.format(writerConfig.getSimilarity()))
    writer = IndexWriter(indexDir, writerConfig)
    self.writer = writer
def __init__(self, storeDir, similarity=None):
    """Constructor.

    storeDir -- path where to save the index
    """
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(Paths.get(storeDir))
    self.dir = store
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    if similarity is not None:
        config.setSimilarity(similarity)
    writer = IndexWriter(store, config)
    self.writer = writer
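# A typical call site for a constructor like the one above. The class name
# Indexer is hypothetical; lucene.initVM() must run before any Lucene class is
# touched, and BM25Similarity(k1, b) is one concrete similarity to pass in:
import lucene
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.search.similarities import BM25Similarity

lucene.initVM()
indexer = Indexer('/tmp/my-index', similarity=BM25Similarity(1.2, 0.75))
doc = Document()
doc.add(Field('content', 'hello lucene', TextField.TYPE_STORED))
indexer.writer.addDocument(doc)
indexer.writer.commit()
indexer.writer.close()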
def createIndexWriter(self, actual_dir, max_field_length=20000000):
    """Returns an IndexWriter object created for the actual_dir specified."""
    ensureDirExists(actual_dir)
    index = SimpleFSDirectory(File(actual_dir))
    analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
    writerConfig = IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer)
    similarity = FieldAgnosticSimilarity()
    writerConfig.setSimilarity(similarity)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    # res = IndexWriter(index, analyzer, True, IndexWriter.MaxFieldLength(max_field_length))
    res = IndexWriter(index, writerConfig)
    res.deleteAll()
    return res
def __init__(self, fileRoot, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    # Available similarities: BM25Similarity, MultiSimilarity,
    # PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity
    config.setSimilarity(similarities.BM25Similarity())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.indexDocs(fileRoot, writer)
    print('commit index')
    writer.commit()
    writer.close()
    print('done')
class Indexer(object):
    # Builds the index.
    def __init__(self, indexDir, doClear=True, computeLengthNorm=False):
        # if not jpype.isJVMStarted():
        #     lucene.initVM()
        lucene.getVMEnv().attachCurrentThread()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        # self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 100678)  # is here?
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setRAMBufferSizeMB(256.0)   # flush once the RAM buffer reaches 256 MB
        self.config.setMaxBufferedDocs(10000)   # flush after at most 10000 buffered docs
        if not computeLengthNorm:
            sim = CustomSimilarity()
            self.config.setSimilarity(sim)

        self.path = os.path.join(INDEX_PATH, indexDir)
        # print(self.path)
        # path.mkdir(self.path)
        # if doClear:
        #     self.clearExistingIndex()
        self.store = SimpleFSDirectory(File(self.path))
        self.writer = IndexWriter(self.store, self.config)

        self.t1 = FieldType()  # field type t1
        self.t1.setIndexed(True)
        self.t1.setStored(True)
        self.t1.setTokenized(False)
        self.t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.t2 = FieldType()  # field type t2
        self.t2.setIndexed(True)
        self.t2.setStored(False)
        self.t2.setTokenized(True)
        self.t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    def clearExistingIndex(self):
        # Deletes all files of an existing index.
        indexdir = self.path
        for thefile in os.listdir(indexdir):
            filepath = os.path.join(indexdir, thefile)
            try:
                if os.path.isfile(filepath):
                    os.unlink(filepath)
            except Exception as e:
                logger.error("Delete file %s failed: %s", filepath, str(e))
def __init__(self):
    wikidir = './wiki-pages-text'
    indexdir = './IndexFiles.index'
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    store = SimpleFSDirectory(Paths.get(indexdir))
    config = IndexWriterConfig(self.analyzer)
    config.setSimilarity(BM25Similarity())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.indexer(wikidir, writer)

    ticker = Ticker()
    print('commit index')
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
def __init__(self, storeDir, aWrapper):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    aWrapper = LimitTokenCountAnalyzer(aWrapper, 1048576)
    # BM25 with k1 = 2.0 and b = 0.75 (Lucene's defaults are k1 = 1.2, b = 0.75)
    bm25Sim = BM25Similarity(2.0, 0.75)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, aWrapper)
    config.setSimilarity(bm25Sim)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.indexTable(writer)
    ticker = Ticker()
    print('commit index')
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
class IndexingEngine():

    def __init__(self):
        self.mDocumentDirectory = settings.ADMINS_ENGINE.mDocumentDirectory
        self.mIndexDirectory = settings.ADMINS_ENGINE.mIndexDirectory
        self.mAnalyzers = settings.ADMINS_ENGINE.getIndexingAnalyzers()

        ############################# Writer Configuration #####################################
        map = HashMap()
        map.put('name', self.mAnalyzers['name'])
        map.put('parent', self.mAnalyzers['parent'])
        map.put('content', self.mAnalyzers['default'])
        map.put('id', self.mAnalyzers['id'])
        analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'], map)

        self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzerWrapper)
        self.mWriterConfig.setOpenMode(settings.ADMINS_ENGINE.mOpenMode)
        if settings.ADMINS_ENGINE.mSimilarity is not None:
            self.mWriterConfig.setSimilarity(settings.ADMINS_ENGINE.mSimilarity)
        ########################################################################################

        directory = SimpleFSDirectory(File(self.mIndexDirectory))
        self.mIndexWriter = IndexWriter(directory, self.mWriterConfig)

        ############################# FieldType Preparation #####################
        nameField = FieldType()
        nameField.setIndexed(True)
        nameField.setStored(True)
        nameField.setTokenized(True)
        nameField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

        parentField = FieldType()
        parentField.setIndexed(True)
        parentField.setStored(True)
        parentField.setTokenized(True)
        parentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

        contentField = FieldType()
        contentField.setIndexed(True)
        contentField.setStored(True)
        contentField.setTokenized(True)
        contentField.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

        idField = FieldType()
        idField.setIndexed(True)
        idField.setStored(True)
        idField.setTokenized(False)
        idField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)

        self.mFieldTypes = {
            'name': nameField,
            'parent': parentField,
            'content': contentField,
            'id': idField,
        }
        #######################################################################

        self.mLog = ""

    def indexing(self, root=settings.ADMINS_ENGINE.mDocumentDirectory,
                 parent=[], docID=1, parentID=0, id=0):
        realPath = os.path.abspath(root)
        for i in os.listdir(realPath):
            path = os.path.join(realPath, i)
            if os.path.isfile(path):
                # Index this file.
                doc = Document()
                doc.add(Field('name', ("%s %s" % (' '.join(parent), i)).strip(),
                              self.mFieldTypes['name']))
                doc.add(Field('parent', ' '.join(parent), self.mFieldTypes['parent']))
                doc.add(Field('id', str(docID), self.mFieldTypes['id']))
                doc.add(Field('parentID', str(parentID), self.mFieldTypes['id']))

                fd = open(path, 'r')
                content = fd.read()
                fd.close()
                if len(content) > 0:
                    doc.add(Field('content', content, self.mFieldTypes['content']))
                self.mIndexWriter.addDocument(doc)

                ##################### Logging ##############################
                if IS_DEBUG:
                    nameDebug = AnalyzerDebug.debug(
                        self.mAnalyzers['name'],
                        ("%s %s" % (' '.join(parent), i)).strip())
                    parentDebug = AnalyzerDebug.debug(
                        self.mAnalyzers['parent'], ' '.join(parent))
                    contentDebug = AnalyzerDebug.debug(
                        self.mAnalyzers['default'], content)
                    self.mLog = self.mLog + (
                        "File %s\n   {name - %s}: %s\n   {parent - %s}: %s\n   {content}: %s\n\n"
                        % (path, docID, nameDebug, parentID, parentDebug, contentDebug))

                docID = docID + 1

            ################### index sub commands
            if os.path.isdir(path + ".sub"):
                parent.append(i)
                docID = self.indexing(path + ".sub", parent, docID, docID - 1, id + 1)
                parent.pop()

        if id == 0:
            self.mIndexWriter.commit()
            self.mIndexWriter.close()
            if IS_DEBUG:
                loggingBot = LoggingBot(self.mLog,
                                        settings.ADMINS_ENGINE.getIndexingLogQueue())
                loggingBot.start()
                self.mLog = ""

        return docID
def __init__(self, lang, dataset, analyzer, index_path=None, data_path=None, ram_size=2048):
    """Returns scored documents in multiple languages.

    Parameters:
        dataset (str): ['mlqa_dev', 'mlqa_test', 'wiki']
        lang (str): ['en', 'es', 'de']
        analyzer (str): ['en', 'es', 'de', 'standard']
        ram_size (int): size of memory (MB) used while indexing
    """
    super().__init__()
    idxdir = self.get_index(lang, dataset, index_path)
    self.mlqa = True
    if dataset == 'mlqa_dev':
        self.dataset = MLQADataset('dev', lang, lang, data_path)
    elif dataset == 'mlqa_test':
        self.dataset = MLQADataset('test', lang, lang, data_path)
    elif dataset == 'wiki':
        self.mlqa = False
        self.dataset = Wiki(lang, data_path)
    else:
        raise RuntimeError("No dataloader for {}".format(dataset))

    # Stores index files; poor concurrency, try NIOFSDirectory instead.
    store = SimpleFSDirectory(Paths.get(idxdir))
    # Limit the max. number of tokens per document;
    # the analyzer will not consume more tokens than that.
    # analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

    # Configuration for the index writer.
    config = IndexWriterConfig(analyzers[analyzer]())
    # Creates or overwrites the index.
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    # Set similarity to BM25Similarity(k1=1.2, b=0.75).
    similarity = BM25Similarity(self.k1, self.b)
    config.setSimilarity(similarity)
    config.setRAMBufferSizeMB(float(ram_size))
    # Create the index writer.
    self.writer = IndexWriter(store, config)

    self.ftdata = FieldType()
    self.ftmeta = FieldType()
    # IndexSearcher will return the value of the field.
    self.ftdata.setStored(True)
    self.ftmeta.setStored(True)
    # Will be analyzed by the Analyzer.
    self.ftdata.setTokenized(True)
    self.ftmeta.setTokenized(False)
    # What information is indexed (probably DOCS would be sufficient):
    # DOCS: only documents are indexed; term frequencies and positions are
    #   omitted. Phrase and other positional queries on the field will throw
    #   an exception, and scoring will behave as if any term in the document
    #   appears only once.
    # DOCS_AND_FREQS: only documents and term frequencies are indexed;
    #   positions are omitted. This enables normal scoring, except Phrase and
    #   other positional queries will throw an exception.
    # DOCS_AND_FREQS_AND_POSITIONS: indexes documents, frequencies and
    #   positions. This is a typical default for full-text search: full
    #   scoring is enabled and positional queries are supported.
    self.ftdata.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    self.ftmeta.setIndexOptions(IndexOptions.DOCS)

    # Instantiate some reusable objects.
    # TODO: create the document and add its fields once, then change only
    # the field values and re-add the document.
    self.doc = Document()
    # The id field cannot be reused as-is because it has multiple values;
    # a list of fields can be kept, adding one whenever it is not enough.
    # self.fieldId = Field("id", "dummy", self.ftmeta)
    self.fieldTitle = Field("title", "dummy", self.ftdata)
    self.doc.add(self.fieldTitle)
    self.fieldContext = Field("context", "dummy", self.ftdata)
    self.doc.add(self.fieldContext)
    self.fieldIds = [Field("id", "dummy", self.ftmeta)]
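# The TODO above (reuse the Document and only swap field values) could be
# finished along these lines; setStringValue() is Lucene's setter for
# replacing a field's value in place. The method name add_instance and the
# handling of variable-length id lists are assumptions, not the original code:
def add_instance(self, title, context, ids):
    self.fieldTitle.setStringValue(title)
    self.fieldContext.setStringValue(context)
    # Grow the pool of reusable id fields if this instance carries more ids.
    while len(self.fieldIds) < len(ids):
        self.fieldIds.append(Field("id", "dummy", self.ftmeta))
    # Drop stale id fields from the document, then re-add the ones needed.
    self.doc.removeFields("id")
    for field, value in zip(self.fieldIds, ids):
        field.setStringValue(value)
        self.doc.add(field)
    self.writer.addDocument(self.doc)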