def __init__(self, root, storeDir_good, storeDir_bad, analyzer):
    # Build two separate indices ("good" and "bad") from the files under
    # `root`; self.indexDocs (defined elsewhere) receives both writers and
    # decides where each document goes.
    if not os.path.exists(storeDir_good):
        os.mkdir(storeDir_good)
    if not os.path.exists(storeDir_bad):
        os.mkdir(storeDir_bad)
    store_good = SimpleFSDirectory(File(storeDir_good))
    store_bad = SimpleFSDirectory(File(storeDir_bad))
    # Cap tokens per field (1 << 20) so huge documents can't exhaust memory.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    # Each IndexWriter needs its own config instance, hence config/config1.
    # CREATE wipes any existing index in both directories.
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    config1 = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config1.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer_good = IndexWriter(store_good, config)
    writer_bad = IndexWriter(store_bad, config1)
    self.indexDocs(root, writer_good, writer_bad)
    # Ticker (defined elsewhere) runs on a background thread while the
    # commits block; it is stopped by clearing ticker.tick below.
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer_good.commit()
    writer_good.close()
    writer_bad.commit()
    writer_bad.close()
    ticker.tick = False
    print 'done'
def index(indexdir): lucene.initVM() indexDir = SimpleFSDirectory(File(indexdir)) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer()) writer = IndexWriter(indexDir, writerConfig) f = open('data/docid.documento-xml.txt') st = PorterStemmer() for i, line in enumerate(f.readlines()): id, xmltext = line.split('\t') xmltext = xmltext.rstrip('\n') xmldoc = minidom.parseString(xmltext) title = xmldoc.getElementsByTagName("TITLE") title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue authors = xmldoc.getElementsByTagName("AUTHORS") authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue abstract = xmldoc.getElementsByTagName("ABSTRACT") abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue doc = Document() doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED)) writer.addDocument(doc) print "indexed %s docs" % (i+1) writer.close()
def __init__(self, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) self.folders = { 'parsed_ctrip': ['source', 'location', 'introduction', 'score', 'img_list'], 'parsed_qunar': ['location', 'rank', 'score', 'time', 'introduction', 'img_list'], 'eic_mfw': ['location', 'introduction', 'img_list'] } self.special_tags = ['introduction'] self.files = self.__getAllPlaces() #self.readers = self.__constructReaders() self.indexDocs(writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False, is_bigram_cache_used=False, mongoObj=None):
    """Open a Lucene index for searching and, optionally, wire up the
    MongoDB-backed bigram tf/cf and mapping-probability caches."""
    if lucene_vm_flag == False:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True

    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    self.analyzer = SimpleAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searchers = [IndexSearcher(self.reader)]
    if similarity == 'BM25':
        self.searchers[0].setSimilarity(BM25Similarity())

    # Load bigram cache collections named after the index directory.
    self.is_bigram_cache_used = is_bigram_cache_used
    if is_bigram_cache_used == True:
        sep = '/' if self.index_dir.find('/') > -1 else '\\'
        index_name = self.index_dir.split(sep)[-1]
        self.index_name = index_name
        self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
        self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
        # Pick the wikipedia-augmented mapping cache when the field list asks for it.
        if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
            cache_suffix = '_mapping_prob_cache_with_wikipedia'
        else:
            cache_suffix = '_mapping_prob_cache'
        self.conn_mapping_prob_cache = mongoObj.db[index_name + cache_suffix]
def createIndexNoStopwords(texts, route, rebuild):
    """Index `texts` ({key: content}) into a Lucene index at `route`
    without any stopword filtering.

    rebuild=True wipes any existing index (OpenMode.CREATE); otherwise
    documents are appended (CREATE_OR_APPEND).

    Bug fix: "docName" now stores each document's real key instead of the
    constant string "doc", matching createIndexStopwords.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # An empty stop set: the StopAnalyzer still tokenizes but removes nothing.
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    for key in texts:
        doc = Document()
        doc.add(Field("docName", str(key), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
        iwriter.addDocument(doc)
    iwriter.close()
def __init__(self, indexDir):
    """Open `indexDir` for writing, rebuilding the index from scratch."""
    index_path = Paths.get(indexDir)
    self._dir = SimpleFSDirectory(index_path)
    writer_config = IndexWriterConfig(StandardAnalyzer())
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self._writer = IndexWriter(self._dir, writer_config)
def main(): try: indicesDestination = File(dest_path) analyzer = KeywordAnalyzer() porter_analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT)) a = {"code": porter_analyzer, "description": porter_analyzer, "typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(), "class_instance_creation": KeywordAnalyzer(), "id": KeywordAnalyzer(), "literals": porter_analyzer} wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a) config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer) writer = IndexWriter(SimpleFSDirectory(indicesDestination), config) counter = Counter() generate_indices_from_benchmark(writer, counter) writer.close() print "All jobs are done.." print str(counter) except CorruptIndexException as e: #when index is corrupt e.printStackTrace() except LockObtainFailedException as e: #when other writer is using the index e.printStackTrace() except IOException as e: #when directory can't be read/written e.printStackTrace()
def create_index(self, index_folder, docs_path, add_terms=False): os.mkdir(index_folder) self.t1 = FieldType() self.t1.setStored(True) self.t1.setIndexOptions(IndexOptions.DOCS) self.t2 = FieldType() self.t2.setStored(False) self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.t3 = FieldType() self.t3.setStored(True) self.t3.setIndexOptions(IndexOptions.NONE) fsDir = MMapDirectory(Paths.get(index_folder)) writerConfig = IndexWriterConfig(StandardAnalyzer()) self.writer = IndexWriter(fsDir, writerConfig) print "%d docs in index" % self.writer.numDocs() print "Indexing documents..." doc_id = 0 import corpus_hdf5 corpus = corpus_hdf5.CorpusHDF5(docs_path) for txt in corpus.get_text_iter(): title = corpus.get_article_title(doc_id) self.add_doc(doc_id, title, txt, add_terms) if doc_id % 1000 == 0: print 'indexing doc', doc_id doc_id += 1 print "Index of %d docs..." % self.writer.numDocs() self.writer.close()
def __init__(self, indexDir, doClear=True, computeLengthNorm=False):
    # Attach to the running JVM and open an IndexWriter under
    # INDEX_PATH/indexDir, preparing two reusable FieldTypes (t1, t2).
    # NOTE(review): `doClear` is currently unused — the clearExistingIndex()
    # call below is commented out; confirm whether it should be re-enabled.
    # if not jpype.isJVMStarted():
    #     lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 100678)#is here?
    self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    self.config.setRAMBufferSizeMB(256.0)  # auto-flush once buffered changes reach 256 MB of RAM
    self.config.setMaxBufferedDocs(10000)  # ... or once 10000 documents are buffered
    if not computeLengthNorm:
        # CustomSimilarity (defined elsewhere) — presumably disables length
        # normalization; confirm against its implementation.
        sim = CustomSimilarity()
        self.config.setSimilarity(sim)
    self.path = os.path.join(INDEX_PATH, indexDir)
    # print self.path
    # path.mkdir(self.path)
    # if doClear:
    #     self.clearExistingIndex()
    self.store = SimpleFSDirectory(File(self.path))
    self.writer = IndexWriter(self.store, self.config)
    # Field type t1: indexed, stored, NOT tokenized; docs+freqs postings.
    self.t1 = FieldType()
    self.t1.setIndexed(True)
    self.t1.setStored(True)
    self.t1.setTokenized(False)
    self.t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # Field type t2: indexed, NOT stored, tokenized; docs+freqs+positions.
    self.t2 = FieldType()
    self.t2.setIndexed(True)
    self.t2.setStored(False)
    self.t2.setTokenized(True)
    self.t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
def __init__(self, root, storeDir, analyzer):
    # Benchmark-style driver: first deletes documents (testDelete) and
    # commits, then adds documents (testAdd) with the SAME writer and
    # commits again. The elapsed time of the delete phase is recorded in
    # `end` using `start` (both presumably module-level — verify).
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    # Cap tokens per field to bound memory use.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    # CREATE_OR_APPEND keeps any existing index, so the delete phase has
    # documents to operate on.
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    # print "init done"
    writer = IndexWriter(store, config)
    self.testDelete(root, writer)
    ticker = Ticker()
    print 'commit index deletion',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    # writer.close()  -- deliberately left open: reused by the add phase below
    ticker.tick = False
    print 'done'
    end["delete"] = datetime.now() - start
    # writer = IndexWriter(store, config)
    self.testAdd(root, writer)
    ticker = Ticker()
    print 'commit index addition',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def __init__(self, root, storedir, isindexing=False, isBM25=True):
    """Set up a searcher over `storedir`; when `isindexing` is true the
    index is rebuilt from `root` first. `isBM25` switches both indexing
    and searching to BM25 similarity."""
    if not os.path.exists(storedir):
        os.mkdir(storedir)
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

    if isindexing:
        index_store = SimpleFSDirectory(Paths.get(storedir))
        writer_config = IndexWriterConfig(self.analyzer)
        # TODO BM25 parameter tuning
        if isBM25:
            writer_config.setSimilarity(BM25Similarity())
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_store, writer_config)
        self.indexer(root, writer)

        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')

    search_dir = SimpleFSDirectory(Paths.get(storedir))
    self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
    if isBM25:
        self.searcher.setSimilarity(BM25Similarity())
def __init__(self, root, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) if 1 == INDEX_MODE: # APPEND config.setOpenMode(IndexWriterConfig.OpenMode.APPEND) elif 2 == INDEX_MODE: # CREATE config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) else: # CREATE_OR_APPEND config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) # print "init done" writer = IndexWriter(store, config) # print "init 2 done" self.indexDocs(root, writer) ticker = Ticker() print '\ncommit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def main():
    # Build the Stack Overflow question index: Porter-stemmed default
    # analyzer with keyword/code analyzers for specific fields.
    try:
        print "Indexing..."
        indexDestination = File("/Users/Falcon/Desktop/New_Indices/Stack_Q_Indices")
        # writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        # KeywordAnalyzer: treats the field's entire text as a single token.
        a = {"typed_method_call": KeywordAnalyzer(),
             "extends": KeywordAnalyzer(),
             "used_classes": KeywordAnalyzer(),
             "methods": KeywordAnalyzer(),
             "class_instance_creation": KeywordAnalyzer(),
             "methods_called": KeywordAnalyzer(),
             "view_count": KeywordAnalyzer(),
             "code_hints": JavaCodeAnalyzer()}
        # PerFieldAnalyzerWrapper: assigns a different analyzer per field,
        # falling back to `analyzer` for fields not listed in `a`.
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        # Debug output for Lucene's indexing; the Luke tool can also be used
        # to inspect and manage the index.
        config.setInfoStream(System.out)
        writer = IndexWriter(SimpleFSDirectory(indexDestination), config)
        counter = Counter()
        index_code_snippet(writer, counter)
        writer.commit()
        writer.close()
        print "Done"
        print str(counter)
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
    except SQLException as e:  # when Database error occurs
        e.printStackTrace()
def createWriter(index_dir):
    """Return an IndexWriter over `index_dir` using the default
    IndexWriterConfig (and therefore the default codec, which is printed
    along with all available codecs)."""
    directory = SimpleFSDirectory(File(index_dir).toPath())
    config = IndexWriterConfig()
    print(Codec.availableCodecs())
    print(f"Codec : {config.getCodec()}")
    return IndexWriter(directory, config)
def __init__(self, root, storeDir, doIndex=False):
    """Hold a StandardAnalyzer and an IndexSearcher over `storeDir`;
    when `doIndex` is true, rebuild the index from `root` first."""
    self.analyzer = StandardAnalyzer()
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    if doIndex:
        index_store = SimpleFSDirectory(Paths.get(storeDir))
        capped = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        writer_config = IndexWriterConfig(capped)
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_store, writer_config)
        self.indexDocs(root, writer)

        ticker = Ticker()
        print("commit index")
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print("done")

    directory = SimpleFSDirectory(Paths.get(storeDir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
    """Open a Lucene index for writing.

    Parameters
    ----------
    index_dir : string
        Filesystem location of the Lucene index.
    mode : string
        'create' opens a new index, overwriting any existing one;
        'append' opens an existing index;
        'create_or_append' appends if `index_dir` exists, else creates.
    date_format : string
        Datetime fields are stored as strings formatted with this pattern.
    """
    # self.store = FSDirectory.open(File(index_dir))
    self.store = FSDirectory.open(Paths.get(index_dir))
    # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.analyzer = StandardAnalyzer()
    # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    self.config = IndexWriterConfig(self.analyzer)
    self.mode = mode
    self.date_format = date_format

    open_modes = {
        'create_or_append': IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
        'create': IndexWriterConfig.OpenMode.CREATE,
        'append': IndexWriterConfig.OpenMode.APPEND,
    }
    if mode not in open_modes:
        raise ValueError('Invalid mode %s', mode)
    self.config.setOpenMode(open_modes[mode])
    self.writer = IndexWriter(self.store, self.config)
def main(): INDEX_DIR = "indexes" try: print "Indexing..." indexDir = File("/Users/Raphael/Downloads/stackoverflow1107") #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED) analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT)) a = { "typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(), "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(), "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer() } wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a) config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer) writer = IndexWriter(SimpleFSDirectory(indexDir), config) index_code_snippet(writer) writer.commit() writer.close() print "Done" except CorruptIndexException as e: #when index is corrupt e.printStackTrace() except LockObtainFailedException as e: #when other writer is using the index e.printStackTrace() except IOException as e: #when directory can't be read/written e.printStackTrace() except SQLException as e: #when Database error occurs e.printStackTrace()
def create_index(storage, paths): lucene.initVM() indexDir = SimpleFSDirectory(File(storage)) stops = CharArraySet(Version.LUCENE_4_10_1, 0, True) for s in stopwords: stops.add(s) analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops) writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer) writer = IndexWriter(indexDir, writerConfig) print "%d docs in index" % writer.numDocs() print "Reading Documents" import os for path in paths: for filen in os.listdir(path): text = sent_tokenize(get_data_from_file(path + filen)) total_sent = len(text) for i in range(0, total_sent, 3): doc = Document() a = i - 5 if i - 5 > 0 else 0 sentence = ' '.join(text[a:i + 5]) doc.add( Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) print("Done %s" % (path + filen)) print "Indexed (%d docs in index)" % (writer.numDocs()) print "Closing index of %d docs..." % writer.numDocs() writer.close()
def main(): INDEX_DIR = "indexes" try: print "Indexing..." indexDir = File("/home/ubuntu/Desktop/CoCaBu_remote/GitSearch/Indices") #writer = IndexWriter(SimpleFSDirectory(indexDir), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED) analyzer = KeywordAnalyzer( ) #PorterAnalyzer( StandardAnalyzer(Version.LUCENE_CURRENT)) a = { "code": JavaCodeAnalyzer(), "comments": EnglishAnalyzer(Version.LUCENE_CURRENT) } wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a) config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer) writer = IndexWriter(SimpleFSDirectory(indexDir), config) index_code_snippet(writer) writer.close() except CorruptIndexException as e: #when index is corrupt e.printStackTrace() except LockObtainFailedException as e: #when other writer is using the index e.printStackTrace() except IOException as e: #when directory can't be read/written e.printStackTrace()
def build_index(): lucene.initVM() # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/' post_dir = '/Users/w3/data/github/codeif_backup' index_store_dir = current_app.config['INDEX_STORE_DIR'] print post_dir print index_store_dir analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) store = SimpleFSDirectory(File(index_store_dir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) indexDocs(post_dir, writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def main():
    # Index the dyclink 2014 projects: code and comment fields get
    # dedicated analyzers, everything else falls back to KeywordAnalyzer.
    try:
        print "Indexing starts..."
        # indicesDestination = File("/Users/Falcon/Desktop/dyclink_2014")############################################
        indicesDestination = File("/Indices/dyclink/2014")
        # KeywordAnalyzer treats the whole text as a single token
        # (effectively the same as not analyzing at all).
        analyzer = KeywordAnalyzer()
        # Per-field analyzer map (a Python dict) for PerFieldAnalyzerWrapper.
        a = {
            "code": JavaCodeAnalyzer(),
            "comments": EnglishAnalyzer(Version.LUCENE_CURRENT)
        }
        # http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PerFieldAnalyzerWrapper.py?revision=1757704&view=co
        wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer)
        writer = IndexWriter(SimpleFSDirectory(indicesDestination), config)
        counter = Counter()
        generate_indices_from_projects(writer, counter)
        writer.close()
        print "Done"
        print str(counter)
    except CorruptIndexException as e:  # when index is corrupt
        e.printStackTrace()
    except LockObtainFailedException as e:  # when other writer is using the index
        e.printStackTrace()
    except IOException as e:  # when directory can't be read/written
        e.printStackTrace()
def build_index(document_path, dir_path):
    """Index a music-tag listing into a fresh Lucene index at `dir_path`.

    Each line of `document_path` is "<music_path> <tag1,tag2,...>"; the
    tags become the searchable "content" field and the path is stored in
    the "url" field.
    """
    lucene.initVM()
    store = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    # Searchable, stored, tokenized field for the tag text.
    content_type = FieldType()
    content_type.setStored(True)
    content_type.setTokenized(True)
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # Stored, untokenized field for the file path.
    url_type = FieldType()
    url_type.setStored(True)
    url_type.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            fields = line.strip().split(" ")
            music_path = fields[0]
            music_tags = fields[1].split(",")
            doc = Document()
            doc.add(Field("content", " ".join(music_tags), content_type))
            doc.add(Field("url", music_path, url_type))
            writer.addDocument(doc)
    writer.close()
def createIndexStopwords(texts, route, rebuild):
    """Index `texts` ({key: content}) into a Lucene index at `route`,
    filtering an English+Spanish stopword list.

    rebuild=True wipes any existing index (OpenMode.CREATE); otherwise
    documents are appended (CREATE_OR_APPEND).

    Cleanups: removed the duplicate "los" from the stopword list (a set
    is built from it, so behavior is unchanged) and replaced the
    non-idiomatic key.__str__() with str(key).
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
        "there", "these", "they", "this", "to", "was", "will", "with", "el",
        "la", "lo", "los", "las", "ante", "con", "sin", "que", "es", "de",
        "en", "por", "y"
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    for key in texts:
        doc = Document()
        doc.add(Field("docName", str(key), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("content", texts[key], Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))
        iwriter.addDocument(doc)
    iwriter.close()
def __init__(self, index_store_path):
    """Open (creating if necessary) the index at `index_store_path`
    for appending."""
    directory = NIOFSDirectory(Paths.get(index_store_path))
    config = IndexWriterConfig(StandardAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    self.writer = IndexWriter(directory, config)
def getConfig(self, analyzer):
    """Return an IndexWriterConfig wired to a fresh MyDeletionPolicy,
    which is also kept on self.policy for later inspection."""
    self.policy = MyDeletionPolicy()
    cfg = IndexWriterConfig(analyzer)
    cfg.setIndexDeletionPolicy(self.policy)
    return cfg
def create_index(self, index_folder):
    """Create `index_folder` and index every document from self.doc_db,
    then force-merge to a single segment."""
    os.mkdir(index_folder)

    # t1: stored field with docs-only postings.
    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)

    # t2: stored field with full positional postings.
    self.t2 = FieldType()
    self.t2.setStored(True)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # t3: stored-only field, never indexed.
    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)

    fsDir = MMapDirectory(Paths.get(index_folder))
    stopword_set = CharArraySet(collections.JavaSet(utils.STOPWORDS), True)
    writer_config = IndexWriterConfig(MySimpleAnalyzer(stopword_set))
    writer_config.setSimilarity(MyTFIDFSimilarity())
    writer_config.setRAMBufferSizeMB(16384.0)  # 14g
    self.writer = IndexWriter(fsDir, writer_config)
    logger.info(f"{self.writer.numDocs()} docs in index")

    logger.info("Indexing documents...")
    doc_ids = self.doc_db.get_doc_ids()
    for doc_id in tqdm(doc_ids, total=len(doc_ids)):
        text = self.doc_db.get_doc_text(doc_id)
        tokens = self.doc_db.get_doc_tokens(doc_id)
        self.add_doc(doc_id, text, tokens)

    logger.info(f"Indexed {self.writer.numDocs()} docs.")
    self.writer.forceMerge(1)  # to increase search performance
    self.writer.close()
def main():
    """Back up the current directory's .py files into the index folder and
    build the Lucene index, unless the index already exists.

    Bug fix: on non-Windows systems the copy command used a Windows '\\'
    path separator and never created the destination directory; it now
    mirrors the working variant (mkdir -p + '/' separator).
    """
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        # JCC raises when a VM is already attached; keep best-effort behavior.
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(1024.0)

    # write data to index
    if not is_index_Exist:
        print('begin backup code files')
        if platform.system() == 'Windows':
            # robocopy creates the destination folder itself
            os.system('robocopy %s %s\\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR))
        else:
            os.system('mkdir -p %s/code_files' % (LUCENE_INDEX_DIR))
            os.system('cp -f *.py %s/code_files' % (LUCENE_INDEX_DIR))
        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
def main():
    LUCENE_INDEX_DIR = 'mmapDirectory/trec_v15_wikipedia_stemmed_v2'
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        print('JavaVM already running')

    index_already_there = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    config = IndexWriterConfig(StandardAnalyzer())
    # config=config.setRAMBufferSizeMB(1024.0) # experimental setting !!

    if index_already_there:
        print('index already exists, stop indexing')
        return

    # Back up the code files next to the index before building it.
    print('begin backup code files')
    if platform.system() == 'Windows':
        os.system('robocopy %s %s\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR))
    else:
        os.system('mkdir %s/code_files' % (LUCENE_INDEX_DIR))
        os.system('cp *.py %s/code_files' % (LUCENE_INDEX_DIR))

    writer = IndexWriter(index_mm, config)
    makeIndex(writer)
    writer.close()
def __init__(self, root, storeDir, analyzer): # Create the index dir if it does not exist if not os.path.exists(storeDir): os.mkdir(storeDir) # the SimpleFSDirectory which the index will be written in store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) # create a index writer # atach the index dir and config info to it writer = IndexWriter(store, config) # call the indexing procedure # indexing all the files in the directory specified by root # write the index with writer self.indexDocs(root, writer) # start a ticker ticker = Ticker() print 'commit index' threading.Thread(target=ticker.run).start() writer.commit() writer.close() # stop the ticker when the indexing procedure completes ticker.tick = False print 'Done'
def getWriter(self, store, analyzer=None, create=False):
    """Return an IndexWriter over `store`.

    Parameters:
        store: the Lucene Directory to write to.
        analyzer: analyzer for the writer; defaults to StandardAnalyzer
            when None.
        create: when True, wipe any existing index (OpenMode.CREATE);
            when False, open the existing index or create a new one
            (CREATE_OR_APPEND).

    Bug fix: `create` was previously accepted but ignored — the open
    mode was unconditionally CREATE, silently wiping existing indexes.
    """
    if analyzer is None:
        analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(store, config)
    return writer