def __init__(self, path, settings):
    """Open the index and taxonomy directories under *path* and wire up the writers.

    :param path: base directory; its ``index`` and ``taxo`` sub-directories are
        memory-mapped.
    :param settings: configuration object. The attributes read here are
        ``multithreaded``, ``analyzer``, ``similarity``, ``maxMergeAtOnce``,
        ``segmentsPerTier``, ``readonly``, ``lruTaxonomyWriterCacheSize`` and
        ``fieldRegistry.facetsConfig``.
    """
    self._settings = settings
    self._multithreaded = settings.multithreaded
    self._checker = DirectSpellChecker()

    def openDirectory(name):
        # Both directories are opened the same way, with unmapping switched off.
        directory = MMapDirectory(File(join(path, name)))
        directory.setUseUnmap(False)
        return directory

    indexDirectory = openDirectory('index')
    taxoDirectory = openDirectory('taxo')

    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_0, settings.analyzer)
    writerConfig.setSimilarity(settings.similarity)
    tieredPolicy = TieredMergePolicy()
    tieredPolicy.setMaxMergeAtOnce(settings.maxMergeAtOnce)
    tieredPolicy.setSegmentsPerTier(settings.segmentsPerTier)
    writerConfig.setMergePolicy(tieredPolicy)

    # Writers are only created (and given an initial commit) for writable indexes.
    if not settings.readonly:
        self._indexWriter = IndexWriter(indexDirectory, writerConfig)
        self._indexWriter.commit()
        taxonomyCache = LruTaxonomyWriterCache(settings.lruTaxonomyWriterCacheSize)
        self._taxoWriter = DirectoryTaxonomyWriter(
            taxoDirectory,
            IndexWriterConfig.OpenMode.CREATE_OR_APPEND,
            taxonomyCache)
        self._taxoWriter.commit()

    self._indexAndTaxonomy = IndexAndTaxonomy(settings, indexDirectory, taxoDirectory)
    self._readerSettingsWrapper = self._indexAndTaxonomy._readerSettingsWrapper
    self._facetsConfig = settings.fieldRegistry.facetsConfig
    self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())
def publish_services(self, service_list):
    """Index every WSDL in *service_list* into the Lucene index at ``index/``.

    Each WSDL is turned into a bag of words by ``WSDLTransformer``; when
    ``self._document_expansion`` is set the words are additionally passed
    through ``self._semantic_transformer``. The bag is stored and analyzed
    in the ``content`` field, and the WSDL value itself is stored (not
    indexed) in the ``path`` field.

    :param service_list: iterable of WSDL items accepted by
        ``WSDLTransformer.transform``.
    :raises: whatever the transformers or Lucene raise; in that case the
        writer is rolled back so it does not stay open.
    """
    transformer = WSDLTransformer()
    index_dir = SimpleFSDirectory(File("index/"))
    writer_config = IndexWriterConfig(
        Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
    writer_config.setSimilarity(BM25Similarity())
    index_writer = IndexWriter(index_dir, writer_config)
    try:
        for wsdl in service_list:
            terms = transformer.transform(wsdl)
            if self._document_expansion:
                terms = self._semantic_transformer.transform(terms)
            bag_of_words = ' '.join(terms)

            doc = Document()
            doc.add(
                Field("content", bag_of_words, Field.Store.YES,
                      Field.Index.ANALYZED))
            doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
            index_writer.addDocument(doc)
    except Exception:
        # Shut the writer down without committing a partial batch, then
        # let the caller see the original error.
        index_writer.rollback()
        raise
    index_writer.close()
def __init__(self, root, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) # print "init done" writer = IndexWriter(store, config) self.testDelete(root, writer) ticker = Ticker() print 'commit index deletion', threading.Thread(target=ticker.run).start() writer.commit() # writer.close() ticker.tick = False print 'done' end["delete"] = datetime.now() - start # writer = IndexWriter(store, config) self.testAdd(root, writer) ticker = Ticker() print 'commit index addition', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def __init__(self, root, storeDir, analyzer): # Create the index dir if it does not exist if not os.path.exists(storeDir): os.mkdir(storeDir) # the SimpleFSDirectory which the index will be written in store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) # create a index writer # atach the index dir and config info to it writer = IndexWriter(store, config) # call the indexing procedure # indexing all the files in the directory specified by root # write the index with writer self.indexDocs(root, writer) # start a ticker ticker = Ticker() print 'commit index' threading.Thread(target=ticker.run).start() writer.commit() writer.close() # stop the ticker when the indexing procedure completes ticker.tick = False print 'Done'
def index(personDB, familyDB, relationDB):
    """Rebuild the match-text index from the person and family collections.

    One document is written per person and per family, with a stored ``uid``
    (the record ``_id`` as a string), a stored ``sex`` (the person's ``sex``
    value, or ``'FAM'`` for families) and an unstored ``text`` field holding
    the match text produced by ``matchtext``.

    :param personDB: collection of persons (``find`` is called on it).
    :param familyDB: collection of families (``find`` is called on it).
    :param relationDB: passed through to the match-text helpers.

    The writer uses the ``analyzer`` and ``indexDir`` names defined outside
    this function and opens the index in ``CREATE`` mode.
    """
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)
    # TODO: the original author left a note to try setRAMBufferSizeMB (50 / 256).
    mt = matchtext()

    def addRecord(uid, sex, text):
        # One Lucene document per person/family record.
        doc = Document()
        doc.add(Field('uid', uid, StringField.TYPE_STORED))
        doc.add(Field('sex', sex, StringField.TYPE_STORED))
        doc.add(Field("text", text, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    persons = personDB.find({}, no_cursor_timeout=True)
    try:
        for p in persons:
            matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
            addRecord(str(p['_id']), str(p['sex']), matchtxt)
    finally:
        # A cursor opened with no_cursor_timeout=True has to be closed
        # explicitly, otherwise it is left open on the server.
        persons.close()

    # Family match text.
    for f in familyDB.find():
        matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
        addRecord(str(f['_id']), 'FAM', matchtxt)

    writer.commit()
    writer.close()
    return
def __init__(self, indexDir):
    """Open a new index writer on *indexDir*.

    The directory is wrapped in a ``SimpleFSDirectory`` kept on ``self._dir``;
    the writer kept on ``self._writer`` uses a ``StandardAnalyzer`` and the
    ``CREATE`` open mode.
    """
    self._dir = SimpleFSDirectory(Paths.get(indexDir))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self._writer = IndexWriter(self._dir, writerConfig)
def __init__(self, root, storeDir, analyzer, type="html"):
    """Build a fresh index in *storeDir* from *root* using the chosen indexer.

    :param root: passed to the selected indexing method.
    :param storeDir: index directory; created when it does not exist.
    :param analyzer: base analyzer, wrapped in a ``LimitTokenCountAnalyzer``
        capped at 1048576 tokens.
    :param type: ``"html"`` (``self.index_html``) or ``"image"``
        (``self.index_image``).
    :raises KeyError: when *type* is neither of the two supported values.
    """
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(Paths.get(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    self.load_stop_words([
        "CNstopwords.txt",
        "ENstopwords.txt",
    ])
    self.html2text = HTML2Text()
    self.html2text.ignore_links = True
    self.html2text.ignore_images = True

    type_to_index = {
        "html": self.index_html,
        "image": self.index_image,
    }
    type_to_index[type](root, writer)

    ticker = Ticker()
    print('commit index')
    threading.Thread(target=ticker.run).start()
    try:
        writer.commit()
        writer.close()
    finally:
        # Stop the progress thread even when the commit fails.
        ticker.tick = False
    print('done')
def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
    """Constructor of Indexer.

    Parameters
    ----------
    index_dir : string
        The location of the lucene index.
    mode : string
        The mode used when opening the lucene index. Available values are:
            'create', open a new index, overwriting any existing index,
            'append', open an existing index and append to it,
            'create_or_append', 'append' if `index_dir` exists, else
            'create'.
    date_format : string
        Datetime fields are saved as strings; `date_format` specifies how
        a datetime is formatted into a string.

    Raises
    ------
    ValueError
        If `mode` is not one of the three supported values. The check is
        done before the index directory is opened.
    """
    valid_modes = ('create_or_append', 'create', 'append')
    if mode not in valid_modes:
        # Format the message; passing mode as a second argument would leave
        # the '%s' placeholder unfilled.
        raise ValueError('Invalid mode %s' % (mode,))

    self.store = FSDirectory.open(Paths.get(index_dir))
    self.analyzer = StandardAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.mode = mode
    self.date_format = date_format
    # The OpenMode constants are the upper-cased mode names.
    self.config.setOpenMode(getattr(IndexWriterConfig.OpenMode, mode.upper()))
    self.writer = IndexWriter(self.store, self.config)
def build_index(document_path, dir_path):
    """Build a Lucene index in *dir_path* from the tag file *document_path*.

    Each input line is expected to look like ``<path> <tag1,tag2,...>``:
    the first space-separated field is stored in ``url`` and the
    comma-separated tags of the second field are joined with spaces and
    indexed in ``content``. Lines with fewer than two fields (including
    blank lines) are skipped.

    :param document_path: path of the text file to read.
    :param dir_path: directory of the index; opened in ``CREATE`` mode.
    :raises: I/O or Lucene errors; the writer is rolled back first so it is
        not left open.
    """
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    # t1: stored, tokenized, indexed with docs and term frequencies.
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # t2: stored, not tokenized; no index options are set on it.
    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    try:
        with open(document_path) as input_file:
            for line in input_file:
                segs = line.strip().split(" ")
                if len(segs) < 2:
                    # Blank or incomplete line: nothing to index.
                    continue
                music_path, music_tags = segs[0], segs[1].split(",")
                document = Document()
                document.add(Field("content", " ".join(music_tags), t1))
                document.add(Field("url", music_path, t2))
                index_writer.addDocument(document)
    except Exception:
        # Shut the writer down without committing a partial index.
        index_writer.rollback()
        raise
    index_writer.close()
def __init__(self, root, storeDir, doIndex=False):
    """Optionally (re)build the index in *storeDir*, then open a searcher on it.

    :param root: directory handed to ``self.indexDocs`` when indexing.
    :param storeDir: index directory; created when it does not exist.
    :param doIndex: when true, the index is rebuilt in ``CREATE`` mode
        before the searcher is opened.
    """
    self.analyzer = StandardAnalyzer()
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    if doIndex:
        store = SimpleFSDirectory(Paths.get(storeDir))
        analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        self.indexDocs(root, writer)

        ticker = Ticker()
        print("commit index")
        threading.Thread(target=ticker.run).start()
        try:
            writer.commit()
            writer.close()
        finally:
            # Stop the progress thread even when the commit fails.
            ticker.tick = False
        print("done")

    directory = SimpleFSDirectory(Paths.get(storeDir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
def build_index(): lucene.initVM() # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/' post_dir = '/Users/w3/data/github/codeif_backup' index_store_dir = current_app.config['INDEX_STORE_DIR'] print post_dir print index_store_dir analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) store = SimpleFSDirectory(File(index_store_dir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) indexDocs(post_dir, writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def __init__(self, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) self.folders = { 'parsed_ctrip': ['source', 'location', 'introduction', 'score', 'img_list'], 'parsed_qunar': ['location', 'rank', 'score', 'time', 'introduction', 'img_list'], 'eic_mfw': ['location', 'introduction', 'img_list'] } self.special_tags = ['introduction'] self.files = self.__getAllPlaces() #self.readers = self.__constructReaders() self.indexDocs(writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def __init__(self, indexDir, doClear=True, computeLengthNorm=False):
    """Open an index writer under ``INDEX_PATH``/*indexDir* and define field types.

    :param indexDir: name joined onto ``INDEX_PATH`` to form ``self.path``.
    :param doClear: accepted but not consulted by this method (the clearing
        call is disabled in the original code).
    :param computeLengthNorm: when false, ``CustomSimilarity`` is installed
        on the writer configuration.
    """
    # The calling thread must be attached to the already running JVM.
    lucene.getVMEnv().attachCurrentThread()

    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    self.config.setRAMBufferSizeMB(256.0)   # flush once 256 MB are buffered
    self.config.setMaxBufferedDocs(10000)   # flush once 10000 docs are buffered
    if not computeLengthNorm:
        self.config.setSimilarity(CustomSimilarity())

    self.path = os.path.join(INDEX_PATH, indexDir)
    self.store = SimpleFSDirectory(File(self.path))
    self.writer = IndexWriter(self.store, self.config)

    def makeFieldType(stored, tokenized, indexOptions):
        # Every field type defined here is indexed.
        fieldType = FieldType()
        fieldType.setIndexed(True)
        fieldType.setStored(stored)
        fieldType.setTokenized(tokenized)
        fieldType.setIndexOptions(indexOptions)
        return fieldType

    # Field type t1: stored, not tokenized.
    self.t1 = makeFieldType(True, False, FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # Field type t2: not stored, tokenized, with positions.
    self.t2 = makeFieldType(
        False, True, FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
def getConfig(self, analyzer):
    """Return an ``IndexWriterConfig`` for *analyzer* with a new deletion policy.

    A fresh ``MyDeletionPolicy`` is created, remembered on ``self.policy``
    and installed on the returned configuration.
    """
    deletionPolicy = MyDeletionPolicy()
    self.policy = deletionPolicy
    writerConfig = IndexWriterConfig(analyzer)
    writerConfig.setIndexDeletionPolicy(deletionPolicy)
    return writerConfig
def __init__(self,root,storeDir,analyzer): # Create the index dir if it does not exist if not os.path.exists(storeDir): os.mkdir(storeDir) # the SimpleFSDirectory which the index will be written in store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer,1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT,analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) # create a index writer # atach the index dir and config info to it writer = IndexWriter(store,config) # call the indexing procedure # indexing all the files in the directory specified by root # write the index with writer self.indexDocs(root,writer) # start a ticker ticker = Ticker() print 'commit index' threading.Thread(target=ticker.run).start() writer.commit() writer.close() # stop the ticker when the indexing procedure completes ticker.tick = False print 'Done'
def getWriter(self, store, analyzer=None, create=False):
    """Return an ``IndexWriter`` on *store* opened in ``CREATE`` mode.

    :param store: the Lucene directory to write to.
    :param analyzer: analyzer to use; a ``StandardAnalyzer`` when ``None``.
    :param create: NOTE(review): accepted but never consulted, so the writer
        is always opened in ``CREATE`` mode regardless of its value.
    """
    chosenAnalyzer = StandardAnalyzer() if analyzer is None else analyzer
    writerConfig = IndexWriterConfig(chosenAnalyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(store, writerConfig)
def __init__(self, index_store_path):
    """Open ``self.writer`` on *index_store_path*.

    The writer sits on an ``NIOFSDirectory``, analyzes with a
    ``StandardAnalyzer`` and uses the ``CREATE_OR_APPEND`` open mode.
    """
    writer_config = IndexWriterConfig(StandardAnalyzer())
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    directory = NIOFSDirectory(Paths.get(index_store_path))
    self.writer = IndexWriter(directory, writer_config)
def __init__(self, store_dir, analyzer, db_path):
    """Open a fresh index writer and load the document ids from the wiki DB.

    :param store_dir: index directory; created when it does not exist.
    :param analyzer: analyzer handed to the writer configuration.
    :param db_path: path passed to ``DocDB``.

    The entity and entity-type vocabularies are initialised with a single
    ``'UNK'`` entry at index 0.
    """
    self.write_type = True
    self.spacy_number_types = [
        'DATE', 'CARDINAL', 'QUANTITY', 'MONEY', 'TIME', 'PERCENT', 'ORDINAL',
    ]

    if not os.path.exists(store_dir):
        os.mkdir(store_dir)
    writer_config = IndexWriterConfig(analyzer)
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(
        SimpleFSDirectory(Paths.get(store_dir)), writer_config)

    # TODO checksum
    self.wiki_db = DocDB(db_path=db_path)
    print('Getting docs..', db_path)
    self.doc_ids = self.wiki_db.get_ner_doc_ids(limit=None)
    print('# wiki docs', len(self.doc_ids))
    # Sanity check against the document count hard-coded by the author.
    assert len(self.doc_ids) == 5075182

    self.UNK = 'UNK'
    self.entity2idx = {self.UNK: 0}
    self.idx2entity = {0: self.UNK}
    self.entitytype2idx = {self.UNK: 0}
    self.entity_dict = dict()
    self.num_entities_max = -1
    print('Init. Done')
def testAdd(self, filepath): config = IndexWriterConfig(Version.LUCENE_CURRENT, self.getAnalyzer()) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer = IndexWriter(self.dir, config) #True,建立新索引,False,建立增量索引 file = open(filepath) contents = unicode(file.read(), 'gbk') file.close() doc = Document() doc.add(Field("name", os.path.basename(filepath), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("path", filepath, Field.Store.YES, Field.Index.NOT_ANALYZED)) if len(contents) > 0: title = self.getTxtAttribute(contents, 'Title') author = self.getTxtAttribute(contents, 'Author') language = self.getTxtAttribute(contents, 'Language') doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("Author", author, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("Language", language, Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("contents", contents, Field.Store.NO, Field.Index.ANALYZED)) else: print "warning: no content in %s" % filename writer.addDocument(doc) writer.close()
def main():
    """Back up the local ``*.py`` files and index them, unless an index exists.

    When ``LUCENE_INDEX_DIR`` does not exist yet, the Python files of the
    current directory are copied into its ``code_files`` sub-directory
    (``robocopy`` on Windows, ``cp`` elsewhere) and ``makeIndex`` is run on
    a new writer. Otherwise nothing is indexed.
    """
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except Exception:
        print('JavaVM already running')

    # Must be checked before the directory object below is created.
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(1024.0)

    if is_index_Exist:
        print('index already exists, stop indexing')
        return

    # write data to index
    print('begin backup code files')
    if platform.system() == 'Windows':
        cmd = 'robocopy %s %s\\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR)
    else:
        # Use the platform path separator: a backslash here would be taken
        # by the shell as an escape and produce a wrong target name.
        cmd = 'cp -f *.py %s' % os.path.join(LUCENE_INDEX_DIR, 'code_files')
    os.system(cmd)

    w = IndexWriter(index_mm, config)
    makeIndex(w)
    w.close()
def main(): try: print "Indexing..." indexDestination = File("/Users/Falcon/Desktop/New_Indices/Stack_Q_Indices") # writer = IndexWriter(SimpleFSDirectory(indexDestination), StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED) analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT)) a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer(), "used_classes": KeywordAnalyzer(), "methods": KeywordAnalyzer(), "class_instance_creation": KeywordAnalyzer(), "methods_called": KeywordAnalyzer(), "view_count": KeywordAnalyzer(), "code_hints": JavaCodeAnalyzer()} #KeywordAnalyzer : 필드의 전체 원문을 하나의 토큰으로 처리 wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a) #PerFieldAnalyzerWrapper : 필드별로 분석기를 지정하는 기능을 지원하는 클래스 config = IndexWriterConfig(Version.LUCENE_CURRENT, wrapper_analyzer) config.setInfoStream(System.out) # 루씬 색인작업 디버깅 // 루크라는 도구를 사용해서 루씬 색인 관리를 할 수도 있음.. writer = IndexWriter(SimpleFSDirectory(indexDestination), config) counter = Counter() index_code_snippet(writer, counter) writer.commit() writer.close() print "Done" print str(counter) except CorruptIndexException as e: # when index is corrupt e.printStackTrace() except LockObtainFailedException as e: # when other writer is using the index e.printStackTrace() except IOException as e: # when directory can't be read/written e.printStackTrace() except SQLException as e: # when Database error occurs e.printStackTrace()
def index (cls, indexDir, taxoDir): """Create an index, and adds to it sample documents and facets. indexDir Directory in which the index should be created. taxoDir Directory in which the taxonomy index should be created. """ # create and open an index writer from org.apache.lucene.util import Version config = IndexWriterConfig(Version.LUCENE_42, WhitespaceAnalyzer(Version.LUCENE_42)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # create and open a taxonomy writer taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE) # FacetFields is a utility class for adding facet fields to a document: facet_fields = FacetFields(taxo) # loop over sample documents nDocsAdded = 0 nFacetsAdded = 0 for docNum in range(len(docTexts)): # obtain the sample facets for current document facets = categories[docNum] facetList = [CategoryPath(f) for f in facets] # NOTE: setCategoryPaths() requires an Iterable, so need to convert the # Python list in order to to pass a proper argument to setCategoryPaths. # We use java.util.Arrays (via JCC) to create a Java List: facetList = Arrays.asList(facetList) # NOTE: we could use lucene.collections here as well in order to convert our # Python list to a Java based list using the JavaList class (JavaList implements # java.util.List around a Python list instance it wraps): # from lucene.collections import JavaList # facetList = JavaList(facetList) # create a plain Lucene document and add some regular Lucene fields to it doc = Document() doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES)) doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO)) # use the FacetFields utility class for adding facet fields (i.e. the categories) # to the document (and, as required, to the taxonomy index) facet_fields.addFields(doc, facetList) # finally add the document to the index iw.addDocument(doc) nDocsAdded +=1 nFacetsAdded += facetList.size() # end for # commit changes. 
# we commit changes to the taxonomy index prior to committing them to the search index. # this is important, so that all facets referred to by documents in the search index # will indeed exist in the taxonomy index. taxo.commit() iw.commit() # close the taxonomy index and the index - all modifications are # now safely in the provided directories: indexDir and taxoDir. taxo.close() iw.close() print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
def createIndexWriter(indexDir):
    """Return a ``CREATE``-mode ``IndexWriter`` on *indexDir*.

    The directory is created when it does not exist; text is analyzed with
    a ``WhitespaceAnalyzer``.
    """
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)
    luceneDirectory = FSDirectory.open(Paths.get(indexDir))
    writerConfig = IndexWriterConfig(WhitespaceAnalyzer())
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(luceneDirectory, writerConfig)
    return writer
def open_writer(self):
    """Open ``self.writer`` in ``CREATE`` mode on ``self.dir``.

    :raises Exception: when a writer is already open.
    """
    if self.writer is not None:
        raise Exception("IndexWriter is already open")
    writer_config = IndexWriterConfig(self.get_analyzer())
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(self.dir, writer_config)
def testDelete(self, fieldName, searchString):
    """Delete every document whose *fieldName* term equals *searchString*.

    A short-lived writer is opened on ``self.dir`` in ``CREATE_OR_APPEND``
    mode and closed again after the deletion.
    """
    limitedAnalyzer = LimitTokenCountAnalyzer(
        WhitespaceAnalyzer(Version.LUCENE_CURRENT), 1048576)
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, limitedAnalyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

    indexWriter = IndexWriter(self.dir, writerConfig)
    target = Term(fieldName, searchString)
    indexWriter.deleteDocuments(target)
    indexWriter.close()
def __init__(self, dir, data_file):
    """Remember the index and data locations and open a ``CREATE``-mode writer.

    :param dir: path of the index directory.
    :param data_file: stored on ``self.data_file``; not read here.
    """
    self.dir = dir
    self.data_file = data_file

    config = IndexWriterConfig(StandardAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    lucene_dir = FSDirectory.open(Paths.get(self.dir))
    self.writer = IndexWriter(lucene_dir, config)
def open_writer(self):
    """Open ``self.writer`` in ``CREATE`` mode on ``self.dir``.

    The configuration is built from ``self.get_version()`` and
    ``self.get_analyzer()``.

    :raises Exception: when a writer is already open.
    """
    if self.writer is not None:
        raise Exception("IndexWriter is already open")
    writer_config = IndexWriterConfig(self.get_version(), self.get_analyzer())
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(self.dir, writer_config)
def _get_writer(self, analyzer=None, create=False):
    """Return an ``IndexWriter`` on ``self._store``.

    :param analyzer: analyzer to use; ``self._analyzer`` when ``None``.
        (Previously this argument was accepted but silently ignored.)
    :param create: when true the index is opened in ``CREATE`` mode;
        otherwise the configuration's default open mode is kept.

    ``self._similarity`` is installed on the configuration when it is set.
    """
    if analyzer is None:
        analyzer = self._analyzer
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    if self._similarity is not None:
        config.setSimilarity(self._similarity)
    return IndexWriter(self._store, config)
def __init__(self, index_dir):
    """Print the Lucene version and open a ``CREATE``-mode writer on *index_dir*.

    Text is analyzed with a ``StandardAnalyzer`` limited to 1048576 tokens.
    """
    print("lucene:", lucene.VERSION)
    self.index_dir = index_dir

    directory = SimpleFSDirectory(Paths.get(self.index_dir))
    limited_analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    writer_config = IndexWriterConfig(limited_analyzer)
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(directory, writer_config)
def _getLucene(self, path):
    """Open the index at *path* and return ``(writer, reader, searcher)``.

    The writer is configured without an analyzer (``None``), with a 256 MB
    RAM buffer and with compound files disabled; the reader is obtained
    from the writer and the searcher wraps that reader.
    """
    writerConfig = IndexWriterConfig(None)
    writerConfig.setRAMBufferSizeMB(256.0)    # faster
    writerConfig.setUseCompoundFile(False)    # faster, for Lucene 4.4 and later

    luceneWriter = IndexWriter(FSDirectory.open(Paths.get(path)), writerConfig)
    luceneReader = luceneWriter.getReader()
    return luceneWriter, luceneReader, IndexSearcher(luceneReader)
def deleteRec(self, pid):
    """Delete the documents whose ``uid`` equals *pid* and refresh the searcher.

    A writer is opened on ``self.indexDir`` in ``APPEND`` mode, the deletion
    is committed, and ``self.searcher`` is replaced by one on a newly opened
    reader.
    """
    writerConfig = IndexWriterConfig(self.analyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    indexWriter = IndexWriter(self.indexDir, writerConfig)

    indexWriter.deleteDocuments(Term('uid', pid))
    indexWriter.commit()
    indexWriter.close()

    freshReader = DirectoryReader.open(self.indexDir)
    self.searcher = IndexSearcher(freshReader)
    return
def __init__(self, indexDir, analyzer):
    """Start the JVM and open ``self.writer`` on *indexDir* with ``mySimilarity``.

    :param indexDir: Lucene directory object handed straight to the writer.
    :param analyzer: analyzer for the writer configuration.
    """
    lucene.initVM()
    logger.info("RAM index")

    config = IndexWriterConfig(analyzer)
    config.setSimilarity(mySimilarity())
    logger.debug('writer similarity func: {}'.format(
        config.getSimilarity()))

    self.writer = IndexWriter(indexDir, config)
def __init__(self, root, analyzer):
    """Index *root* into an in-memory ``RAMDirectory``.

    :param root: handed to ``self.indexDocs``; its return value is kept on
        ``self.numDocs``.
    :param analyzer: base analyzer, wrapped in a ``LimitTokenCountAnalyzer``
        capped at 1048576 tokens.

    The writer is committed and closed before the constructor returns.
    """
    self.store = RAMDirectory()
    self.analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

    writerConfig = IndexWriterConfig(self.analyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(self.store, writerConfig)

    self.numDocs = self.indexDocs(root, self.writer)
    self.writer.commit()
    self.writer.close()
def getLucene(path):
    """Open the index at *path* and return ``(writer, reader, searcher)``.

    The index is sorted on ``NUMERIC_STAMP_FIELD`` (as a long) and analyzed
    with a ``WhitespaceAnalyzer``; the reader comes from the writer.
    """
    luceneDirectory = FSDirectory.open(Paths.get(path))
    writerConfig = IndexWriterConfig(WhitespaceAnalyzer())
    stampField = SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)
    writerConfig.setIndexSort(Sort(stampField))

    luceneWriter = IndexWriter(luceneDirectory, writerConfig)
    luceneReader = luceneWriter.getReader()
    return luceneWriter, luceneReader, IndexSearcher(luceneReader)
def getLucene(path):
    """Open the index at *path* and return ``(writer, reader, searcher)``.

    The index is sorted on ``NUMERIC_STAMP_FIELD`` (as a long) and analyzed
    with a ``WhitespaceAnalyzer``; the reader comes from the writer.
    """
    luceneDirectory = FSDirectory.open(Paths.get(path))
    writerConfig = IndexWriterConfig(WhitespaceAnalyzer())
    stampField = SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)
    writerConfig.setIndexSort(Sort(stampField))

    luceneWriter = IndexWriter(luceneDirectory, writerConfig)
    luceneReader = luceneWriter.getReader()
    return luceneWriter, luceneReader, IndexSearcher(luceneReader)
def __init__(self):
    """Start the JVM and open ``self.index_writer`` on ``config.LUCENE_INDEXED``.

    Text is analyzed with ``PorterStemmerAnalyzer``.
    """
    lucene.initVM()
    luceneDirectory = SimpleFSDirectory(Paths.get(str(config.LUCENE_INDEXED)))
    writerConfig = IndexWriterConfig(PorterStemmerAnalyzer())
    # Create a new index in the directory, removing any previously indexed
    # documents.
    writerConfig.setOpenMode(OpenMode.CREATE)
    self.index_writer = IndexWriter(luceneDirectory, writerConfig)
def __init__(self, store_dir, context, analyzer):
    """Create a fresh index in *store_dir* and index *context* into it.

    :param store_dir: index directory; created when it does not exist.
    :param context: handed to ``self.index``.
    :param analyzer: analyzer for the writer configuration.

    ``self.complete_index()`` is called once indexing has finished.
    """
    if not os.path.exists(store_dir):
        os.mkdir(store_dir)

    writer_config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(SimpleFSDirectory(File(store_dir)), writer_config)

    self.index(context)
    self.complete_index()
def getWriter(self, store, analyzer=None, create=False):
    """Return an ``IndexWriter`` on *store*.

    :param store: the Lucene directory to write to.
    :param analyzer: base analyzer; a ``WhitespaceAnalyzer`` when ``None``.
        It is always wrapped in a ``LimitTokenCountAnalyzer`` capped at
        10000 tokens.
    :param create: when true the index is opened in ``CREATE`` mode;
        otherwise the configuration's default open mode is kept.
    """
    baseAnalyzer = WhitespaceAnalyzer() if analyzer is None else analyzer
    writerConfig = IndexWriterConfig(LimitTokenCountAnalyzer(baseAnalyzer, 10000))
    if create:
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(store, writerConfig)
def getLucene(path):
    """Open the index at *path* and return ``(writer, reader, searcher)``.

    The configuration's merge policy is wrapped in a ``SortingMergePolicy``
    that sorts on ``NUMERIC_STAMP_FIELD`` (as a long); text is analyzed with
    a ``WhitespaceAnalyzer`` and the reader comes from the writer.
    """
    luceneDirectory = FSDirectory.open(File(path))
    writerConfig = IndexWriterConfig(Version.LATEST, WhitespaceAnalyzer())

    basePolicy = writerConfig.getMergePolicy()
    stampSort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG))
    writerConfig.setMergePolicy(SortingMergePolicy(basePolicy, stampSort))

    luceneWriter = IndexWriter(luceneDirectory, writerConfig)
    luceneReader = luceneWriter.getReader()
    return luceneWriter, luceneReader, IndexSearcher(luceneReader)
class Indexer(object):
    """Thin wrapper around a Lucene ``IndexWriter`` opened in ``CREATE`` mode."""

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param root: The output directory of the underlying index
            (defaults to ``"index"``; created when missing)
        :param analyzer: The overloaded analyzer to work with (defaults to
            a ``StandardAnalyzer``); it is wrapped in a
            ``LimitTokenCountAnalyzer`` capped at 1048576 tokens
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        fallback_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        base_analyzer = kwargs.get("analyzer", fallback_analyzer)
        self.analyzer = LimitTokenCountAnalyzer(base_analyzer, 1048576)

        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        Failures are logged and not propagated.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.

        Failures while closing the writer are logged and not propagated.
        """
        try:
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.

        ``field_clean`` is stored and not tokenized; ``field_dirty`` is
        tokenized, not stored, and keeps positions.
        """
        def build(stored, tokenized, index_options):
            # Both field types are indexed.
            field_type = FieldType()
            field_type.setIndexed(True)
            field_type.setStored(stored)
            field_type.setTokenized(tokenized)
            field_type.setIndexOptions(index_options)
            return field_type

        self.field_clean = build(
            True, False, FieldInfo.IndexOptions.DOCS_AND_FREQS)
        self.field_dirty = build(
            False, True, FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
def __init__(self, root, storeDir, analyzer):
    """Build a fresh index in *storeDir* from *root*, then commit and close it.

    :param root: handed to ``self.indexDocs``.
    :param storeDir: index directory; created when it does not exist.
    :param analyzer: analyzer for the writer configuration.
    """
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    indexWriter = IndexWriter(SimpleFSDirectory(File(storeDir)), writerConfig)

    self.indexDocs(root, indexWriter)
    indexWriter.commit()
    indexWriter.close()
class WikiPageIndex():
    """Lucene index of wiki pages with ``Title`` and ``Text`` fields."""

    def __init__(self, index_dir):
        """Prepare the writer configuration and open a searcher on *index_dir*.

        :param index_dir: path of the index directory.

        NOTE(review): a reader is opened on the directory right away, so an
        index presumably has to exist there already; the JVM start-up call
        was disabled in the original code, so it is presumably expected to
        have happened elsewhere.
        """
        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        """Open ``self.writer`` (``CREATE`` mode), creating the directory first."""
        # The directory check has to come before the writer is opened to be
        # of any use.
        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)
        self.writer = IndexWriter(self.directory, self.config)

    def addDocumentToIndex(self, title, text):
        """Add one page; both fields are stored and analyzed."""
        doc = Document()
        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))
        self.writer.addDocument(doc)

    def closeIndex(self):
        """Commit and close ``self.writer``."""
        self.writer.commit()
        self.writer.close()

    def searchIndex(self, queryString, field="Text", max_results=100):
        """Return the documents matching *queryString* in *field*.

        :param queryString: query in Lucene query-parser syntax.
        :param field: default field for the query parser.
        :param max_results: maximum number of hits to fetch.
        :returns: list of the matching Lucene documents.
        """
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(len(scoreDocs), queryString))
        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))
            docs.append(doc)
        return docs

    @staticmethod
    def cleanWikiText(text):
        """Strip wiki markup and non-ASCII characters from *text*.

        ``[[...]]``, ``{{...}}`` and ``{|...|}`` spans are removed, every run
        of characters other than ASCII letters, digits, ``_``, ``-`` and
        newlines becomes one space, and runs of newlines (with surrounding
        blanks) collapse into a single newline.

        :returns: the cleaned text, stripped of leading/trailing whitespace.
        """
        # Drop non-ASCII characters but stay in the text type, so the str
        # patterns below also work on Python 3 (bytes would raise TypeError).
        text = text.encode('ascii', 'ignore').decode('ascii')
        text = re.sub(r'(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub(r'[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub(r'([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
def get_writer(index='index'):
    """Return a ``CREATE_OR_APPEND`` ``IndexWriter`` on the directory *index*.

    Text is analyzed with a ``StandardAnalyzer`` limited to 1048576 tokens.
    """
    directory = SimpleFSDirectory(File(index))
    limited = LimitTokenCountAnalyzer(
        StandardAnalyzer(Version.LUCENE_CURRENT), 1048576)
    writer_config = IndexWriterConfig(Version.LUCENE_CURRENT, limited)
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    return IndexWriter(directory, writer_config)
def __init__(self, indexDir):
    """Open ``self.writer`` on *indexDir* in ``CREATE_OR_APPEND`` mode.

    The directory is created when it does not exist; text is analyzed with
    a ``StandardAnalyzer``.
    """
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)

    directory = SimpleFSDirectory(File(indexDir))
    writerConfig = IndexWriterConfig(
        Version.LUCENE_CURRENT, StandardAnalyzer(Version.LUCENE_CURRENT))
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    self.writer = IndexWriter(directory, writerConfig)
def dummyIndex(self):
    """
    Create a dummy index - to avoid problems updating it

    The index in ``self.indexDir`` is recreated (``CREATE`` mode) with one
    document whose stored ``uid`` is ``'dummy'``.
    """
    writerConfig = IndexWriterConfig(self.analyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    indexWriter = IndexWriter(self.indexDir, writerConfig)

    placeholder = Document()
    placeholder.add(Field('uid', 'dummy', StringField.TYPE_STORED))
    indexWriter.addDocument(placeholder)

    indexWriter.commit()
    indexWriter.close()
    return
def __init__(self, startJVM=False):
    """Open a ``CREATE``-mode writer on the ``index_dir`` directory.

    :param startJVM: when true, the JVM is started (headless) first.

    The analyzer kept on ``self.analyzer`` is a ``StandardAnalyzer`` limited
    to 10000 tokens.
    """
    if startJVM:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    self.STORE_DIR = "index_dir"
    self.store = SimpleFSDirectory(File(self.STORE_DIR))
    self.analyzer = LimitTokenCountAnalyzer(
        StandardAnalyzer(Version.LUCENE_CURRENT), 10000)

    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(self.store, writerConfig)
def buildIndex(self, inputFile): analyzer = self.getAnalyzer() iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter( SimpleFSDirectory( File(self.luceneDir) ), iwconf) # read through input file and write out to lucene counter = 0 linesReadCounter = 0 with open(inputFile, 'r') as lines: linesRead = 0 for line in lines: try: linesRead+=1 if linesRead % 1000 == 0: print "%d lines read" % linesRead cui, concept = line.replace("\",\"", "\t").replace("\"", "").split("\t") concept = concept.strip() cui = cui.strip() strNorm = self.normalizeCasePunct(concept) strSorted = self.sortWords(strNorm) strStemmed = self.stemWords(strNorm) strStemmedSorted = self.stemWords(strSorted) fdoc = Document() counter +=1 fid = counter fdoc.add( Field("id", unicode(fid), Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("cui", cui, Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("str", concept, Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("str_norm", strNorm, Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("str_sorted", strSorted, Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("str_stemmed", strStemmed, Field.Store.YES, Field.Index.NOT_ANALYZED)) fdoc.add( Field("str_stemmedSorted", strStemmedSorted, Field.Store.YES, Field.Index.NOT_ANALYZED)) writer.addDocument(fdoc) if fid % 1000 == 0: writer.commit() except: "Skipping line: %s" % line writer.commit() writer.close()
def __init__(self, root, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1000)#1048576 config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) self.indexDocs(root, writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def rollback(collection_name):
    """
    Open a writer on the collection's index and roll it back.

    :param collection_name: index directory to use; the literal
        ``"DEFAULT"`` selects the module-level ``INDEX_DIR_DEFAULT``.
    """
    index_path = INDEX_DIR_DEFAULT if collection_name == "DEFAULT" else collection_name
    directory = SimpleFSDirectory(File(index_path))
    std_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Writer configuration: append to the index if it exists, else create it.
    writer_config = IndexWriterConfig(Version.LUCENE_CURRENT, std_analyzer)
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)

    index_writer = IndexWriter(directory, writer_config)
    index_writer.rollback()
    index_writer.close()
def run(self):
    """
    Worker loop: start the JVM, then serve requests from ``self.queue``.

    Two field types are prepared once: ``fieldType1`` (indexed, tokenized,
    not stored) and ``fieldType2`` (indexed, stored, not tokenized).

    Each queue item is read as ``(key, message)`` where ``message`` has an
    ``'action'`` (name of a method on this object) and a ``'data'`` dict
    containing at least ``'indexdir'``.  For every item the directory,
    analyzer and writer config are set up on ``self``, the action is
    called with ``message['data']``, and its result (or ``{}`` when it
    returned ``None`` or failed) is stored in ``self.ret[key]``.

    The directory opened for a request is closed whether or not the
    action succeeds.  The loop never returns.
    """
    print("Booting lucene driver worker....")
    lucene.initVM()

    self.fieldType1 = FieldType()
    self.fieldType1.setIndexed(True)
    self.fieldType1.setStored(False)
    self.fieldType1.setTokenized(True)

    self.fieldType2 = FieldType()
    self.fieldType2.setIndexed(True)
    self.fieldType2.setStored(True)
    self.fieldType2.setTokenized(False)

    while True:
        data = self.queue.get()
        da = data[1]
        response = None
        directory = None  # only set once this request's directory is open
        try:
            try:
                self.fil = File(da['data']['indexdir'])
                directory = self.d = NIOFSDirectory(self.fil)
                self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
                self.conf = IndexWriterConfig(
                    Version.LUCENE_CURRENT, self.analyzer)
                # NOTE(review): the action name comes straight from the
                # queue message and selects any attribute of this object;
                # this assumes queue producers are trusted.
                response = getattr(self, da['action'])(da['data'])
            finally:
                # Close the directory on failure too; previously it was
                # only closed when the action returned normally.
                if directory is not None:
                    directory.close()
        except Exception as e:
            # Keep the worker alive; the caller gets an empty response.
            print(e)
        if response is None:
            response = {}
        self.ret[data[0]] = response
def delete(primary_keys_map, collection_name, todelete, commit=False):
    """
    Delete documents matching the primary-key values given in ``todelete``.

    Only keys that appear in ``primary_keys_map`` are used; every such
    key/value pair becomes a MUST clause of the delete query.  Other keys
    are ignored (deletion is currently based on indexed keys only).

    :param primary_keys_map: container of primary key names (tested
        with ``in``).
    :param collection_name: index directory; the literal ``"DEFAULT"``
        selects ``"IndexFiles.index"``.
    :param todelete: JSON text of an object mapping field names to query
        strings.
    :param commit: when truthy, commit explicitly before closing.
    :returns: ``0`` on success, ``100`` when ``todelete`` is not valid
        JSON or is not a JSON object, ``105`` when the index writer or
        reader cannot be opened.
    """
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except Exception:
        return 100
    # Anything but an object cannot carry key/value pairs; reject it here
    # instead of failing later with the index writer already open.
    if not isinstance(tofind_keyvalue_pairs, dict):
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Setting writer configurations.
    writer = None
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        # Opening a reader is kept as a check that the index is readable;
        # it is closed right away because nothing else uses it.
        IndexReader.open(direc).close()
    except Exception:
        if writer is not None:
            try:
                # Release the write lock without committing anything.
                writer.rollback()
            except Exception:
                pass  # best effort: the caller still gets code 105
        return 105

    try:
        # Filter documents according to primary keys.
        query = BooleanQuery()
        for key, value in tofind_keyvalue_pairs.items():
            if key in primary_keys_map:
                parsed = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(value)
                query.add(BooleanClause(parsed, BooleanClause.Occur.MUST))
        writer.deleteDocuments(query)
        if commit:
            writer.commit()
    finally:
        writer.close()
    return 0
def __init__(self, destination_directory, analyzer):
    """
    Index tweets into a fresh index at ``destination_directory``.

    :param destination_directory: index directory; created if missing.
    :param analyzer: base analyzer, wrapped so that at most 1048576
        tokens are used.

    ``self.tweetIndexer`` is given the open writer.  While the final
    commit and close run, a ``Ticker`` runs on a background thread.  If
    indexing, commit or close fails, the writer is rolled back (so the
    lock is released without committing pending changes) and the ticker
    is told to stop before the error propagates.
    """
    if not os.path.exists(destination_directory):
        os.mkdir(destination_directory)
    store = SimpleFSDirectory(File(destination_directory))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    ticker = None
    try:
        self.tweetIndexer(writer)
        ticker = Ticker()
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
    except BaseException:
        # Drop uncommitted work and free the write lock, then re-raise.
        writer.rollback()
        raise
    finally:
        # Presumably Ticker.run loops while ``tick`` is true; clear it on
        # every exit path so the background thread is not left running.
        if ticker is not None:
            ticker.tick = False
    print('done')
def __init__(self, fileRoot, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setSimilarity(similarities.BM25Similarity()) #Available similarity: BM25Similarity, MultiSimilarity, PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) self.indexDocs(fileRoot, writer) print 'commit index', writer.commit() writer.close() print 'done'
def __init__(self, root, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) # Store index files in the file syste. try NIOFSDirectory analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) # maxTokenCount=1048576, this analyzer limit the number of tokens per field, not necessary for indexing MEDLINE config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) self.indexDocs(root, writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def createIndexWriter(self, actual_dir, max_field_length=20000000):
    """
        Create and return an empty IndexWriter for ``actual_dir``.

        The directory is ensured to exist, the writer uses a
        StandardAnalyzer with FieldAgnosticSimilarity in CREATE mode, and
        all documents are deleted before the writer is handed back.

        ``max_field_length`` is accepted but not used by this body.
    """
    ensureDirExists(actual_dir)
    directory = SimpleFSDirectory(File(actual_dir))
    std_analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)

    config = IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, std_analyzer)
    config.setSimilarity(FieldAgnosticSimilarity())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

    index_writer = IndexWriter(directory, config)
    index_writer.deleteAll()
    return index_writer
def index_wiki(wiki_xmlfile, index_directory_name):
    """
    Index the documents yielded by ``wikicorpusxml(wiki_xmlfile)``.

    For every yielded XML string one document is written with three
    stored, indexed fields: ``contents`` (text between the first ``>``
    and the following ``<``), ``title`` and ``url`` (values of the
    ``title="..."`` / ``url="..."`` attributes).  ``contents`` and
    ``title`` are tokenized, ``url`` is not.

    :param wiki_xmlfile: passed through to ``wikicorpusxml``.
    :param index_directory_name: directory of the index, opened in
        CREATE mode.
    """
    def stored_field_type(tokenized):
        # Indexed + stored field with docs, freqs and positions.
        field_type = FieldType()
        field_type.setIndexed(True)
        field_type.setStored(True)
        field_type.setTokenized(tokenized)
        field_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        return field_type

    lucene.initVM()

    # Index directory, analyzer and writer.
    lucene_version = Version.LUCENE_CURRENT
    directory = FSDirectory.open(File(index_directory_name))
    std_analyzer = StandardAnalyzer(lucene_version)
    writer_config = IndexWriterConfig(lucene_version, std_analyzer)
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(directory, writer_config)

    # One field type per document field.
    content_type = stored_field_type(True)
    title_type = stored_field_type(True)
    url_type = stored_field_type(False)

    for raw_xml in wikicorpusxml(wiki_xmlfile):
        body = raw_xml.partition('>')[2].partition('<')[0].strip()
        heading = raw_xml.partition(' title="')[2].partition('"')[0].strip()
        link = raw_xml.partition(' url="')[2].partition('"')[0].strip()

        entry = Document()
        entry.add(Field("contents", body, content_type))
        entry.add(Field("title", heading, title_type))
        entry.add(Field("url", link, url_type))
        index_writer.addDocument(entry)

    index_writer.commit()
    index_writer.close()
def getFreqVectorFromText(self, text):
    """
    Compute the term-frequency vector of ``text`` via a scratch index.

    ``text`` is written as the only document of a freshly created index
    in ``res/index`` (analyzed with a StopAnalyzer built from an empty
    stop-word list, term vectors enabled).  The term vector of that
    document is then read back and stored on the instance:

    * ``self.vector`` - terms, ordered by descending frequency
    * ``self.freqs``  - the matching frequencies

    Both lists are empty when no term vector is available for the
    document (presumably the case when the text yields no terms).

    :param text: the text to analyse.
    """
    # Attach the calling thread to the running JVM before touching Lucene.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    indexDir = "res/index"
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, [])
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(indexDir))

    # The scratch index is always rebuilt from nothing.
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iwriter = IndexWriter(directory, conf)
    try:
        doc = Document()
        doc.add(Field("docName", 'url', Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("content", text, Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.YES))
        iwriter.addDocument(doc)
    finally:
        iwriter.close()

    freqVector = []
    ireader = IndexReader.open(directory)
    try:
        docVector = ireader.getTermVector(0, "content")
        if docVector is not None:
            termsEnum = docVector.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                freqVector.append((term.utf8ToString(), termsEnum.totalTermFreq()))
    finally:
        # The reader used to be left open after every call.
        ireader.close()

    # Stable sort: terms with equal frequency keep their enumeration order.
    freqVector.sort(key=itemgetter(1), reverse=True)
    self.vector = [term for term, _ in freqVector]
    self.freqs = [freq for _, freq in freqVector]