def __init__(self, root, storeDir, analyzer):
    """Build a Lucene index of the documents under `root` into `storeDir`.

    The writer's open mode is chosen by the module-level INDEX_MODE flag:
    1 = APPEND, 2 = CREATE, anything else = CREATE_OR_APPEND.
    """
    # Create the index directory on first run.
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    # Cap tokens per field so a single oversized document cannot blow up the index.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    if 1 == INDEX_MODE:  # APPEND
        config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    elif 2 == INDEX_MODE:  # CREATE
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:  # CREATE_OR_APPEND
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    # print "init done"
    writer = IndexWriter(store, config)
    # print "init 2 done"
    self.indexDocs(root, writer)
    # Ticker prints progress while the (potentially slow) commit/close runs.
    ticker = Ticker()
    print '\ncommit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    # Stop the ticker once commit/close completes.
    ticker.tick = False
    print 'done'
def __init__(self, root, storedir, isindexing=False, isBM25=True):
    """Open (and optionally rebuild) a Lucene index, then attach a searcher.

    When `isindexing` is true the index at `storedir` is rebuilt from the
    documents under `root`. Either way a searcher over `storedir` is opened.
    `isBM25` switches both the writer and the searcher to BM25 similarity.
    """
    if not os.path.exists(storedir):
        os.mkdir(storedir)
    # Cap tokens per field so one huge document cannot dominate the index.
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    if isindexing:
        index_dir = SimpleFSDirectory(Paths.get(storedir))
        writer_cfg = IndexWriterConfig(self.analyzer)
        # TODO BM25 parameter tuning
        if isBM25:
            writer_cfg.setSimilarity(BM25Similarity())
        writer_cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        index_writer = IndexWriter(index_dir, writer_cfg)
        self.indexer(root, index_writer)
        progress = Ticker()
        print('commit index')
        threading.Thread(target=progress.run).start()
        index_writer.commit()
        index_writer.close()
        progress.tick = False
        print('done')
    # Open the searcher over the (possibly freshly built) index.
    reader_dir = SimpleFSDirectory(Paths.get(storedir))
    self.searcher = IndexSearcher(DirectoryReader.open(reader_dir))
    if isBM25:
        self.searcher.setSimilarity(BM25Similarity())
def __init__(self, root, storeDir, analyzer, type="html"):
    """Build a CREATE-mode index of the content under `root`.

    `type` selects the indexing routine: "html" or "image".
    """
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    index_dir = SimpleFSDirectory(Paths.get(storeDir))
    capped = LimitTokenCountAnalyzer(analyzer, 1048576)
    cfg = IndexWriterConfig(capped)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_dir, cfg)
    # Stop-word lists for both Chinese and English content.
    self.load_stop_words([
        "CNstopwords.txt",
        "ENstopwords.txt",
    ])
    # HTML-to-text converter, stripped of links and images.
    self.html2text = HTML2Text()
    self.html2text.ignore_links = True
    self.html2text.ignore_images = True
    # Dispatch table keyed on the requested content type.
    dispatch = {
        "html": self.index_html,
        "image": self.index_image,
    }
    dispatch[type](root, writer)
    progress = Ticker()
    print('commit index')
    threading.Thread(target=progress.run).start()
    writer.commit()
    writer.close()
    progress.tick = False
    print('done')
def __init__(self, root, storeDir, doIndex=False):
    """Optionally (re)build the index at `storeDir`, then open a searcher."""
    self.analyzer = StandardAnalyzer()
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    if doIndex:
        index_dir = SimpleFSDirectory(Paths.get(storeDir))
        # Cap tokens per field before configuring the writer.
        capped = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        cfg = IndexWriterConfig(capped)
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index_dir, cfg)
        self.indexDocs(root, writer)
        progress = Ticker()
        print("commit index")
        threading.Thread(target=progress.run).start()
        writer.commit()
        writer.close()
        progress.tick = False
        print("done")
    # Always attach a searcher, whether or not we just re-indexed.
    search_dir = SimpleFSDirectory(Paths.get(storeDir))
    self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
def __init__(self, root, storeDir, analyzer):
    """Benchmark index deletion, then re-addition, on the same writer.

    The deletion phase's elapsed time is recorded into the module-level
    `end` dict, measured against the module-level `start` timestamp.
    """
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    # Cap tokens per field to bound memory use.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    # print "init done"
    writer = IndexWriter(store, config)
    # Phase 1: delete documents and commit, but keep the writer open for reuse.
    self.testDelete(root, writer)
    ticker = Ticker()
    print 'commit index deletion',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    # writer.close()
    ticker.tick = False
    print 'done'
    end["delete"] = datetime.now() - start
    # writer = IndexWriter(store, config)
    # Phase 2: add documents with the same writer, then close it.
    self.testAdd(root, writer)
    ticker = Ticker()
    print 'commit index addition',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def build_index(): lucene.initVM() # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/' post_dir = '/Users/w3/data/github/codeif_backup' index_store_dir = current_app.config['INDEX_STORE_DIR'] print post_dir print index_store_dir analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) store = SimpleFSDirectory(File(index_store_dir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) indexDocs(post_dir, writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def __init__(self, root, storeDir_good, storeDir_bad, analyzer):
    """Build two CREATE-mode indexes, routing documents to 'good' or 'bad'.

    `indexDocs` decides which of the two writers each document goes to.
    """
    if not os.path.exists(storeDir_good):
        os.mkdir(storeDir_good)
    if not os.path.exists(storeDir_bad):
        os.mkdir(storeDir_bad)
    store_good = SimpleFSDirectory(File(storeDir_good))
    store_bad = SimpleFSDirectory(File(storeDir_bad))
    # Cap tokens per field; the capped analyzer is shared by both writers.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    # Each writer needs its own config instance.
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    config1 = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config1.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer_good = IndexWriter(store_good, config)
    writer_bad = IndexWriter(store_bad, config1)
    self.indexDocs(root, writer_good, writer_bad)
    # Ticker prints progress while both indexes are committed and closed.
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer_good.commit()
    writer_good.close()
    writer_bad.commit()
    writer_bad.close()
    ticker.tick = False
    print 'done'
def getWriter(self, directory=None, analyzer=None, open_mode=None,
              similarity=None, maxBufferedDocs=None, mergePolicy=None):
    """Build an IndexWriter, defaulting any option that is not supplied.

    Defaults: whitespace analyzer capped at 10k tokens, CREATE open mode,
    and this fixture's own directory.
    """
    if analyzer is None:
        analyzer = LimitTokenCountAnalyzer(
            WhitespaceAnalyzer(self.TEST_VERSION), 10000)
    config = self.getConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE
                       if open_mode is None else open_mode)
    if similarity is not None:
        config.setSimilarity(similarity)
    if maxBufferedDocs is not None:
        config.setMaxBufferedDocs(maxBufferedDocs)
    if mergePolicy is not None:
        config.setMergePolicy(mergePolicy)
    target = self.directory if directory is None else directory
    return IndexWriter(target, config)
def __init__(self, storeDir, analyzer):
    """Index scraped place data from several source folders.

    `self.folders` maps each source folder name to the field names its
    files provide; `self.special_tags` lists fields the indexing code
    treats specially.
    """
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    # Cap tokens per field to bound memory use.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Per-source field layout.
    self.folders = {
        'parsed_ctrip': ['source', 'location', 'introduction', 'score', 'img_list'],
        'parsed_qunar': ['location', 'rank', 'score', 'time', 'introduction', 'img_list'],
        'eic_mfw': ['location', 'introduction', 'img_list']
    }
    self.special_tags = ['introduction']
    self.files = self.__getAllPlaces()
    #self.readers = self.__constructReaders()
    self.indexDocs(writer)
    # Ticker prints progress while commit/close runs.
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print 'done'
def index(): # Initialize lucene and the JVM # lucene.initVM() GLOBALDIRECTORY = getDirectory() #Indexwriter config analyzer = StandardAnalyzer() analyzer = LimitTokenCountAnalyzer(analyzer, tokenCount) config = IndexWriterConfig(analyzer) writer = IndexWriter(GLOBALDIRECTORY, config) fileNames = getTxtFile(textfileDirectory) #creates document for each tweet fileNames = getTxtFile(textfileDirectory) #creates document for each tweet for file in fileNames: data = getData(file) for tweets in data: if 'text' in tweets: doc = createDocument_tweet(tweets) writer.addDocument(doc) # add the document to IndexWriter print file print "\nNumber of indexed documents: %d" % writer.numDocs( ) #number of documents indexed for testing writer.close() print "Indexing done!\n" print "------------------------------------------------------" return GLOBALDIRECTORY
def __init__(self, root, storeDir, analyzer):
    """Index every file under `root` into a fresh CREATE-mode index at `storeDir`."""
    # Create the index dir if it does not exist
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    # the SimpleFSDirectory which the index will be written in
    store = SimpleFSDirectory(File(storeDir))
    # Cap tokens per field to bound memory use.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    # create an index writer,
    # attaching the index dir and config info to it
    writer = IndexWriter(store, config)
    # call the indexing procedure:
    # indexing all the files in the directory specified by root,
    # writing the index with writer
    self.indexDocs(root, writer)
    # start a ticker to show progress during commit/close
    ticker = Ticker()
    print 'commit index'
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    # stop the ticker when the indexing procedure completes
    ticker.tick = False
    print 'Done'
def testDelete(self, fieldName, searchString):
    """Delete every document whose `fieldName` term equals `searchString`."""
    base = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    # Cap tokens per field, matching the writer used for indexing.
    capped = LimitTokenCountAnalyzer(base, 1048576)
    cfg = IndexWriterConfig(Version.LUCENE_CURRENT, capped)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(self.dir, cfg)
    writer.deleteDocuments(Term(fieldName, searchString))
    writer.close()
def searchWithTerm(self, query):
    """
    Search an index with facets by using simple term query
    return a list of FacetResult instances
    """
    capped = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    parsed = QueryParser("content", capped).parse(query)
    return self.searchWithQuery(parsed)
def __init__(self, index_dir):
    """Open a CREATE-mode IndexWriter over `index_dir`."""
    print("lucene:", lucene.VERSION)
    self.index_dir = index_dir
    fs_dir = SimpleFSDirectory(Paths.get(self.index_dir))
    # Standard analyzer, capped at ~1M tokens per field.
    capped = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    cfg = IndexWriterConfig(capped)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(fs_dir, cfg)
def __init__(self):
    """Attach a BM25 searcher over the on-disk index, plus NLP helpers."""
    index_path = './IndexFiles.index'
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    fs_dir = SimpleFSDirectory(Paths.get(index_path))
    self.searcher = IndexSearcher(DirectoryReader.open(fs_dir))
    # Use BM25 ranking instead of the default similarity.
    self.searcher.setSimilarity(BM25Similarity())
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    self.lemmatizer = nltk.stem.WordNetLemmatizer()
def __init__(self, root, analyzer):
    """Build an in-memory (RAM-backed) index of the documents under `root`."""
    self.store = RAMDirectory()
    self.analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    cfg = IndexWriterConfig(self.analyzer)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(self.store, cfg)
    # indexDocs returns how many documents it added.
    self.numDocs = self.indexDocs(root, self.writer)
    self.writer.commit()
    self.writer.close()
def getWriter(store, analyzer=None, create=False):
    """Return an IndexWriter over `store`.

    `analyzer` defaults to a whitespace analyzer capped at 10M tokens per
    field; `create=True` truncates any existing index (CREATE open mode).
    """
    if analyzer is None:
        analyzer = WhitespaceAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, 10000000)
    config = IndexWriterConfig(analyzer)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    # FIX: removed stray debug `print(store, config)` left over from development.
    writer = IndexWriter(store, config)
    return writer
def index_files(): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() BASE_DIR = path.dirname(path.abspath(sys.argv[0])) INPUT_DIR = BASE_DIR + "/input/" INDEX_DIR = BASE_DIR + "/lucene_index/" NoT = 100000 # Number of Tokens print "------------------------------------------------------" print "PyLucene Demo started (lucene_demo.py)" print "Python version: %d.%d.%d" % ( sys.version_info.major, sys.version_info.minor, sys.version_info.micro) print 'Lucene version:', lucene.VERSION print "------------------------------------------------------\n" # lucene.initVM() # directory = RAMDirectory() index_path = Paths.get(INDEX_DIR) directory = SimpleFSDirectory(index_path) analyzer = StandardAnalyzer() analyzer = LimitTokenCountAnalyzer(analyzer, NoT) config = IndexWriterConfig(analyzer) writer = IndexWriter(directory, config) print "Number of indexed documents: %d\n" % writer.numDocs() for input_file in listdir(INPUT_DIR): # iterate over all input files print "Current file:", input_file if input_file.endswith(".json"): with open(INPUT_DIR + input_file) as f: for line in f: # doc = create_document(line, input_file) # call the create_document function o = json.loads(line) doc = Document() # create a new document doc.add(TextField("filename", input_file, Field.Store.YES)) # print file doc.add( TextField("username", o['user']['screen_name'], Field.Store.YES)) # print "username: "******"text", o['text'], Field.Store.YES)) # print "text: " + o['text'] if o['user']['location']: doc.add( TextField("location", o['user']['location'], Field.Store.YES)) # print "location: " + o['user']['location'] doc.add(TextField("time", o['created_at'], Field.Store.YES)) writer.addDocument( doc) # add the document to the IndexWriter print "\nNumber of indexed documents: %d" % writer.numDocs() writer.close() print "Finished\n" print "-----------------------------------------------------"
def _initialize(self):
    """Prepare the on-disk index: directory, analyzer, config, and writer."""
    # Make sure the index directory exists before opening it.
    if not os.path.exists(self.path):
        os.mkdir(self.path)
    # Wrap the configured analyzer with a ~1M-token-per-field cap.
    self._analyzer = LimitTokenCountAnalyzer(self._analyzer, 1048576)
    self._store = SimpleFSDirectory(Paths.get(self.path))
    self._config = IndexWriterConfig(self._analyzer)
    # CREATE truncates: a fresh index is built on every run.
    self._config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self._writer = IndexWriter(self._store, self._config)
    self._set_fieldtypes()
def getWriter(store, analyzer=None, create=False):
    """Return an IndexWriter over `store` (StandardAnalyzer by default)."""
    base = StandardAnalyzer() if analyzer is None else analyzer
    # Cap tokens per field at 10k.
    capped = LimitTokenCountAnalyzer(base, 10000)
    config = IndexWriterConfig(capped)
    # config.setInfoStream(PrintStreamInfoStream(System.out))
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(store, config)
def getWriter(self, store, analyzer=None, create=False):
    """Return an IndexWriter over `store` (whitespace analyzer by default)."""
    base = (WhitespaceAnalyzer(Version.LUCENE_CURRENT)
            if analyzer is None else analyzer)
    # Cap tokens per field at 10k.
    capped = LimitTokenCountAnalyzer(base, 10000)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, capped)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(store, config)
def __init__(self, docDir, indexDir, analyzer):
    """Create a CREATE-mode index at `indexDir` and index the docs in `docDir`.

    NOTE(review): the `analyzer` parameter is accepted but a capped
    StandardAnalyzer is used instead — confirm this is intentional.
    """
    # set index dir
    if not os.path.exists(indexDir):
        os.makedirs(indexDir)
    self.indexDir = SimpleFSDirectory(Paths.get(indexDir))
    self.docDir = docDir
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    cfg = IndexWriterConfig(self.analyzer)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(self.indexDir, cfg)
    self.indexing()
def __init__(self, indexDir, root="testdocs"):
    """Index the documents under `root`, tracking author/title/error counts."""
    # create and open an index writer
    fs_dir = FSDirectory.open(Paths.get(indexDir))
    # TODO make appropriate analyzer add to config
    capped = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    cfg = IndexWriterConfig(capped)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(fs_dir, cfg)
    # Counters updated as indexDocs processes each document.
    self.authorcount = 0
    self.titlecount = 0
    self.errorcount = 0
    self.indexDocs(root, writer)
def testAdd(self, goodname, salenum, price, shopname, url, picturename,
            comment, historyprice):
    """Add one product document to the index.

    Good/shop names are stored verbatim (un-indexed) and also segmented
    with jieba into companion `_s` fields that are indexed for search.
    NOTE(review): the writer is never committed or closed here — confirm
    the caller is responsible for that.
    """
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(self.dir, config)  # True = new index, False = incremental index
    # Field type: stored, but neither tokenized nor indexed.
    noIndexedString = FieldType()
    noIndexedString.setTokenized(False)
    noIndexedString.setIndexed(False)
    noIndexedString.setStored(True)
    try:
        print "adding", goodname
        goodname_s = unicode(goodname, 'utf8')
        seg_list_good = jieba.cut(goodname_s, cut_all=False)
        goodname_s = " ".join(seg_list_good)  # accurate (default) segmentation mode
        shopname_s = unicode(shopname, 'utf8')
        seg_list_shop = jieba.cut(shopname_s, cut_all=False)
        shopname_s = " ".join(seg_list_shop)  # accurate (default) segmentation mode
        shopnameField = Field("shopName", shopname, noIndexedString)
        shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO)
        goodnameField = Field("goodName", goodname, noIndexedString)
        goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO)
        salenumField = IntField("saleNum", salenum, Field.Store.YES)
        priceField = DoubleField("price", price, Field.Store.YES)
        urlField = Field("url", url, noIndexedString)
        pictureField = StringField("pictureName", picturename, Field.Store.YES)
        commentField = Field("comments", comment, noIndexedString)
        historyPriceField = Field("historyPrice", historyprice, noIndexedString)
        doc = Document()
        doc.add(shopnameField)
        doc.add(shopnameField_s)
        doc.add(goodnameField)
        doc.add(goodnameField_s)
        doc.add(salenumField)
        doc.add(priceField)
        doc.add(urlField)
        doc.add(pictureField)
        doc.add(commentField)
        doc.add(historyPriceField)
        writer.addDocument(doc)
    except Exception, e:
        print "Failed in indexDocs:", e
def __init__(self, sentences, base_dir):
    """Build a fresh sentence index under `base_dir`/INDEX_DIR."""
    # The JVM may already be running (repeated construction) — ignore failure.
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except:
        pass
    index_path = os.path.join(base_dir, INDEX_DIR)
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    fs_dir = SimpleFSDirectory(Paths.get(index_path))
    capped = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    cfg = IndexWriterConfig(capped)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(fs_dir, cfg)
    self.indexsents(sentences, writer)
def __init__(self, root, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(Paths.get(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) self.indexDocs(root, writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def __init__(self, storeDir, analyzer, function): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) eval("self." + function + "(writer)") ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print '索引建立完成.'
def _index_files(storeDir, indexFile):
    """Index the entries listed in `indexFile` into a fresh index at `storeDir`."""
    # jieba segmentation is used downstream by _index_docs.
    jieba.initialize()
    fs_store = SimpleFSDirectory(File(storeDir))
    base = SimpleAnalyzer(Version.LUCENE_CURRENT)
    capped = LimitTokenCountAnalyzer(base, 1048576)
    cfg = IndexWriterConfig(Version.LUCENE_CURRENT, capped)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(fs_store, cfg)
    _index_docs(indexFile, writer)
    print('commit index')
    writer.commit()
    writer.close()
    print('done')
def get_writer(self, store_dir, analyzer):
    '''
    Generate an `IndexWriter` according to the parameters.

    Input:
        `store_dir`: directory to store the Lucene index
        `analyzer`: analyzer used to analyze the docs
    Output:
        `IndexWriter` with the correct parameters
    '''
    # Create the index directory on first use.
    if not os.path.exists(store_dir):
        os.mkdir(store_dir)
    fs_dir = SimpleFSDirectory(Paths.get(store_dir))
    # Cap tokens per field, then configure a CREATE-mode writer.
    capped = LimitTokenCountAnalyzer(analyzer, 1048576)
    cfg = IndexWriterConfig(capped)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(fs_dir, cfg)
def __init__(self, dataFilePath, storeDir): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer = IndexWriter(store, config) self.indexDocs(dataFilePath, writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'