def init_lucene_search(): lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION print 'Index ', INDEX_DIR base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) # current dir directory = SimpleFSDirectory(File(INDEX_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet()) return searcher, analyzer
def __init__(self, path=INDEX_DIR):
    """Start the JVM and wire up the Lucene search stack over *path*.

    Opens an FSDirectory at *path*, creates the SmartChineseAnalyzer used
    for query parsing, and builds a DirectoryReader/IndexSearcher pair.
    """
    lucene.initVM()
    store = SimpleFSDirectory(Paths.get(path))
    reader = DirectoryReader.open(store)
    self.indir = store
    self.analyzer = SmartChineseAnalyzer()
    self.reader = reader
    self.searcher = IndexSearcher(reader)
def searchResults(command):
    """Run *command* against the on-disk index "./index_2" and return hits.

    A fresh reader/searcher is opened per call.

    Returns:
        The `results` part of run()'s (num, results) pair.

    Fix over the original: the DirectoryReader is now closed on exit -
    previously only the searcher was `del`-eted, leaking the reader's
    open index file handles.
    """
    STORE_DIR = "./index_2"
    directory = SimpleFSDirectory(Paths.get(STORE_DIR))
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    analyzer = SmartChineseAnalyzer()
    try:
        num, results = run(searcher, analyzer, command)
        return results
    finally:
        reader.close()
def __init__(self):
    """Create an in-memory (RAMDirectory) Lucene writer for Chinese text.

    The index is rebuilt from scratch (OpenMode.CREATE) and scored with
    the project-specific mySimilarity().
    """
    ram_dir = RAMDirectory()
    cfg = IndexWriterConfig(SmartChineseAnalyzer())
    # CREATE drops any documents left over from a previous run.
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    cfg.setSimilarity(mySimilarity())
    logger.debug('search similarity:{}'.format(cfg.getSimilarity()))
    self.indexDir = ram_dir
    self.writer = IndexWriter(ram_dir, cfg)
def index(request):
    """Django view: look up the fixed query "你好" in the QA index.

    Attaches the current thread to the running JVM (or boots one on first
    use), searches the "question" field, and returns the top hit's
    "answer" field as the HTTP response body.

    Fixes over the original:
      * the entire search body was duplicated verbatim in both branches
        of the VM check - collapsed into one copy;
      * an empty result set no longer raises IndexError; an empty
        response is returned instead;
      * the unused `base_dir` locals were removed.
    """
    vm_env = lucene.getVMEnv()
    if vm_env:
        # JVM already running (e.g. started by an earlier request) - attach.
        vm_env.attachCurrentThread()
    else:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = SimpleFSDirectory(
        Paths.get("/Users/css/nlplearn/yuliao/index1"))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    ana = SmartChineseAnalyzer()
    command = "你好"
    query = QueryParser("question", ana).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    if not scoreDocs:
        # No matching document: previously this crashed on scoreDocs[0].
        del searcher
        return HttpResponse('')
    doc = searcher.doc(scoreDocs[0].doc)
    del searcher
    return HttpResponse(doc.get("answer").encode('utf-8'))
def __init__(self, root, storeDir, analyzer):
    # Build (or rebuild) an on-disk Lucene index of the documents under
    # *root*, writing the index into *storeDir*.
    # NOTE(review): the *analyzer* argument is immediately discarded below
    # and replaced with a SmartChineseAnalyzer - confirm this is intended.
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    # analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    # use smart chinese analyzer
    analyzer = SmartChineseAnalyzer(
        Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet())
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    # CREATE: wipe any existing index and start fresh.
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.indexDocs(root, writer)
    # Ticker prints a heartbeat on a background thread while commit runs.
    ticker = Ticker()
    print 'commit index',
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    # Stopping the ticker's loop by flag; the thread exits on its own.
    ticker.tick = False
    print 'done'
def search(command):
    """Search the on-disk "index" directory for *command*.

    A memory-mapped directory is opened per call.

    Returns:
        Whatever run() produces for the query.

    Fixes over the original: the DirectoryReader is now closed on exit
    (previously only the searcher was `del`-eted, leaking the reader's
    file mappings), and the dead commented-out driver code was removed.
    """
    STORE_DIR = "index"
    directory = MMapDirectory(Paths.get(STORE_DIR))
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    analyzer = SmartChineseAnalyzer()
    try:
        return run(searcher, analyzer, command)
    finally:
        reader.close()
def __init__(self, lang):
    """Attach to the JVM and open a searcher over the SSQA index.

    Only Chinese ('zh') is wired up here; any other language code raises
    ValueError.
    """
    lucene.initVM()
    # Guard clause: bail out early on unsupported languages.
    if lang != 'zh':
        raise ValueError(
            'lang should be "zh" or "en", {} is invalid!'.format(lang))
    index_dir = SimpleFSDirectory(Paths.get(str(config.IDX_SSQA)))
    self.analyzer = SmartChineseAnalyzer()
    self.reader = DirectoryReader.open(index_dir)
    self.searcher = IndexSearcher(self.reader)
    self.searcher.setSimilarity(mySimilarity())
    logger.debug('search similarity func: {}'.format(
        self.searcher.getSimilarity()))
def __init__(self, lang):
    """Attach to the JVM and create an IndexWriter over the SSQA index.

    Only Chinese ('zh') is wired up here; any other language code raises
    ValueError.
    """
    lucene.initVM()
    # Guard clause: bail out early on unsupported languages.
    if lang != 'zh':
        raise ValueError(
            'lang should be "zh" or "en", {} is invalid!'.format(lang))
    logger.info("index directory:{}".format(config.IDX_SSQA))
    index_dir = SimpleFSDirectory(Paths.get(str(config.IDX_SSQA)))
    writer_cfg = IndexWriterConfig(SmartChineseAnalyzer())
    writer_cfg.setSimilarity(mySimilarity())
    logger.debug('writer similarity func: {}'.format(
        writer_cfg.getSimilarity()))
    self.writer = IndexWriter(index_dir, writer_cfg)
def __init__(self, path):
    """Open a Lucene searcher over *path* and prepare a Word2Vec model.

    Loads a previously trained 'w2v.model' from disk when present;
    otherwise trains a fresh one via self.model_train().
    """
    print('Searcher initialized...')
    self.path = path
    # self.analyzer = WhitespaceAnalyzer(Version.LATEST)
    self.analyzer = SmartChineseAnalyzer()
    index_store = SimpleFSDirectory(Paths.get(self.path))
    self.reader = DirectoryReader.open(index_store)
    self.searcher = IndexSearcher(self.reader)
    # THULAC tokenizer; '/' separates each word from its POS tag.
    self.thu = thulac.thulac(deli='/')
    model_file = Path('w2v.model')
    if model_file.is_file():
        print('Model was already trained...loading model')
        self.w2v_model = Word2Vec.load('w2v.model')
    else:
        self.model_train()
        print('Model trained...')
def __init__(self, Lid, db_path=config.DB_SSQA):
    """Index one lesson's paragraphs in RAM and open a searcher over them.

    Args:
        Lid: lesson id to pull from the SSQA database.
        db_path: path of the SSQA database file.
    """
    lucene.initVM()
    self.db = SSQA_DB(db_path)
    # Split the raw lesson text into paragraphs and index them in memory.
    parags = str_lesson2parags(self.db.get_lesson_str(Lid))
    indexer = _ChineseRamIndexer()
    indexer.index_lesson(parags)
    indexer.close()
    self.reader = DirectoryReader.open(indexer.indexDir)
    self.searcher = IndexSearcher(self.reader)
    self.searcher.setSimilarity(mySimilarity())
    self.analyzer = SmartChineseAnalyzer()
    logger.debug('search similarity:{}'.format(
        self.searcher.getSimilarity()))
def __init__(self, root, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = SimpleFSDirectory(File(storeDir)) # analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) # use smart chinese analyzer analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet()) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(store, config) self.indexDocs(root, writer) ticker = Ticker() print 'commit index', threading.Thread(target=ticker.run).start() writer.commit() writer.close() ticker.tick = False print 'done'
def index_and_search_sentence(list_paragraph, question):
    """Index every '#'-separated sentence of *list_paragraph* in a RAM
    index, then return the sentence(s) best matching *question*.

    Returns:
        The top-1 search result from SSQA_S_Searcher.search().

    Fixes over the original:
      * the `mySearcher.close()` placed after `return ret_sents` was
        unreachable;
      * the `finally` clause referenced `mySearcher` even when indexing
        failed before the searcher was created, raising NameError and
        masking the real exception.
    """
    ramDir = RAMDirectory()
    analyzer = SmartChineseAnalyzer()
    myIndexer = SSQA_S_Indexer(ramDir, analyzer)
    mySearcher = None
    try:
        sent_num = 0
        logger.info("Start indexing sentences...")
        for paragraph in tqdm(list_paragraph):
            for sent in re.split('#', paragraph):
                myIndexer.add(sent)
                sent_num += 1
        logger.info("Indexed {} sentences.".format(sent_num))
        # Flush the writer before opening a searcher on the same RAMDirectory.
        myIndexer.close()
        mySearcher = SSQA_S_Searcher(ramDir, analyzer)
        return mySearcher.search(question, 1)
    finally:
        myIndexer.close()
        if mySearcher is not None:
            mySearcher.close()
def __init__(self, root, storeDir):
    """Index every document under *root* into the FS index at *storeDir*."""
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    directory = SimpleFSDirectory(Paths.get(storeDir))
    # analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # Cap tokens per field so a runaway document stays bounded.
    analyzer = LimitTokenCountAnalyzer(SmartChineseAnalyzer(), 1048576)
    cfg = IndexWriterConfig(analyzer)
    # config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(directory, cfg)
    self.indexDocs(root, writer)
    # Heartbeat on a background thread while the commit runs here.
    heartbeat = Ticker()
    print('commit index', )
    threading.Thread(target=heartbeat.run).start()
    writer.commit()
    writer.close()
    heartbeat.tick = False
    print('done')
def searchResults(command):
    """Ensure a JVM is attached to this thread, then search "index".

    Returns:
        (num, results) from run().

    Fixes over the original:
      * the bare `except:` (which would also swallow KeyboardInterrupt and
        SystemExit) is narrowed to `except Exception:`;
      * the DirectoryReader is now closed instead of being leaked;
      * the large commented-out `__main__` driver was removed.
    """
    STORE_DIR = "index"
    vm_env = getenv()
    try:
        vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except Exception:
        # JVM already initialized elsewhere - just attach this thread.
        vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(Paths.get(STORE_DIR))
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    analyzer = SmartChineseAnalyzer()
    try:
        num, results = run(searcher, analyzer, command)
        return num, results
    finally:
        reader.close()
# NOTE(review): this chunk begins mid-way through an indexDocs() helper -
# the enclosing def/for/try lines are outside this view, so the leading
# indentation below is reconstructed.
            # Decode the file as UTF-8, then build a Document with
            # name/path (field type t1) and, when non-empty, contents (t2).
            contents = unicode(file.read(), 'utf-8')
            file.close()
            doc = Document()
            doc.add(Field("name", filename, t1))
            doc.add(Field("path", root, t1))
            if len(contents) > 0:
                doc.add(Field("contents", contents, t2))
            else:
                print "warning: no content in %s" % filename
            writer.addDocument(doc)
        except Exception, e:
            # Best-effort indexing: report the failure and keep going.
            print "Failed in indexDocs:", e


# Script entry point: index the directory named on the command line into
# INDEX_DIR using a SmartChineseAnalyzer, timing the whole run.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print IndexFiles.__doc__
        sys.exit(1)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        IndexFiles(sys.argv[1], INDEX_DIR,
                   SmartChineseAnalyzer(Version.LUCENE_CURRENT,
                                        SmartChineseAnalyzer.getDefaultStopSet()))
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
        raise e
# NOTE(review): this chunk begins mid-way through an indexDocs() helper -
# the enclosing def/for/try lines are outside this view, so the leading
# indentation below is reconstructed.
            # Build a Document with name/path (field type t1) and, when
            # non-empty, contents (t2).
            doc = Document()
            doc.add(Field("name", filename, t1))
            doc.add(Field("path", root, t1))
            if len(contents) > 0:
                doc.add(Field("contents", contents, t2))
            else:
                print "warning: no content in %s" % filename
            writer.addDocument(doc)
        except Exception, e:
            # Best-effort indexing: report the failure and keep going.
            print "Failed in indexDocs:", e


# Script entry point: index the directory named on the command line into
# INDEX_DIR using a SmartChineseAnalyzer, timing the whole run.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print IndexFiles.__doc__
        sys.exit(1)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    start = datetime.now()
    try:
        # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        IndexFiles(
            sys.argv[1], INDEX_DIR,
            SmartChineseAnalyzer(Version.LUCENE_CURRENT,
                                 SmartChineseAnalyzer.getDefaultStopSet()))
        end = datetime.now()
        print end - start
    except Exception, e:
        print "Failed: ", e
        raise e
# NOTE(review): this chunk begins mid-way through a run(searcher, analyzer)
# interactive query loop - the enclosing def/while lines are outside this
# view, so the leading indentation below is reconstructed.
        # Prompt for a query; an empty line exits the loop.
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        command = unicode(command, 'utf-8')
        if command == '':
            return
        print
        print "Searching for:", command
        # Parse against the "contents" field and show the top 50 hits.
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print doc.get("name1"), doc.get("name2")


# Script entry point: open the on-disk "index" directory and start the
# interactive query loop.
if __name__ == '__main__':
    STORE_DIR = "index"
    vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    vm_env.attachCurrentThread()
    print 'lucene', lucene.VERSION
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    del searcher