def build_index():
    f = open(data_dir)
    reader = csv.reader(f)
    print("Building index...")
    indx = 0
    writer = lucene.IndexWriter(directory, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    for line in reader:
        eng, zh = line[0], line[1]
        doc = lucene.Document()
        doc.add(lucene.Field('eng', eng,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('zh', zh,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        if indx % 100000 == 0:
            print("%sK" % (indx / 1000))
        indx += 1
    f.close()
    print("Optimizing index writer...")
    writer.optimize()
    writer.close()
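# `build_index` reads its inputs from module-level globals. A minimal driver
# sketch, assuming PyLucene 3.x and hypothetical file names ('dict.csv'
# holding eng,zh rows, 'dict_index' as the index directory):
import csv
import lucene

lucene.initVM()
data_dir = 'dict.csv'  # hypothetical CSV of eng,zh pairs
directory = lucene.SimpleFSDirectory(lucene.File('dict_index'))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
build_index()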
def testAdd(self, filepath):
    # True = build a new index; False = update the existing index incrementally
    writer = lucene.IndexWriter(self.dir, self.getAnalyzer(), False,
                                lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    file = open(filepath)
    contents = unicode(file.read(), 'gbk')
    file.close()
    doc = lucene.Document()
    doc.add(lucene.Field("name", os.path.basename(filepath),
                         lucene.Field.Store.YES,
                         lucene.Field.Index.NOT_ANALYZED))
    doc.add(lucene.Field("path", filepath,
                         lucene.Field.Store.YES,
                         lucene.Field.Index.NOT_ANALYZED))
    if len(contents) > 0:
        title = self.getTxtAttribute(contents, 'Title')
        author = self.getTxtAttribute(contents, 'Author')
        language = self.getTxtAttribute(contents, 'Language')
        doc.add(lucene.Field("Title", title,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field("Author", author,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field("Language", language,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field("contents", contents,
                             lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
    else:
        print "warning: no content in %s" % filepath
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
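# `testAdd` leans on a `getTxtAttribute` helper that is not shown here. One
# plausible sketch for the same class, assuming Gutenberg-style "Key: value"
# header lines in the text (the regex and empty-string fallback are assumptions):
import re

def getTxtAttribute(self, contents, attribute):
    # Look for a "Title: ..." / "Author: ..." style header line.
    match = re.search(r'^%s:\s*(.*)$' % attribute, contents, re.MULTILINE)
    return match.group(1).strip() if match else u''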
def UpdateIndex(self):
    # Refresh the index with the latest contents
    writer = lucene.IndexWriter(self.indexDir, self.analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))
    try:
        # Walk the rows fetched from the DB
        for row in self.rows:
            doc = lucene.Document()
            doc.add(lucene.Field("bookUrl", row[0],
                                 lucene.Field.Store.YES, lucene.Field.Index.NO))
            doc.add(lucene.Field("thumbUrl", row[1],
                                 lucene.Field.Store.YES, lucene.Field.Index.NO))
            doc.add(lucene.Field("price", row[2],
                                 lucene.Field.Store.YES, lucene.Field.Index.NO))
            doc.add(lucene.Field("title", row[3],
                                 lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
            doc.add(lucene.Field("subTitle", row[4],
                                 lucene.Field.Store.YES, lucene.Field.Index.NO))
            doc.add(lucene.Field("author", row[5],
                                 lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
            doc.add(lucene.Field("publisher", row[6],
                                 lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
            doc.add(lucene.Field("publishDate", row[7],
                                 lucene.Field.Store.YES, lucene.Field.Index.NO))
            doc.add(lucene.Field("offcode", row[8],
                                 lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
            # Normalize dates from "YYYY-MM-DD" to "YYYYMMDD"
            date = ''.join(str(row[9]).split('-'))
            print 'regDate : ' + date + ' ' + str(type(date))
            doc.add(lucene.Field("regDate", date,
                                 lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
            date = ''.join(str(row[10]).split('-'))
            print 'updateDate : ' + date
            doc.add(lucene.Field("updateDate", date,
                                 lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
    except Exception, e:
        print "Failed in adding index : %s" % e
        exit(1)
def __init__(self, network):
    self.network = network
    smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
    #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
    analyzers = {"smartcn": smartcn}
    self.pgconn = mypass.getConn()
    writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33,
                                            analyzers["smartcn"])
    writerconfig.setWriteLockTimeout(600000L)
    writerconfig.setMaxThreadStates(50)
    writerconfig.setRAMBufferSizeMB(128.0)
    self.storeDir = self.storeDirBase + self.network
    store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
    self.writer = lucene.IndexWriter(store, writerconfig)
def __init__(self, root, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = lucene.SimpleFSDirectory(lucene.File(storeDir))
    writer = lucene.IndexWriter(store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.LIMITED)
    writer.setMaxFieldLength(1048576)
    self.indexDocs(root, writer)
    ticker = Ticker()
    print 'optimizing index',
    threading.Thread(target=ticker.run).start()
    writer.optimize()
    writer.close()
    ticker.tick = False
    print 'done'
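# This constructor (and the date-filtered variant further down) uses a `Ticker`
# to show progress while `optimize()` runs. A minimal sketch, modeled on the
# Ticker in PyLucene's IndexFiles.py sample:
import sys
import time

class Ticker(object):
    """Prints a dot every second until the caller clears `tick`."""
    def __init__(self):
        self.tick = True

    def run(self):
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)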
def __init__(self):
    smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
    #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
    analyzers = {"smartcn": smartcn}
    self.pgconn = mypass.getConn()
    self.sw = sinaweibooauth.SinaWeiboOauth()
    if not os.path.exists(self.storeDir):
        os.mkdir(self.storeDir)
    store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
    writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33,
                                            analyzers["smartcn"])
    writerconfig.setWriteLockTimeout(600000L)
    writerconfig.setMaxThreadStates(50)
    writerconfig.setRAMBufferSizeMB(128.0)
    self.writer = lucene.IndexWriter(store, writerconfig)
def __init__(self, index_dir):
    '''Initialises index parameters'''
    lucene.initVM()
    self.index_dir = index_dir
    if not os.path.exists(self.index_dir):
        os.mkdir(self.index_dir)
    store = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
    self.analyser = PorterStemmerAnalyzer()
    self.writer = lucene.IndexWriter(store, self.analyser, True,
                                     lucene.IndexWriter.MaxFieldLength.LIMITED)
    self.writer.setMaxFieldLength(1048576)
    directory = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
    self.reader = lucene.FilterIndexReader.open(directory, True)
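# `PorterStemmerAnalyzer` is a custom class, not part of PyLucene itself. A
# plausible definition following the usual PythonAnalyzer extension pattern;
# the exact filter chain is an assumption:
import lucene

class PorterStemmerAnalyzer(lucene.PythonAnalyzer):
    # Tokenize, lowercase, then Porter-stem each token.
    def tokenStream(self, fieldName, reader):
        result = lucene.StandardTokenizer(lucene.Version.LUCENE_CURRENT, reader)
        result = lucene.StandardFilter(result)
        result = lucene.LowerCaseFilter(result)
        return lucene.PorterStemFilter(result)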
def index_files(board, time_delta):
    store = lucene.SimpleFSDirectory(
        lucene.File(BOARDSPATH + board + '/' + RECENT_INDEX))
    writer = lucene.IndexWriter(
        store,
        lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT),
        True,
        lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    # writer.setMaxFieldLength(1048576)  # 1MB

    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue
        f = open(path, 'r')
        contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            f.close()
            debug(filename)
            continue
        f.close()
        if len(contents) > 0:
            doc = lucene.Document()
            doc.add(lucene.Field("name", filename,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("owner", owner,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("title", title,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("contents", contents,
                                 lucene.Field.Store.NO,
                                 lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
def get_word_list(text, is_list=False, field_name='fieldname'):
    if is_list:
        text = '\n'.join(text) + '\n'

    lucene.initVM(lucene.CLASSPATH)
    analyzer = lucene.KoreanAnalyzer()
    #directory = lucene.FSDirectory.open("/tmp/testindex")
    directory = lucene.RAMDirectory()

    # Write a single document holding the text
    writer = lucene.IndexWriter(directory, analyzer)
    doc = lucene.Document()
    doc.add(lucene.Field(field_name, text,
                         lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    # Enumerate all terms in the index along with their frequencies
    ireader = lucene.IndexReader.open(directory, False)
    term = lucene.Term(field_name, '')
    termenum = ireader.terms(term)
    term = termenum.term()
    i = 0
    word_list = []
    while term and term.field() == field_name:
        i += 1
        termDocs = ireader.termDocs(term)
        termDocs.next()
        #print "[%04d]===> <%s> " % (i, term.text())
        #print term.text() + " : " + str(termDocs.freq())
        word_list.append({'text': term.text(), 'freq': termDocs.freq()})
        term = termenum.next() and termenum.term()
    ireader.close()
    directory.close()
    return word_list
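# A quick usage sketch for `get_word_list`; the sample strings are placeholders
# and the KoreanAnalyzer extension must be available on the classpath:
words = get_word_list([u'first sample sentence', u'second sample sentence'],
                      is_list=True)
for entry in sorted(words, key=lambda w: -w['freq']):
    print '%s\t%d' % (entry['text'], entry['freq'])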
def __init__(self, root, storeDir, analyzer, startDate, endDate):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = lucene.SimpleFSDirectory(lucene.File(storeDir))
    # Create the IndexWriter: the first argument is the Directory, the second
    # the analyzer; the third says whether to create a fresh index (False
    # means modify the existing one); the fourth caps how many tokens are
    # indexed per field, e.g. MaxFieldLength(2) stops after two terms.
    # IndexWriter.MaxFieldLength.LIMITED is the usual choice.
    writer = lucene.IndexWriter(store, analyzer, False,
                                lucene.IndexWriter.MaxFieldLength.LIMITED)
    writer.setMaxFieldLength(1048576)
    self.indexDocs(root, writer, startDate, endDate)
    ticker = Ticker()
    print 'optimizing index',
    threading.Thread(target=ticker.run).start()
    writer.optimize()
    writer.close()
    ticker.tick = False
    print 'done'
def main1():
    print "started indexing sample files......"
    direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    config = lucene.IndexWriterConfig(lucene.Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(lucene.IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = lucene.IndexWriter(direc, config)

    # fix this later..... FieldType is not defined (it only exists in Lucene 4.x)
    #field_type = lucene.FieldType()
    #field_type.setIndexed(True)
    #field_type.setStored(False)
    #field_type.setTokenized(False)

    # Index both sample files the same way: "name" is searchable but not
    # stored, while the raw JSON is stored in "data" for retrieval.
    for json_file in ("nitin.json", "nitin2.json"):
        file1 = open(json_file)
        data = file1.read()
        contents = json.loads(data)
        doc = lucene.Document()
        doc.add(lucene.Field("name", contents['name'],
                             lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field("data", data,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        writer.addDocument(doc)
        file1.close()

    writer.optimize()
    print "Indexed and optimized %d documents" % writer.numDocs()
    writer.close()
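# A matching lookup sketch that finds a document by "name" and decodes the
# stored JSON; assumes `direc` and `analyzer` handles are kept around and the
# writer has already been closed:
def lookup_by_name(direc, analyzer, name_query):
    searcher = lucene.IndexSearcher(direc, True)
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT,
                               "name", analyzer).parse(name_query)
    for hit in searcher.search(query, 10).scoreDocs:
        stored = searcher.doc(hit.doc).get("data")
        print json.loads(stored)['name']
    searcher.close()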
def IndexCreate(fileDir, indexDir):
    # Create the objects Lucene needs
    analyzer = lucene.StandardAnalyzer()
    store = lucene.FSDirectory.getDirectory(indexDir)
    writer = lucene.IndexWriter(store, analyzer)

    # Walk the given folder, indexing only the text files
    for root, dirnames, filenames in os.walk(fileDir):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print("Adding: %s" % filename)
            try:
                path = os.path.join(root, filename)
                f = open(path)
                content = f.read()
                f.close()
                # Re-encode the content as utf-8
                content = content.decode('cp949').encode('utf-8')
                doc = lucene.Document()  # build one Document per file
                doc.add(lucene.Field("name",  # file name
                                     filename,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NO))
                doc.add(lucene.Field("path",  # file path
                                     path,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NO))
                if len(content) > 0:
                    doc.add(lucene.Field("content",  # file content
                                         content,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.TOKENIZED))
                else:
                    print("Warning: No contents in %s" % filename)
                writer.addDocument(doc)  # add the Document to the index
            except Exception, e:
                print("Failed in adding index: %s" % e)
    writer.close()  # flush pending documents and release the write lock
def UpdateIndex(self):
    "Refresh the index with the latest contents"
    # Time of the last indexing run (None means nothing was indexed yet)
    self.lastIndexingTime = self.__ReadLatestUpdateTime()
    writer = lucene.IndexWriter(self.store, self.analyzer,
                                lucene.IndexWriter.MaxFieldLength(1048576))
    for root, dirnames, filenames in os.walk(self.blogDir):
        for filename in filenames:
            if not filename.endswith('.txt'):
                # index txt files only
                continue
            path = os.path.join(root, filename)
            if (self.lastIndexingTime != None and
                    self.lastIndexingTime >= int(os.stat(path).st_mtime)):
                continue  # this file is already in the index
            print("Adding: %s" % filename)
            try:
                f = open(path)
                content = f.read()
                f.close()
                doc = lucene.Document()
                doc.add(lucene.Field(
                    "bloger",
                    # the directory containing the file names the blogger
                    path.rsplit("\\", 2)[1],
                    lucene.Field.Store.YES,
                    lucene.Field.Index.UN_TOKENIZED))
                doc.add(lucene.Field(
                    "path", path,
                    lucene.Field.Store.YES,
                    lucene.Field.Index.UN_TOKENIZED))
                doc.add(lucene.Field(
                    "contents", content,
                    lucene.Field.Store.NO,
                    lucene.Field.Index.TOKENIZED))
                writer.addDocument(doc)
            except Exception, e:
                print("Failed in adding index: %s" % e)
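# `__ReadLatestUpdateTime` is not shown. A plausible pair of helpers for the
# same class, assuming the last-run time is kept in a hypothetical
# `self.timestampFile` path next to the index:
import time

def __ReadLatestUpdateTime(self):
    # Return the stored epoch seconds, or None if we have never indexed.
    try:
        with open(self.timestampFile) as f:
            return int(f.read())
    except (IOError, ValueError):
        return None

def __WriteLatestUpdateTime(self):
    # Record the current time after a successful indexing run.
    with open(self.timestampFile, 'w') as f:
        f.write('%d' % time.time())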
def handle_noargs(self, **options):
    siteconfig = SiteConfiguration.objects.get_current()

    # Refuse to do anything if they haven't turned on search.
    if not siteconfig.get("search_enable"):
        sys.stderr.write('Search is currently disabled. It must be '
                         'enabled in the Review Board administration '
                         'settings to run this command.\n')
        sys.exit(1)

    if not have_lucene:
        sys.stderr.write('PyLucene is required to build the search index.\n')
        sys.exit(1)

    incremental = options.get('incremental', True)

    store_dir = siteconfig.get("search_index_file")
    if not os.path.exists(store_dir):
        os.mkdir(store_dir)

    timestamp_file = os.path.join(store_dir, 'timestamp')
    timestamp = 0
    if incremental:
        try:
            f = open(timestamp_file, 'r')
            timestamp = datetime.utcfromtimestamp(int(f.read()))
            f.close()
        except IOError:
            incremental = False

    f = open(timestamp_file, 'w')
    f.write('%d' % time.time())
    f.close()

    if lucene_is_2x:
        store = lucene.FSDirectory.getDirectory(store_dir, False)
        writer = lucene.IndexWriter(store, False, lucene.StandardAnalyzer(),
                                    not incremental)
    elif lucene_is_3x:
        store = lucene.FSDirectory.open(lucene.File(store_dir))
        writer = lucene.IndexWriter(
            store,
            lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT),
            not incremental,
            lucene.IndexWriter.MaxFieldLength.LIMITED)
    else:
        assert False

    status = Q(status='P') | Q(status='S')
    objects = ReviewRequest.objects.filter(status)

    if incremental:
        query = Q(last_updated__gt=timestamp)
        # FIXME: re-index based on reviews once reviews are indexed. I
        # tried ORing this in, but it doesn't seem to work.
        # Q(review__timestamp__gt=timestamp)
        objects = objects.filter(query)

    if sys.stdout.isatty():
        print 'Creating Review Request Index'

    totalobjs = objects.count()
    i = 0
    prev_pct = -1

    for request in objects:
        try:
            # Remove the old documents from the index
            if incremental:
                writer.deleteDocuments(lucene.Term('id', str(request.id)))

            self.index_review_request(writer, request)

            if sys.stdout.isatty():
                i += 1
                pct = (i * 100 / totalobjs)
                if pct != prev_pct:
                    sys.stdout.write(" [%s%%]\r" % pct)
                    sys.stdout.flush()
                    prev_pct = pct
        except Exception, e:
            sys.stderr.write('Error indexing ReviewRequest #%d: %s\n' %
                             (request.id, e))
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))

    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index same file twice or index two files named the same')

    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(dir, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue

        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id,
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name,
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)

        # The definition text frequently contains URLs or other hyperlinks
        # that could match queries we do not want to hit, errantly increasing
        # the score of the field. Strip out the hyperlinks and index just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)

        # Synonyms, relationships, xrefs, subsets, and alternate IDs are all
        # represented as lists in our Ontology object and need to be entered
        # one at a time
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)
        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(doc, "xref",
                               [replace_xref_identifier(x, xref_map) for x in term.xrefs],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(doc, "relationship",
                               [" ".join(list(x)) for x in list(term.relationships)],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
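# The add_field_to_document / add_fields_to_document helpers are not shown in
# this snippet. A minimal sketch consistent with the call sites above, where
# the optional trailing argument is assumed to be a per-field boost:
def add_field_to_document(doc, name, value, store, index, boost=None):
    field = lucene.Field(name, value or u'', store, index)
    if boost is not None:
        field.setBoost(boost)
    doc.add(field)

def add_fields_to_document(doc, name, values, store, index, boost=None):
    # Add one Field instance per value so each is indexed separately.
    for value in values or []:
        add_field_to_document(doc, name, value, store, index, boost)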
#LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Creates a sample Lucene index for the full-text search feature.
"""
import lucene
import sys

if __name__ == "__main__":
    lucene.initVM()
    indexDir = "D:/Downloads/index"
    dir_ = lucene.SimpleFSDirectory(lucene.File(indexDir))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = lucene.IndexWriter(dir_, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))
    print("Currently there are %d documents in the index..." % writer.numDocs())

    content = (
        "Strategische Konzeption, Umsetzung und Betreuung von langfristig "
        "hochwirksamen und messbar erfolgreichen Maßnahmen in Social Media.")
    doc = lucene.Document()
    doc.add(lucene.Field("content", content,
                         lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED))
    doc.add(lucene.Field("filePath", "Projekte/bericht.txt",
                         lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()  # commit the new document and release the write lock
def addMessage(self, username, xprotocol, xfriend_chat, who_sent,
               timestamp, text):
    # Clean up protocol and friend_chat fields
    """ For some unknown reason, PyLucene (and probably Lucene as well)
        seems to have problems searching for things like SoAndSo but has
        no problems searching for soandso. To prevent headaches in the
        future we simply set it all to lowercase since the case does not
        matter for these fields."""
    protocol = xprotocol.lower()
    friend_chat = xfriend_chat.lower()

    # Determine index and data paths
    index_dir = self.indexdir + username
    data_dir = self.datadir + username + PATH_SEP + protocol + PATH_SEP
    data_file = data_dir + friend_chat

    # If the index doesn't exist, we use a special constructor to create it
    if not os.path.isdir(index_dir):
        os.makedirs(index_dir)
        luc_index = lucene.FSDirectory.getDirectory(index_dir, True)
        luc_writer = lucene.IndexWriter(luc_index,
                                        lucene.StandardAnalyzer(), True)
    else:
        luc_index = lucene.FSDirectory.getDirectory(index_dir)
        luc_writer = lucene.IndexWriter(luc_index, lucene.StandardAnalyzer())

    # Opening the index before writing to the file gives us a lock
    # on the index. As long as writing to data files occurs only
    # through this function, this is guaranteed to be an atomic
    # operation. Closing the writer releases the lock.
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    # filesize is used to determine the file offset
    if not os.path.isfile(data_file):
        filesize = 0
    else:
        filesize = os.path.getsize(data_file)

    datahandle = open(data_file, 'a')
    datahandle.write(str(who_sent))
    datahandle.write("\n")
    datahandle.write(str(timestamp))
    datahandle.write("\n")
    datahandle.write(str(len(str(text))))  # what a mess
    datahandle.write("\n")
    datahandle.write(str(text))
    datahandle.write("\n")
    datahandle.close()

    doc = lucene.Document()
    doc.add(self.__makeKeywordField('protocol', str(protocol)))
    doc.add(self.__makeKeywordField('friend_chat', str(friend_chat)))
    clean_timestamp = self.__padTimestamp(timestamp)
    doc.add(self.__makeKeywordField('timestamp', clean_timestamp))
    doc.add(self.__makeKeywordField('who_sent', str(who_sent)))
    doc.add(self.__makeUnIndexedField('file_offset', str(filesize)))
    clean_text = re.sub("<[^>]*>", " ", str(text))  # strip HTML tags
    doc.add(self.__makeUnStoredField('text', clean_text))
    luc_writer.addDocument(doc)
    luc_writer.close()
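# The private field helpers used above are not shown. They appear to mirror
# the classic Lucene 2.x Field.Keyword / Field.UnIndexed / Field.UnStored
# factories; a minimal sketch for the same class under that assumption
# (__padTimestamp's pad width of 12 is also an assumption):
def __makeKeywordField(self, name, value):
    # Stored and un-tokenized: suitable for exact-match lookups.
    return lucene.Field(name, value, lucene.Field.Store.YES,
                        lucene.Field.Index.UN_TOKENIZED)

def __makeUnIndexedField(self, name, value):
    # Stored only: retrievable, but never searched.
    return lucene.Field(name, value, lucene.Field.Store.YES,
                        lucene.Field.Index.NO)

def __makeUnStoredField(self, name, value):
    # Tokenized only: searchable, but not retrievable from the index.
    return lucene.Field(name, value, lucene.Field.Store.NO,
                        lucene.Field.Index.TOKENIZED)

def __padTimestamp(self, timestamp):
    # Zero-pad so lexicographic order matches numeric order.
    return str(timestamp).zfill(12)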
import os, re, sys, lucene
from BeautifulSoup import BeautifulSoup
import lxml.html

SYMBOL = '[-_\s\n,.<>/?:;\"\'\[\]{ }\\\|`~!@#$%^&*()=\+]+'
pattern = re.compile(SYMBOL)

# Pretreatment
INDEX_DIR = 'index'
lucene.initVM()
directory = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
writer = lucene.IndexWriter(directory, analyzer, True,
                            lucene.IndexWriter.MaxFieldLength.UNLIMITED)

def visible(element):
    # Skip text nodes that a browser would not render
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element)):
        return False
    return True

def get_page_num(file):
    tmp = file.split('/')
    return tmp[1] + '/' + tmp[2]
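# A sketch of how these pieces might combine; `index_page` is a hypothetical
# helper, not part of the original module:
def index_page(path):
    # Keep only the visible text nodes, then normalize separators with the
    # module-level `pattern` before handing the text to the global writer.
    soup = BeautifulSoup(open(path).read())
    texts = filter(visible, soup.findAll(text=True))
    contents = ' '.join(pattern.split(' '.join(texts)))
    doc = lucene.Document()
    doc.add(lucene.Field('page', get_page_num(path),
                         lucene.Field.Store.YES,
                         lucene.Field.Index.NOT_ANALYZED))
    doc.add(lucene.Field('contents', contents,
                         lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED))
    writer.addDocument(doc)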
def begin_indexing(self, session, index):
    # will append if exists, or create if not
    if not self.writer:
        self.writer = lucene.IndexWriter(
            self.dir, self.analyzer,
            lucene.IndexWriter.MaxFieldLength.UNLIMITED)
if __name__ == '__main__':
    INDEX_DIR = "/home/andrew/lucene_index"

    # Initialize lucene and the JVM
    lucene.initVM()
    print("lucene version is: %s" % lucene.VERSION)

    # Get the analyzer
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Get index storage
    store = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))

    # Get index writer
    writer = lucene.IndexWriter(store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.LIMITED)
    try:
        # Create a document to add to the index
        doc = lucene.Document()
        # Add a field to this document; the original snippet omitted the
        # required field value, so "india" here is a stand-in
        field = lucene.Field("titlendia", "india",
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED)
        doc.add(field)
        # Add the document to the index
        writer.addDocument(doc)
    finally:
        # The original snippet ended inside the try block; closing the writer
        # here commits the document and releases the write lock
        writer.close()
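    # A matching search sketch against the same index, using only standard
    # PyLucene 3.x calls; the query string is a placeholder:
    searcher = lucene.IndexSearcher(store, True)
    query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT,
                               "titlendia", analyzer).parse("india")
    topDocs = searcher.search(query, 10)
    print("total hits: %s" % topDocs.totalHits)
    for hit in topDocs.scoreDocs:
        print(searcher.doc(hit.doc).get("titlendia"))
    searcher.close()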