def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith('.DS_Store'):
                continue
            print("adding " + filename)
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'iso-8859-1')
                #contents = file.read()
                print(contents)
                file.close()
                doc = lucene.Document()
                doc.add(lucene.Field("name", filename,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("path", path,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                if len(contents) > 0:
                    doc.add(lucene.Field("contents", contents,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.ANALYZED))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception, e:
                # str(e) is needed: concatenating a str and an exception raises TypeError
                print("Failed in indexDocs: " + str(e))
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):  # walk the files under testfolder
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                # decode the file to unicode before processing; the source docs
                # are assumed to be GBK-encoded
                contents = unicode(file.read(), 'gbk')
                print contents  # the file content now lives in contents
                file.close()
                doc = lucene.Document()  # create a Document representing the file to index
                doc.add(lucene.Field("name", filename,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("path", path,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                if len(contents) > 0:
                    doc.add(lucene.Field("contents", contents,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.ANALYZED))
                    # Add the individual Fields to the document. A document carries
                    # several kinds of information (title, author, modification time,
                    # content, ...), each represented by its own Field. This example
                    # indexes three of them: the file path, the file name, and the
                    # file content.
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)  # IndexWriter.addDocument writes the document into the index directory
            except Exception, e:
                print "Failed in indexDocs:", e
def add_document(self, document):
    ''' Adds a new document in the index. '''
    doc = lucene.Document()
    try:
        # All fields are converted to string since Lucene accepts only
        # textual fields (and binary)
        doc.add(lucene.Field("id", str(document.id),
                             lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
        doc.add(lucene.Field("content", ' '.join(document.content['tokens']),
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field("author", document.author_screen_name,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
        formatted_date = lucene.SimpleDateFormat("yyyyMMddHHmmss").parse(
            str(document.date))
        doc.add(lucene.Field("date",
                             lucene.DateTools.dateToString(
                                 formatted_date,
                                 lucene.DateTools.Resolution.MINUTE),
                             lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
        self.writer.addDocument(doc)
    except Exception, e:
        print "Failed in indexDocs:", e
def indexHKForumPost(self, pid, uid, tid, title, content, floor, time):
    try:
        doc = lucene.Document()
        doc.add(lucene.NumericField("pid", 8, lucene.Field.Store.YES,
                                    True).setLongValue(long(pid)))
        doc.add(lucene.NumericField("uid", 8, lucene.Field.Store.YES,
                                    True).setLongValue(long(uid)))
        doc.add(lucene.NumericField("tid", 8, lucene.Field.Store.YES,
                                    True).setLongValue(long(tid)))
        doc.add(lucene.Field("title", title, lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field("content", content, lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.NumericField("floor", lucene.Field.Store.YES,
                                    True).setIntValue(floor))
        doc.add(lucene.NumericField("time", lucene.Field.Store.YES,
                                    True).setIntValue(time))
        self.writer.addDocument(doc)
    except Exception, e:
        print "Failed in indexHKForumPost:", e
def testAdd(self, filepath):
    # True: create a new index; False: add to the existing index incrementally
    writer = lucene.IndexWriter(self.dir, self.getAnalyzer(), False,
                                lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    file = open(filepath)
    contents = unicode(file.read(), 'gbk')
    file.close()
    doc = lucene.Document()
    doc.add(lucene.Field("name", os.path.basename(filepath),
                         lucene.Field.Store.YES,
                         lucene.Field.Index.NOT_ANALYZED))
    doc.add(lucene.Field("path", filepath,
                         lucene.Field.Store.YES,
                         lucene.Field.Index.NOT_ANALYZED))
    if len(contents) > 0:
        title = self.getTxtAttribute(contents, 'Title')
        author = self.getTxtAttribute(contents, 'Author')
        language = self.getTxtAttribute(contents, 'Language')
        doc.add(lucene.Field("Title", title,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field("Author", author,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field("Language", language,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field("contents", contents,
                             lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
    else:
        # the original printed the undefined name `filename` here
        print "warning: no content in %s" % filepath
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
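This snippet and the later one that indexes Title/Author/Language both call a getTxtAttribute helper that is not shown anywhere in the section. A minimal sketch of what it plausibly does, assuming the .txt files carry "Title: ..." style header lines; the regex and the empty-string fallback are illustrative guesses, not the original code:

import re

def getTxtAttribute(self, contents, attribute):
    # Pull the value out of a "<attribute>: value" header line,
    # e.g. "Title: Moby Dick" -> "Moby Dick" (hypothetical file format).
    m = re.search(r'^%s:\s*(.*)$' % attribute, contents, re.MULTILINE)
    return m.group(1).strip() if m else ''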
def build_index():
    f = open(data_dir)
    reader = csv.reader(f)
    print("start building the index")
    indx = 0
    writer = lucene.IndexWriter(directory, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    for line in reader:
        eng, zh = line[0], line[1]
        doc = lucene.Document()
        doc.add(lucene.Field('eng', eng,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('zh', zh,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        if indx % 100000 == 0:
            print("%sK" % (indx / 1000))
        indx += 1
    f.close()
    print("optimizing the index writer")
    writer.optimize()
    writer.close()
def indexDocs(self, root, writer):
    t = open('index.txt', "r")
    while True:
        line = t.readline()
        if line:
            try:
                line = line.strip().split()
                url = line[0]
                doc_name = line[1]
                print "adding", url
                path = os.path.join(root, doc_name)
                f = open(path)
                tmp = f.read()  # str
                f.close()
                contents = tmp.decode('gbk', 'ignore')
                soup = BeautifulSoup(contents)
                try:
                    title = soup.title.text
                    tmp = str(title).replace('\n', '')
                    title = tmp.decode('utf-8')
                except:
                    title = "None"
                print title
                collection = []  # holds each imgurl together with its description
                dic = {}
                # handle the big picture at the top left and the small pictures below it
                p_box = soup.find(id='p-box')
                #print p_box.get('id','')
                sub_p_box = p_box.div.nextSibling.nextSibling.nextSibling.nextSibling
                #print sub_p_box.get('class','')
                #print sub_p_box
                big_pic = sub_p_box.div.div.div.img
                dic['imgurl'] = urlparse.urljoin(url, big_pic.get('src', ''))
                dic['discription'] = big_pic.get('alt', '')
                #print dic
                collection.append(dic)  # avoid duplicates
                doc = lucene.Document()
                for i in collection:
                    imgurl = i['imgurl']
                    discription = i['discription']
                    discription = " ".join(jieba.cut(discription))
                    doc.add(lucene.Field("imgurl", imgurl,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    doc.add(lucene.Field("discription", discription,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.ANALYZED))
                    doc.add(lucene.Field("url", url,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    doc.add(lucene.Field("urltitle", title,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    # IndexWriter.addDocument writes the document into the index directory
                    writer.addDocument(doc)
                print "----------------------------------------------------"
            except Exception, e:
                print "Failed in indexDocs:", e
        else:
            break
def indexDocs(self, root, writer):
    f = open('q_index.txt', 'r')
    for line in f:
        qst_num = line[:-2]
        # for root, dirnames, filenames in os.walk(root):
        #     for filename in filenames:
        #         if not filename.endswith('.txt'):
        #             continue
        print "adding question", qst_num
        try:
            path = os.path.join(root, 'Question_' + qst_num, 'q.txt')
            file = open(path)
            contents_read = unicode(file.read(), 'gbk')
            file.close()
            contents = contents_read.split('\r\n|||\r\n')
            qst_name = contents[0]
            qst_detail = contents[1]
            qst_topic_blur = contents[2]
            qst_topic_accu = contents[3]
            qst_browse = contents[4]
            qst_follow = contents[5]
            qst_ans = contents[6]
            qestion = lucene.Document()
            qestion.add(lucene.Field("qst_name", qst_name,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.ANALYZED))
            qestion.add(lucene.Field("qst_detail", qst_detail,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.ANALYZED))
            qestion.add(lucene.Field("qst_topic_blur", qst_topic_blur,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.ANALYZED))
            qestion.add(lucene.Field("qst_topic_accu", qst_topic_accu,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.ANALYZED))
            qestion.add(lucene.Field("qst_browse", qst_browse,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
            qestion.add(lucene.Field("qst_follow", qst_follow,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
            qestion.add(lucene.Field("qst_ans", qst_ans,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
            qestion.add(lucene.Field("qst_num", qst_num,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
            writer.addDocument(qestion)
        except Exception, e:
            print "Failed in indexDocs:", e
def indexDocs(self, root, writer):
    f = open('a_index.txt', 'r')
    for line in f:
        line_num = line[:-2]
        qst_num = line_num.split('|||')[0]
        ans_num = line_num.split('|||')[1]
        print "adding answer", ans_num
        try:
            path = os.path.join(root, 'Question_' + qst_num, 'q.txt')
            file = open(path)
            contents_read = unicode(file.read(), 'gbk')
            file.close()
            contents = contents_read.split('\r\n|||\r\n')
            qst_name = contents[0]
            path = os.path.join(root, 'Question_' + qst_num, 'Answer',
                                ans_num + '.txt')
            file = open(path)
            contents_read = unicode(file.read(), 'gbk')
            file.close()
            contents = contents_read.split('\r\n|||\r\n')
            ans_contents = contents[0]
            if ans_contents != 'None':
                ans_author = contents[1]
                ans_like = contents[2]
                answer = lucene.Document()
                answer.add(lucene.Field("qst_name", qst_name,
                                        lucene.Field.Store.YES,
                                        lucene.Field.Index.ANALYZED))
                answer.add(lucene.Field("ans_contents", ans_contents,
                                        lucene.Field.Store.YES,
                                        lucene.Field.Index.ANALYZED))
                answer.add(lucene.Field("ans_author", ans_author,
                                        lucene.Field.Store.YES,
                                        lucene.Field.Index.NOT_ANALYZED))
                answer.add(lucene.Field("ans_like", ans_like,
                                        lucene.Field.Store.YES,
                                        lucene.Field.Index.NOT_ANALYZED))
                answer.add(lucene.Field("qst_num", qst_num,
                                        lucene.Field.Store.YES,
                                        lucene.Field.Index.NOT_ANALYZED))
                answer.add(lucene.Field("ans_num", ans_num,
                                        lucene.Field.Store.YES,
                                        lucene.Field.Index.NOT_ANALYZED))
                writer.addDocument(answer)
            else:
                print "there is no contents in answer", ans_num
        except Exception, e:
            print "Failed in indexDocs:", e
def main1():
    print "started indexing sample files......"
    direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    config = lucene.IndexWriterConfig(lucene.Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(lucene.IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = lucene.IndexWriter(direc, config)

    # fix this later..... FieldType not defined
    #field_type = lucene.FieldType()
    #field_type.setIndexed(True)
    #field_type.setStored(False)
    #field_type.setTokenized(False)

    file1 = open("nitin.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'],
                         lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data,
                         lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    file1 = open("nitin2.json")
    data = file1.read()
    contents = json.loads(data)
    doc = lucene.Document()
    field = lucene.Field("name", contents['name'],
                         lucene.Field.Store.NO,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    field = lucene.Field("data", data,
                         lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
    file1.close()

    writer.optimize()
    print "Indexed and optimized %d documents" % writer.numDocs()
    writer.close()
def indexDocs(self, root2, writer):
    root2 = unicode(root2, "utf8")
    for r, d, f in os.walk(root2):
        for dir in d:
            leibie = dir
            root3 = root2 + '\\' + dir
            for root, dirs, files in os.walk(root3):
                for filename in files:
                    if len(filename) > 180:
                        continue
                    path = os.path.join(root, filename)
                    f = open(path, 'r')
                    for lines in f:
                        lines = unicode(lines, 'utf-8')
                        start = lines.find(':')
                        kind = lines[0:start]
                        content = lines[start + 1::]
                        if kind == 'http':
                            url = 'http:' + content
                        elif kind == 'title':
                            title = content
                        elif kind == 'imgurl':
                            imgurl = content
                        elif kind == 'price':
                            price = content[1::]
                        #print lines[0:start]
                        #print lines
                    #print url, title, imgurl, price
                    f.close()
                    try:
                        doc = lucene.Document()
                        doc.add(lucene.Field("url", url,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                        doc.add(lucene.Field("title", title,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.ANALYZED))
                        doc.add(lucene.Field("imgurl", imgurl,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                        doc.add(lucene.Field("price", price,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.NOT_ANALYZED))
                        doc.add(lucene.Field("kind", leibie,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.ANALYZED))
                        writer.addDocument(doc)
                    except Exception, e:
                        print "Failed in indexDocs:", e
def indexDocs(self, root, writer):
    t = open('index.txt', "r")
    while True:
        line = t.readline()
        if line:
            try:
                line = line.strip().split()
                url = line[0]
                doc_name = line[1]
                print "adding", url
                path = os.path.join(root, doc_name)
                f = open(path)
                tmp = f.read()  # str
                f.close()
                charset = (chardet.detect(tmp))['encoding']
                if charset == None:
                    charset = 'utf-8'
                #print charset
                #contents = unicode(tmp, charset)
                contents = tmp.decode(charset, 'ignore')
                soup = BeautifulSoup(contents)
                try:
                    title = soup.title.text
                    tmp = str(title).replace('\n', '')
                    title = tmp.decode('utf-8')
                except:
                    title = "None"
                print title
                contents = soup.get_text()
                contents = " ".join(jieba.cut(contents))
                #print contents
                doc = lucene.Document()
                doc.add(lucene.Field("name", doc_name,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("path", path,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("url", url,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("title", title,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                if len(contents) > 0:
                    doc.add(lucene.Field("contents", contents,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.ANALYZED))
                else:
                    print "warning: no content in %s" % doc_name
                # IndexWriter.addDocument writes the document into the index directory
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
        else:
            break
def UpdateIndex(self):
    # refresh the index with the latest content
    writer = lucene.IndexWriter(self.indexDir, self.analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))
    try:
        # walk the rows fetched from the DB
        for row in self.rows:
            doc = lucene.Document()
            doc.add(lucene.Field("bookUrl", row[0],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            doc.add(lucene.Field("thumbUrl", row[1],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            doc.add(lucene.Field("price", row[2],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            doc.add(lucene.Field("title", row[3],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
            doc.add(lucene.Field("subTitle", row[4],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            doc.add(lucene.Field("author", row[5],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
            doc.add(lucene.Field("publisher", row[6],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
            doc.add(lucene.Field("publishDate", row[7],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NO))
            doc.add(lucene.Field("offcode", row[8],
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
            date = str(row[9]).split('-')
            date = ''.join(date)
            print 'regDate : ' + date + ' ' + str(type(date))
            doc.add(lucene.Field("regDate", date,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
            date = str(row[10]).split('-')
            date = ''.join(date)
            print 'updateDate : ' + date
            doc.add(lucene.Field("updateDate", date,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
    except Exception, e:
        print "Failed in adding index : %s" % e
        exit(1)
def indexWeibo(self, tid, text, user_id, created_at):
    try:
        doc = lucene.Document()
        doc.add(lucene.NumericField("id", 8, lucene.Field.Store.YES,
                                    True).setLongValue(long(tid)))
        doc.add(lucene.Field("text", text, lucene.Field.Store.NO,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.NumericField("user_id", lucene.Field.Store.YES,
                                    True).setIntValue(int(user_id)))
        doc.add(lucene.NumericField("created_at", lucene.Field.Store.YES,
                                    True).setIntValue(created_at))
        self.writer.addDocument(doc)
    except Exception, e:
        print "Failed in indexWeibos:", e
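Since the weibo snippet above stores created_at and user_id as trie-encoded NumericFields, reading them back with range semantics goes through NumericRangeQuery rather than a plain TermQuery. A minimal search-side sketch in the same PyLucene 3.x flat namespace; the index path and the epoch bounds are made-up values for illustration:

store = lucene.SimpleFSDirectory(lucene.File("weibo-index"))  # hypothetical path
searcher = lucene.IndexSearcher(store, True)  # read-only
# match weibos whose created_at falls in [start, end], both ends inclusive
start, end = 1262304000, 1293840000  # example epoch seconds
query = lucene.NumericRangeQuery.newIntRange("created_at", start, end, True, True)
hits = searcher.search(query, 10)
print "total hits:", hits.totalHits
searcher.close()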
def build_index(data, num_doc):
    for word, page_num in data.items():
        doc = lucene.Document()
        doc.add(lucene.Field('word', word,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('page_num', page_num,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        num_doc += 1
    return num_doc
def indexDocs(self, root, writer, startDate, endDate):
    doc_num = 0
    docindex_num = 0
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            doc_num += 1
            if doc_num % 1000 == 0:
                print "Index Searched " + str(doc_num) + " files..."
            if not filename.endswith('.txt'):
                continue
            filedate = datetime.strptime(filename[:8], '%Y%m%d')
            if not (filedate >= startDate and filedate <= endDate):
                continue
            #print "adding", filename
            docindex_num += 1
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'utf8')
                file.close()
                # strip newlines, then split the text into sentences on the
                # Chinese full stop
                #print "contents:" + contents.encode('utf8')
                contents = contents.replace('\n', '')
                contents = contents.replace(unicode("。", 'utf8'), '###')
                sentence_num = 0
                for sentence in contents.split('###'):
                    #print "sentence:" + sentence.encode("gbk", 'ignore')
                    #time.sleep(1)
                    doc = lucene.Document()
                    doc.add(lucene.Field("name", filename,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    doc.add(lucene.Field("path", path,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    doc.add(lucene.Field("sentence_num", str(sentence_num),
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.NOT_ANALYZED))
                    if len(sentence) > 0:
                        doc.add(lucene.Field("sentence", sentence,
                                             lucene.Field.Store.YES,
                                             lucene.Field.Index.ANALYZED))
                    else:
                        # the format arguments must be a tuple, not two bare names
                        print "warning: no content in sentence %d of file %s" % (sentence_num, filename)
                    writer.addDocument(doc)
                    sentence_num += 1
            except Exception, e:
                #print "Failed in indexDocs:", e
                error = 1
def store_terms(self, session, index, terms, rec):
    strm = C3TokenStream(terms)
    if rec != self.currRec:
        if self.currDoc:
            # write out the document for the previous record
            self.writer.addDocument(self.currDoc)
        doc = lucene.Document()
        self.currDoc = doc
        doc.add(lucene.Field(index.id, strm))
        doc.add(lucene.Field('id', str(rec),
                             lucene.Field.Store.YES,
                             lucene.Field.Index.UN_TOKENIZED))
    else:
        # same record as the last call: keep accumulating fields on the
        # current document (the original referenced an unbound `doc` here)
        self.currDoc.add(lucene.Field(index.id, strm))
def index_files(board, time_delta):
    store = lucene.SimpleFSDirectory(
        lucene.File(BOARDSPATH + board + '/' + RECENT_INDEX))
    writer = lucene.IndexWriter(
        store, lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT), True,
        lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    # writer.setMaxFieldLength(1048576)  # 1MB

    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue
        f = open(path, 'r')
        contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            f.close()
            debug(filename)
            continue
        f.close()
        if len(contents) > 0:
            doc = lucene.Document()
            doc.add(lucene.Field("name", filename,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("owner", owner,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("title", title,
                                 lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("contents", contents,
                                 lucene.Field.Store.NO,
                                 lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = file.read().decode('utf8', 'ignore')
                file.close()
                doc = lucene.Document()
                doc.add(lucene.Field("name", filename,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                doc.add(lucene.Field("path", path,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NOT_ANALYZED))
                if len(contents) > 0:
                    title = self.getTxtAttribute(contents, 'Title')
                    author = self.getTxtAttribute(contents, 'Author')
                    language = self.getTxtAttribute(contents, 'Language')
                    doc.add(lucene.Field("title", title,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.ANALYZED))
                    doc.add(lucene.Field("author", author,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.ANALYZED))
                    doc.add(lucene.Field("language", language,
                                         lucene.Field.Store.YES,
                                         lucene.Field.Index.ANALYZED))
                    doc.add(lucene.Field("contents", contents,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.ANALYZED))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
def get_word_list(text, is_list=False, field_name='fieldname'):
    if is_list:
        new_text = ""
        for i in text:
            new_text += i + "\n"
        text = new_text

    lucene.initVM(lucene.CLASSPATH)
    analyzer = lucene.KoreanAnalyzer()
    #directory = lucene.FSDirectory.open("/tmp/testindex")
    directory = lucene.RAMDirectory()

    # writer
    writer = lucene.IndexWriter(directory, analyzer)
    doc = lucene.Document()
    doc.add(lucene.Field(field_name, text,
                         lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    # enumerate all terms in the index
    ireader = lucene.IndexReader.open(directory, False)
    term = lucene.Term(field_name, '')
    termenum = ireader.terms(term)
    term = termenum.term()
    i = 0
    word_list = []
    while term and term.field() == field_name:
        i += 1
        termDocs = ireader.termDocs(term)
        termDocs.next()
        #print "[%04d]===> <%s> " % (i, term.text())
        #print term.text() + " : " + str(termDocs.freq())
        word_list.append({'text': term.text(), 'freq': termDocs.freq()})
        term = termenum.next() and termenum.term()
    ireader.close()
    directory.close()
    return word_list
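A possible invocation of get_word_list, assuming a PyLucene build that bundles the KoreanAnalyzer used above (the input sentence is arbitrary):

words = get_word_list(u"the quick brown fox jumps over the lazy dog")
for w in words:
    # each entry is {'text': <term>, 'freq': <term frequency in the document>}
    print "%s : %d" % (w['text'], w['freq'])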
def IndexCreate(fileDir, indexDir):
    analyzer = lucene.StandardAnalyzer()  # create the objects Lucene needs
    store = lucene.FSDirectory.getDirectory(indexDir)
    writer = lucene.IndexWriter(store, analyzer)
    for root, dirnames, filenames in os.walk(fileDir):
        # index only the text files under the given folder
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print("Adding: %s" % filename)
            try:
                path = os.path.join(root, filename)
                f = open(path)
                content = f.read()
                f.close()
                # convert the encoding from cp949 to utf-8
                content = content.decode('cp949').encode('utf-8')
                doc = lucene.Document()  # build a Document object
                doc.add(lucene.Field("name",  # file name
                                     filename,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NO))
                doc.add(lucene.Field("path",  # file path
                                     path,
                                     lucene.Field.Store.YES,
                                     lucene.Field.Index.NO))
                if len(content) > 0:
                    doc.add(lucene.Field("content",  # file content
                                         content,
                                         lucene.Field.Store.NO,
                                         lucene.Field.Index.TOKENIZED))
                else:
                    print("Warning: No contents in %s" % filename)
                writer.addDocument(doc)  # add the Document to the index
            except Exception, e:
                print("Failed in adding index: %s" % e)
def UpdateIndex(self):
    "Refresh the index with the latest content."
    # time of the last indexing run (None means nothing has been indexed yet)
    self.lastIndexingTime = self.__ReadLatestUpdateTime()
    writer = lucene.IndexWriter(self.store, self.analyzer,
                                lucene.IndexWriter.MaxFieldLength(1048576))
    for root, dirnames, filenames in os.walk(self.blogDir):
        for filename in filenames:
            if not filename.endswith('.txt'):  # skip anything that is not a txt file
                continue
            path = os.path.join(root, filename)
            if (self.lastIndexingTime != None and
                    self.lastIndexingTime >= int(os.stat(path).st_mtime)):
                continue  # this data is already in the index
            print("Adding: %s" % filename)
            try:
                f = open(path)
                content = f.read()
                f.close()
                doc = lucene.Document()
                doc.add(lucene.Field(
                    "bloger",
                    path.rsplit("\\", 2)[1],  # use the directory holding the file as the blogger name
                    lucene.Field.Store.YES,
                    lucene.Field.Index.UN_TOKENIZED))
                doc.add(lucene.Field(
                    "path", path,
                    lucene.Field.Store.YES,
                    lucene.Field.Index.UN_TOKENIZED))
                doc.add(lucene.Field(
                    "contents", content,
                    lucene.Field.Store.NO,
                    lucene.Field.Index.TOKENIZED))
                writer.addDocument(doc)
            except Exception, e:
                print("Failed in adding index: %s" % e)
def index_review_request(self, writer, request):
    if lucene_is_2x:
        lucene_tokenized = lucene.Field.Index.TOKENIZED
        lucene_un_tokenized = lucene.Field.Index.UN_TOKENIZED
    elif lucene_is_3x:
        lucene_tokenized = lucene.Field.Index.ANALYZED
        lucene_un_tokenized = lucene.Field.Index.NOT_ANALYZED
    else:
        assert False

    # There are several fields we want to make available to users.
    # We index them individually, but also create a big hunk of text
    # to use for the default field, so people can just type in a
    # string and get results.
    doc = lucene.Document()
    doc.add(lucene.Field('id', str(request.id),
                         lucene.Field.Store.YES,
                         lucene.Field.Index.NO))
    doc.add(lucene.Field('summary', request.summary,
                         lucene.Field.Store.NO,
                         lucene_tokenized))
    if request.changenum:
        doc.add(lucene.Field('changenum',
                             unicode(request.changenum),
                             lucene.Field.Store.NO,
                             lucene_tokenized))
    # Remove commas, since lucene won't tokenize it right with them
    bugs = ' '.join(request.bugs_closed.split(','))
    doc.add(lucene.Field('bug', bugs,
                         lucene.Field.Store.NO,
                         lucene_tokenized))
    name = ' '.join([request.submitter.username,
                     request.submitter.get_full_name()])
    doc.add(lucene.Field('author', name,
                         lucene.Field.Store.NO,
                         lucene_tokenized))
    doc.add(lucene.Field('username', request.submitter.username,
                         lucene.Field.Store.NO,
                         lucene_un_tokenized))

    # FIXME: index reviews
    # FIXME: index dates

    files = []
    if request.diffset_history:
        for diffset in request.diffset_history.diffsets.all():
            for filediff in diffset.files.all():
                if filediff.source_file:
                    files.append(filediff.source_file)
                if filediff.dest_file:
                    files.append(filediff.dest_file)
    aggregate_files = '\n'.join(set(files))
    # FIXME: this tokenization doesn't let people search for files
    # in a really natural way. It'll split on '/' which handles the
    # majority case, but it'd be nice to be able to drill down
    # (main.cc, vmuiLinux/main.cc, and player/linux/main.cc)
    doc.add(lucene.Field('file', aggregate_files,
                         lucene.Field.Store.NO,
                         lucene_tokenized))

    text = '\n'.join([request.summary,
                      request.description,
                      unicode(request.changenum),
                      request.testing_done,
                      bugs,
                      name,
                      aggregate_files])
    doc.add(lucene.Field('text', text,
                         lucene.Field.Store.NO,
                         lucene_tokenized))
    writer.addDocument(doc)
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Iterates over our list of ontology files and creates an index for each file.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # Handle a little bit of lucene setup
    filename, _ext = os.path.splitext(os.path.basename(oboFile))
    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index same file twice or index two files named the same')

    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(dir, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue
        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id,
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        add_field_to_document(doc, "name", term.name,
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)
        # Frequently the definition text contains URLs or other hyperlinks that
        # could produce query hits we do not want, errantly increasing the score
        # of the field. We strip out these hyperlinks and index just the text.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)
        # Synonyms, relationships, xrefs, subsets, and alternate IDs are all
        # represented as lists in our Ontology object and need to be entered
        # in one at a time
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)
        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "xref",
            [replace_xref_identifier(x, xref_map) for x in term.xrefs],
            lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "relationship",
            [" ".join(list(x)) for x in list(term.relationships)],
            lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)

    writer.optimize()
    writer.close()
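The ontology indexer above leans on add_field_to_document and add_fields_to_document helpers that are not included in this section. A hedged reconstruction inferred from the call sites; the optional trailing boost argument and the Field.setBoost call are assumptions, not the original code:

def add_field_to_document(doc, name, value, store, index, boost=None):
    field = lucene.Field(name, value or "", store, index)
    if boost is not None:
        field.setBoost(boost)  # per-field scoring boost (Lucene 3.x API)
    doc.add(field)

def add_fields_to_document(doc, name, values, store, index, boost=None):
    # one Field per value; Lucene treats repeated field names as multi-valued
    for value in values:
        add_field_to_document(doc, name, value, store, index, boost)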
def addMessage(self, username, xprotocol, xfriend_chat, who_sent,
               timestamp, text):
    # Clean up protocol and friend_chat fields
    """ For some unknown reason, PyLucene (and probably Lucene as well)
    seems to have problems searching for things like SoAndSo but has no
    problems searching for soandso. To prevent headaches in the future we
    simply set it all to lowercase since the case does not matter for
    these fields. """
    protocol = xprotocol.lower()
    friend_chat = xfriend_chat.lower()

    # Determine index and data paths
    index_dir = self.indexdir + username
    data_dir = self.datadir + username + PATH_SEP + protocol + PATH_SEP
    data_file = data_dir + friend_chat

    # if the index doesn't exist, we use a special constructor to create it
    if os.path.isdir(index_dir) == False:
        os.makedirs(index_dir)
        luc_index = lucene.FSDirectory.getDirectory(index_dir, True)
        luc_writer = lucene.IndexWriter(luc_index,
                                        lucene.StandardAnalyzer(), True)
    else:
        luc_index = lucene.FSDirectory.getDirectory(index_dir)
        luc_writer = lucene.IndexWriter(luc_index, lucene.StandardAnalyzer())
    # Opening the index before writing to the file gives us a lock
    # on the index. As long as writing to data files occurs only
    # through this function, this is guaranteed to be an atomic
    # operation. Closing the writer releases the lock.

    if os.path.isdir(data_dir) == False:
        os.makedirs(data_dir)
    # filesize is used to determine the file offset
    if os.path.isfile(data_file) == False:
        filesize = 0
    else:
        filesize = os.path.getsize(data_file)

    datahandle = open(data_file, 'a')
    datahandle.write(str(who_sent))
    datahandle.write("\n")
    datahandle.write(str(timestamp))
    datahandle.write("\n")
    datahandle.write(str(len(str(text))))  # what a mess
    datahandle.write("\n")
    datahandle.write(str(text))
    datahandle.write("\n")

    doc = lucene.Document()
    doc.add(self.__makeKeywordField('protocol', str(protocol)))
    doc.add(self.__makeKeywordField('friend_chat', str(friend_chat)))
    clean_timestamp = self.__padTimestamp(timestamp)
    doc.add(self.__makeKeywordField('timestamp', clean_timestamp))
    doc.add(self.__makeKeywordField('who_sent', str(who_sent)))
    doc.add(self.__makeUnIndexedField('file_offset', str(filesize)))
    clean_text = re.sub("<[^>]*>", " ", str(text))
    doc.add(self.__makeUnStoredField('text', clean_text))
    luc_writer.addDocument(doc)
    luc_writer.close()
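The __makeKeywordField / __makeUnIndexedField / __makeUnStoredField helpers are private to the class and not shown. Plausible sketches that mirror the classic Lucene field archetypes (Keyword, UnIndexed, UnStored) in the 2.x-era API this snippet uses; treat them as assumptions, not the original code:

def __makeKeywordField(self, name, value):
    # stored and indexed as a single token
    return lucene.Field(name, value, lucene.Field.Store.YES,
                        lucene.Field.Index.UN_TOKENIZED)

def __makeUnIndexedField(self, name, value):
    # stored only, never searched
    return lucene.Field(name, value, lucene.Field.Store.YES,
                        lucene.Field.Index.NO)

def __makeUnStoredField(self, name, value):
    # tokenized and indexed, but not stored
    return lucene.Field(name, value, lucene.Field.Store.NO,
                        lucene.Field.Index.TOKENIZED)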
lucene.initVM()
print("lucene version is:", lucene.VERSION)

# Get the analyzer
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

# Get index storage
store = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR))

# Get index writer
writer = lucene.IndexWriter(store, analyzer, True,
                            lucene.IndexWriter.MaxFieldLength.LIMITED)
try:
    # create a document to be added to the index
    doc = lucene.Document()
    # Add a field to this document (the original omitted the field value
    # argument, which Field requires; "India" is a placeholder string)
    field = lucene.Field("titlendia", "India",
                         lucene.Field.Store.YES,
                         lucene.Field.Index.ANALYZED)
    # Add this field to the document
    doc.add(field)
    # Add the document to the index
    writer.addDocument(doc)
except Exception as e:
    print("Failed in indexDocs:", e)
writer.close()  # release the index write lock
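To read the single document back out, a minimal follow-up sketch against the same store and analyzer (the query string and hit count are illustrative; QueryParser and IndexSearcher are the Lucene 3.x flat-namespace classes):

searcher = lucene.IndexSearcher(store, True)
query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, "titlendia",
                           analyzer).parse("india")
hits = searcher.search(query, 10)
print("total hits: %d" % hits.totalHits)
for hit in hits.scoreDocs:
    doc = searcher.doc(hit.doc)
    print(doc.get("titlendia"))
searcher.close()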