def make_query(keywords):
    """Build a xapian query from a space-separated keyword string.

    The input is split on single spaces; every piece yields one sub-query
    and all sub-queries are combined with ``OP_AND``.

    * A piece wrapped in double quotes (``"foo"``, more than two characters
      long) is used literally: the quotes are stripped and the remainder
      becomes a single term query.
    * Any other piece is passed through ``seg_txt_2_dict``.  The piece
      itself is always queried as a term; if segmentation produced words
      different from the piece, the result is
      ``piece OR (word1 AND word2 AND ...)``.

    :param keywords: search string; ``unicode`` input is encoded to UTF-8
        (characters that cannot be encoded are dropped).
    :returns: a ``xapian.Query``.
    """
    # isinstance (rather than ``type(...) is``) also covers unicode subclasses.
    if isinstance(keywords, unicode):
        keywords = keywords.encode('utf-8', 'ignore')

    and_query_list = []
    for keyword in keywords.split(' '):
        is_quoted = (len(keyword) > 2
                     and keyword.startswith('"')
                     and keyword.endswith('"'))
        if is_quoted:
            and_query_list.append(xapian.Query(keyword[1:-1], 1))
            continue

        # Only the segmented words are needed, not their values.
        parts = [xapian.Query(word, 1)
                 for word in seg_txt_2_dict(keyword)
                 if word != keyword]
        whole = xapian.Query(keyword, 1)
        if not parts:
            and_query_list.append(whole)
            continue

        if len(parts) > 1:
            segmented = xapian.Query(xapian.Query.OP_AND, parts)
        else:
            segmented = parts[0]
        and_query_list.append(
            xapian.Query(xapian.Query.OP_OR, [whole, segmented]))

    # ``split`` always yields at least one piece, so the list is never empty.
    if len(and_query_list) > 1:
        return xapian.Query(xapian.Query.OP_AND, and_query_list)
    return and_query_list[0]
def update_index(self, id, text=None, values=None, data=None):
    """Update an already indexed document in place.

    Only the parts that are passed (and truthy) are replaced; everything
    else on the stored document is left untouched.

    :param id: id of the document to replace.
    :param text: new text; when given, all existing terms are cleared and
        the words returned by ``seg_txt_2_dict(text)`` are added as terms.
    :param values: mapping of value key -> value; when given, all existing
        values are cleared before the new ones are added.
    :param data: new document data.
    :returns: ``True`` when the document was replaced, ``False`` when it
        could not be fetched or the replace failed.
    """
    # ``except Exception`` keeps the "return False on failure" contract but
    # no longer swallows KeyboardInterrupt / SystemExit like a bare except.
    try:
        doc = self.get_document(id)
    except Exception:
        return False

    if text:
        doc.clear_terms()  # drop the old terms before re-indexing
        for word in seg_txt_2_dict(text):
            doc.add_term(word)

    if values:
        doc.clear_values()
        for key, value in values.iteritems():
            doc.add_value(key, value)

    if data:
        doc.set_data(data)

    try:
        self.db.replace_document(id, doc)
    except Exception:
        return False
    return True
def index(self, id, text, values=None, data=''):
    """Index (or re-index) one document in xapian.

    :param id: id of the document; passed to ``replace_document``.
    :param text: searchable content, UTF-8 encoded; the words returned by
        ``seg_txt_2_dict(text)`` are added as terms.
    :param values: optional mapping of value key -> value.  Values are
        stored for sorting; the key apparently has to be a number.
    :param data: optional document data; only stored when truthy.
    :returns: ``True`` on success, ``False`` when the replace failed.
    """
    doc = xapian.Document()
    # The per-term debug print was removed: it wrote every term to stdout.
    for word in seg_txt_2_dict(text):
        doc.add_term(word)

    # ``None`` replaces the former mutable ``{}`` default.
    for key, value in (values or {}).iteritems():
        doc.add_value(key, value)

    if data:
        doc.set_data(data)

    # ``except Exception`` instead of a bare except, so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        self.db.replace_document(id, doc)
    except Exception:
        return False
    return True
def search(keywords, offset=0, limit=35, enquire=SEARCH_ENQUIRE):
    """Run a keyword search and return the matching document ids.

    ``keywords`` is passed through ``seg_txt_2_dict``; every (word, value)
    pair becomes ``xapian.Query(word, value)`` and the term queries are
    combined with ``OP_AND`` (a single term is used as-is).

    :param keywords: search string handed to ``seg_txt_2_dict``.
    :param offset: first argument to ``enquire.get_mset``.
    :param limit: second argument to ``enquire.get_mset``.
    :param enquire: enquire object to run the query on.
    :returns: list of docids ordered by descending ``rank``.
    """
    # All leftover debugging (pdb import, print / dir() dumps of every
    # match) was removed; the returned list is unchanged.
    query_list = [xapian.Query(word, value)
                  for word, value in seg_txt_2_dict(keywords).iteritems()]
    if len(query_list) != 1:
        query = xapian.Query(xapian.Query.OP_AND, query_list)
    else:
        query = query_list[0]

    enquire.set_query(query)
    matches = enquire.get_mset(offset, limit, None)

    rank_by_docid = {}
    for m in matches:
        rank_by_docid[m.docid] = m.rank

    # NOTE(review): sorting by rank and then reversing puts the
    # highest-numbered rank first -- confirm this ordering is intended.
    ids = sorted(rank_by_docid, key=rank_by_docid.get)
    ids.reverse()
    return ids
def search(self, keywords, start_offset=0, end_offset=None):
    """Search the index and return the decoded documents.

    ``unicode`` keywords are encoded to UTF-8 and passed through
    ``seg_txt_2_dict``; each (word, value) pair becomes
    ``xapian.Query(word, value)`` and the term queries are combined with
    ``OP_OR`` (a single term is used as-is).

    :param keywords: search string.
    :param start_offset: first argument to ``self._get_enquire_mset``.
    :param end_offset: second argument to ``self._get_enquire_mset``; when
        falsy it defaults to the document count minus ``start_offset``.
    :returns: ``{'count': <hit count>, 'object_list': [<decoded JSON>]}``.
    """
    if isinstance(keywords, unicode):
        keywords = keywords.encode('utf8')

    segments = seg_txt_2_dict(keywords)
    sub_queries = [xapian.Query(term, wqf)
                   for term, wqf in segments.iteritems()]
    if len(sub_queries) == 1:
        combined = sub_queries[0]
    else:
        combined = xapian.Query(xapian.Query.OP_OR, sub_queries)
    self.SEARCH_ENQUIRE.set_query(combined)

    total_docs = self.SEARCH_DB.get_doccount()
    if not end_offset:
        end_offset = total_docs - start_offset

    object_list = []
    for hit in self._get_enquire_mset(start_offset, end_offset):
        raw = self._get_document_data(hit.document)
        # Document data is stored as JSON text.
        object_list.append(simplejson.loads(raw, encoding='utf8'))

    return {'count': self._get_hit_count(), 'object_list': object_list}
def index_txt(id, txt):
    """Index ``txt`` under the unique key term ``":<id>"``.

    Every (word, value) pair from ``seg_txt_2_dict(txt)`` is added as a
    term with ``value`` as the second ``add_term`` argument.  The key term
    is added to the document as well and is also the first argument to
    ``SEARCH_DB.replace_document``.

    :param id: identifier of the text; formatted into the key term.
    :param txt: text to segment and index.
    """
    doc = xapian.Document()
    for word, value in seg_txt_2_dict(txt).iteritems():
        # xapian rejects empty term names, so skip them instead of failing
        # the whole document.
        if word:
            doc.add_term(word, value)

    key = ":%s" % id
    doc.add_term(key)
    SEARCH_DB.replace_document(key, doc)
def __getTop(self, content, num):
    """Return the highest-valued entries of ``seg_txt_2_dict(content)``.

    ``content`` is converted with ``str()``, decoded as GBK and re-encoded
    as UTF-8 before segmentation; the keys of the result are converted
    back from UTF-8 to GBK.

    :param content: text to analyse (treated as GBK encoded).
    :param num: controls how many entries are kept (see note below).
    :returns: dict mapping GBK-encoded word -> value; empty when the
        stringified content is empty (a warning is logged in that case).
    """
    content = str(content)
    if not content:
        wlog.warning("Can not splite empty String.")
        return {}

    weights = seg_txt_2_dict(content.decode("gbk").encode("utf-8"))
    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)

    # NOTE(review): the slice keeps num + 1 entries, not num -- confirm
    # whether callers rely on the extra one.
    top = {}
    for word, weight in ranked[:num + 1]:
        top[word.decode("utf-8").encode("gbk")] = weight
    return top
def index_txt(tid, txt):
    """Index ``txt`` under the unique key term ``":<tid>"``.

    Every non-empty word from ``seg_txt_2_dict(txt)`` is added as a term
    with its value as the second ``add_term`` argument.  The key term is
    added to the document as well and is also the first argument to
    ``SEARCH_DB.replace_document``.

    :param tid: identifier of the text; formatted into the key term.
    :param txt: text to segment and index.
    """
    doc = xapian.Document()
    for word, value in seg_txt_2_dict(txt).iteritems():
        if word:  # empty words are skipped
            doc.add_term(word, value)

    # ``%s`` already stringifies, so the explicit str() was redundant.
    key = ":%s" % tid
    doc.add_term(key)
    # The debug ``print dir(doc)`` that ran on every call was removed.
    SEARCH_DB.replace_document(key, doc)
def index(msg):
    """Add one JSON message to ``MASTER_DB``, indexed by its title.

    The raw ``msg`` string is stored as the document data.  The ``"title"``
    field of the decoded message is UTF-8 encoded and passed through
    ``seg_txt_2_dict``; each (word, value) pair is added as a term.

    :param msg: JSON text of the message.
    """
    # NOTE(review): a message without a "title" key makes ``.get`` return
    # None and the ``.encode`` call below fail -- confirm callers always
    # supply a title.
    title = json.loads(msg).get("title")

    document = xapian.Document()
    document.set_data(msg)

    title_terms = seg_txt_2_dict(title.encode("utf-8"))
    for term, wdf in title_terms.iteritems():
        document.add_term(term, wdf)

    MASTER_DB.add_document(document)
def search(keywords, offset=0, limit=35, enquire=SEARCH_ENQUIRE):
    """Run a keyword search and return the xapian match set.

    ``keywords`` is passed through ``seg_txt_2_dict``; every (word, value)
    pair becomes ``xapian.Query(word, value)`` and the term queries are
    combined with ``OP_AND`` (a single term is used as-is).

    :param keywords: search string handed to ``seg_txt_2_dict``.
    :param offset: first argument to ``enquire.get_mset``.
    :param limit: second argument to ``enquire.get_mset``.
    :param enquire: enquire object to run the query on.
    :returns: the object returned by ``enquire.get_mset``.
    """
    # The debug ``print word`` inside the loop was removed.
    query_list = [xapian.Query(word, value)
                  for word, value in seg_txt_2_dict(keywords).iteritems()]
    if len(query_list) != 1:
        query = xapian.Query(xapian.Query.OP_AND, query_list)
    else:
        query = query_list[0]

    enquire.set_query(query)
    return enquire.get_mset(offset, limit, None)
def _index_text(self, doc, termgenerator):
    """Index the contents of the file named ``text`` into ``doc``.

    The file is read from the current working directory.  The first
    100 KiB are passed to ``guess_language``: for ``'chinese'`` every
    non-empty word from ``seg_txt_2_dict`` is added to ``doc`` as a term,
    otherwise the whole text is handed to ``termgenerator.index_text``.

    :param doc: document that receives the segmented terms.
    :param termgenerator: term generator used for non-Chinese text.
    :returns: ``None``; an unreadable file is logged and skipped.
    """
    try:
        # ``with`` closes the handle; the original never closed it.
        with open('text') as source:
            text = source.read()
    except IOError as err:
        logger.error(str(err))
        return

    lang = guess_language(text[:1024 * 100])
    logger.debug('lanuage is %s' % lang)

    if lang == 'chinese':
        for word, value in seg_txt_2_dict(text).iteritems():
            if word:
                doc.add_term(word, value)
    else:
        termgenerator.index_text(text)
def _index_text(self, doc, termgenerator):
    """Index the contents of the file named ``text`` into ``doc``.

    The file is read from the current working directory.  The first
    100 KiB are passed to ``guess_language``: for ``'chinese'`` every
    non-empty word from ``seg_txt_2_dict`` is added to ``doc`` as a term,
    otherwise the whole text is handed to ``termgenerator.index_text``.

    :param doc: document that receives the segmented terms.
    :param termgenerator: term generator used for non-Chinese text.
    :returns: ``None``; an unreadable file is logged and skipped.
    """
    try:
        # ``with`` closes the handle; the original never closed it.
        with open('text') as source:
            text = source.read()
    except IOError as err:
        logger.error(str(err))
        return

    lang = guess_language(text[:1024 * 100])
    logger.debug('lanuage is %s' % lang)

    if lang == 'chinese':
        for word, value in seg_txt_2_dict(text).iteritems():
            if word:
                doc.add_term(word, value)
    else:
        termgenerator.index_text(text)
def make_query(keywords):
    """Build a xapian query from a space-separated keyword string.

    The input is split on single spaces; every piece yields one sub-query
    and all sub-queries are combined with ``OP_AND``.

    * A piece wrapped in double quotes (``"foo"``, more than two characters
      long) is used literally: the quotes are stripped and the remainder
      becomes a single term query.
    * Any other piece is passed through ``seg_txt_2_dict``.  The piece
      itself is always queried as a term; if segmentation produced words
      different from the piece, the result is
      ``piece OR (word1 AND word2 AND ...)``.

    :param keywords: search string; ``unicode`` input is encoded to UTF-8
        (characters that cannot be encoded are dropped).
    :returns: a ``xapian.Query``.
    """
    # isinstance (rather than ``type(...) is``) also covers unicode subclasses.
    if isinstance(keywords, unicode):
        keywords = keywords.encode('utf-8', 'ignore')

    and_query_list = []
    for keyword in keywords.split(' '):
        is_quoted = (len(keyword) > 2
                     and keyword.startswith('"')
                     and keyword.endswith('"'))
        if is_quoted:
            and_query_list.append(xapian.Query(keyword[1:-1], 1))
            continue

        # Only the segmented words are needed, not their values.
        parts = [xapian.Query(word, 1)
                 for word in seg_txt_2_dict(keyword)
                 if word != keyword]
        whole = xapian.Query(keyword, 1)
        if not parts:
            and_query_list.append(whole)
            continue

        if len(parts) > 1:
            segmented = xapian.Query(xapian.Query.OP_AND, parts)
        else:
            segmented = parts[0]
        and_query_list.append(
            xapian.Query(xapian.Query.OP_OR, [whole, segmented]))

    # ``split`` always yields at least one piece, so the list is never empty.
    if len(and_query_list) > 1:
        return xapian.Query(xapian.Query.OP_AND, and_query_list)
    return and_query_list[0]
def search(self, keywords, offset=0, limit=10):
    """Search xapian for documents containing all segmented keywords.

    ``keywords`` is passed through ``seg_txt_2_dict``; every word becomes a
    term query and the term queries are combined with ``OP_AND`` (a single
    term is used as-is).

    :param keywords: search keywords; ``unicode`` is encoded to UTF-8,
        byte strings are used unchanged.
    :param offset: first argument to ``self.enquire.get_mset`` (start
        position).
    :param limit: second argument to ``self.enquire.get_mset``.
    :returns: the matches object returned by ``get_mset``.
    """
    # Encoding unconditionally made Python 2 implicitly ASCII-decode byte
    # strings first, which raised UnicodeDecodeError on non-ASCII input.
    if isinstance(keywords, unicode):
        keywords = keywords.encode('utf-8')

    # Only the words are used here, not their values.
    query_list = [xapian.Query(word) for word in seg_txt_2_dict(keywords)]
    if len(query_list) != 1:
        query = xapian.Query(xapian.Query.OP_AND, query_list)
    else:
        query = query_list[0]

    self.enquire.set_query(query)
    return self.enquire.get_mset(offset, limit, 10000)
def search_software(self, keyword):
    """Search the software database and return matching package names.

    The keyword is segmented with MMSEG (``seg_txt_2_dict``) and the
    resulting term queries are combined with ``OP_AND``.  If that fails for
    any reason (typically because MMSEG is not installed) a notice is
    printed and the query is taken from
    ``self.db.get_query_list_from_search_entry`` (element ``[1]``).

    :param keyword: search text; converted with ``str()``.
    :returns: list of unique package names in match order.  The name is
        read from the ``XapianValues.PKGNAME`` value, falling back to the
        document data; documents with neither are skipped.
    """
    try:
        from mmseg.search import seg_txt_2_dict
        query_string = str(keyword)
        enquire = xapian.Enquire(self.db.xapiandb)
        query_list = [
            xapian.Query(word, value)
            for word, value in seg_txt_2_dict(query_string).iteritems()
        ]
        if len(query_list) != 1:
            query = xapian.Query(xapian.Query.OP_AND, query_list)
        else:
            query = query_list[0]
    except Exception:
        # Best-effort fallback, kept deliberately broad: any failure in the
        # MMSEG path falls back to the database's own query.  ``Exception``
        # (not a bare except) lets KeyboardInterrupt / SystemExit through.
        Info = (
            " ********************************************************* "
            "There is no Chinese Segmentation method MMSEG in your system."
            "For better useing of ubuntu-kylin-software-center, "
            "please install chinese Segmentation method MMSEG . "
            "********************************************************* "
        )
        print(Info)
        query_string = self.db.get_query_list_from_search_entry(str(keyword))
        enquire = xapian.Enquire(self.db.xapiandb)
        query = query_string[1]

    enquire.set_query(query)
    matches = enquire.get_mset(0, len(self.db))

    pkgnamelist = []
    seen = set()  # O(1) duplicate check instead of list.index()
    for m in matches:
        doc = m.document
        pkgname = doc.get_value(XapianValues.PKGNAME) or doc.get_data()
        if pkgname and pkgname not in seen:
            seen.add(pkgname)
            pkgnamelist.append(pkgname)
    return pkgnamelist
def update_xapiandb(self, kwargs):
    """Bring the local xapian database up to date.

    The mode is selected by ``kwargs["pkgname"]``:

    * empty string -- batch mode.  The last update time is read from the
      version marker document (value slot 2), the apps returned by
      ``self.premoter.newerapp_for_xapianupdate`` for that time are
      re-indexed (existing documents) or added (new documents), and the
      marker is stamped with the current time if anything changed.
    * anything else -- single package mode.  A document for the package is
      added, indexed by its name and the description of the deb file at
      ``kwargs["path"]``, unless a document with that name already exists.

    :param kwargs: dict with ``"pkgname"`` and, in single package mode,
        ``"path"``.
    :returns: ``None``.
    """
    database = xapian.WritableDatabase(XAPIAN_DB_PATH, xapian.DB_OPEN)
    DB = xapian.Database(XAPIAN_DB_PATH)
    enquire = xapian.Enquire(database)
    indexer = xapian.TermGenerator()

    def _decoded(raw):
        """Return *raw* as text, decoding UTF-8 when it is bytes."""
        if isinstance(raw, bytes):
            return raw.decode(encoding='utf-8')
        return raw

    def _index_app_terms(doc, app_name, display_name_cn, keywords_for_search):
        """Add name, free-text and (when available) mmseg terms to *doc*."""
        doc.add_term(app_name, 10)
        indexer.set_document(doc)
        if keywords_for_search != "None":
            keywords = display_name_cn + ";" + keywords_for_search + ";" + app_name
        else:
            keywords = display_name_cn + ";" + app_name
        indexer.index_text(keywords, 10)
        # The import lives here so both the modify and the add path get the
        # segmented terms.  Before, only the modify path imported it and
        # the add path silently swallowed the resulting UnboundLocalError.
        try:
            from mmseg.search import seg_txt_2_dict
            for word in seg_txt_2_dict(keywords):
                if word != "none":
                    doc.add_term(word, 10)
        except Exception:
            # Best effort: mmseg is optional.
            if (Globals.DEBUG_SWITCH):
                print("----No mmseg model---")

    if "" == kwargs["pkgname"]:
        modified_num = 0
        add_num = 0
        updated = False

        # Stay None when the version marker document is missing.
        docid_for_xapiandb_version = None
        doc_for_xapiandb_version = None
        the_latest_update_time = None

        enquire.set_query(xapian.Query("the_#ukxapiandb#_version"))
        for match in enquire.get_mset(0, 1):
            doc_for_xapiandb_version = match.document
            docid_for_xapiandb_version = doc_for_xapiandb_version.get_docid()
            if "XAPIANDB_VERSION" == _decoded(doc_for_xapiandb_version.get_data()):
                # value slot 2: xapiandb update time
                the_latest_update_time = _decoded(
                    doc_for_xapiandb_version.get_value(2))

        if the_latest_update_time is None:
            # Marker missing or not recognised: fall back to "now".
            the_latest_update_time = time.strftime(
                '%Y-%m-%dT%H:%M:%S', time.localtime())
            if (Globals.DEBUG_SWITCH):
                print(
                    "Failed to get the latest update time from client xapiandb,use default time.localtime()"
                )

        reslist = self.premoter.newerapp_for_xapianupdate(
            the_latest_update_time)

        for app in reslist:
            app_name = str(app["app_name"])
            display_name_cn = str(app["display_name_cn"])
            keywords_for_search = str(app["keywords_for_search"])

            enquire.set_query(xapian.Query(app_name))
            matches = enquire.get_mset(0, DB.get_doccount())

            if matches.size() != 0:
                # Re-index every existing document whose data is the app name.
                for match in matches:
                    if _decoded(match.document.get_data()) != app_name:
                        continue
                    doc = match.document
                    doc.clear_terms()
                    _index_app_terms(doc, app_name, display_name_cn,
                                     keywords_for_search)
                    database.replace_document(match.docid, doc)
                    updated = True
                    modified_num = modified_num + 1
            else:
                # Unknown app: add a fresh document.
                doc = xapian.Document()
                doc.set_data(app_name)
                _index_app_terms(doc, app_name, display_name_cn,
                                 keywords_for_search)
                database.add_document(doc)
                add_num = add_num + 1
                updated = True
                if (Globals.DEBUG_SWITCH):
                    print("App:", doc.get_data(), " ", "terms:", end=' ')
                    for itr in doc.termlist():
                        print(itr.term, end=' ')
                    print(" ")

        try:
            if updated:
                if doc_for_xapiandb_version is not None:
                    now = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime())
                    doc_for_xapiandb_version.add_value(2, now)
                    database.replace_document(docid_for_xapiandb_version,
                                              doc_for_xapiandb_version)
                database.commit()
                if (Globals.DEBUG_SWITCH):
                    print(
                        "Xapiandb has updated . %d app modified, %d app add.  Tatal: %d app updated"
                        % (modified_num, add_num, len(reslist)))
        except Exception:
            if (Globals.DEBUG_SWITCH):
                print(
                    "The xapian database (/home/ice_bird/.cache/uksc/xapiandb) is crashed,please remove it and install a new one!"
                )
        if (Globals.DEBUG_SWITCH):
            print("update uksc xapiandb over")
    else:
        pkgname = kwargs["pkgname"]
        enquire.set_query(xapian.Query(pkgname))
        for match in enquire.get_mset(0, DB.get_doccount()):
            raw = match.document.get_data()
            # Compare against the decoded data too: the raw data may be
            # bytes, as the batch branch already assumes, and would then
            # never equal a str package name.
            if pkgname == raw or pkgname == _decoded(raw):
                return

        doc = xapian.Document()
        doc.set_data(pkgname)
        doc.add_term(pkgname, 10)
        if (Globals.DEBUG_SWITCH):
            print("debfile path:", kwargs["path"])

        deb = DebFile(kwargs["path"])
        terms = pkgname
        try:
            terms = terms + " " + deb.description
        except Exception:
            if (Globals.DEBUG_SWITCH):
                print("Failed to get app description")

        indexer.set_document(doc)
        indexer.index_text(terms)
        database.add_document(doc)
        database.commit()
        if (Globals.DEBUG_SWITCH):
            print("update xapiandb over: ", pkgname, "terms:", end=' ')
            for itr in doc.termlist():
                print(itr.term, end=' ')
            print(" ")
def _add_hanzi(self, doc, data): if not data: return for word, value in seg_txt_2_dict(data).iteritems(): doc.add_term(word, value)