def delete(self, tid, title, field):
    '''Delete the index for a record'''
    if tid is None or tid == '' \
            or title is None or title == '' \
            or field is None or field == '':
        return False

    # Original data
    self.tid = tid
    self.title = title
    self.field = field

    # Remove the raw record from the Hash
    # self.r.hdel(self.field, self.tid)
    self.pipeline.hdel(self.field, self.tid)
    # Remove the copy that the get option of SORT can fetch directly
    # self.pipeline.delete('%s:%s' % (self.field, self.tid))

    # Word segmentation
    algor = mmseg.Algorithm(self.title)
    words = []
    for tok in algor:
        # Case-insensitive
        word = tok.text.decode('utf-8').lower()
        words.append(word)
        # Remove the tid from the inverted index keyed by each token
        # self.r.srem('%s:%s' % (self.field, word), self.tid)
        self.pipeline.srem('%s:%s' % (self.field, word), self.tid)

    # Remove the score used to sort search results
    # self.r.delete('%s:score:%s' % (self.field, self.tid))
    self.pipeline.delete('%s:score:%s' % (self.field, self.tid))

    # Prefix index
    if self.config.prefix_index_enable is True:
        # Case-insensitive
        word = self.title.decode('utf-8').lower()
        # The prefix index does not include the segmented tokens
        del words[:]
        words.append(word)
        # Remove the tid from the inverted index keyed by the full title
        # self.r.srem('%s:%s' % (self.field, word), self.tid)
        self.pipeline.srem('%s:%s' % (self.field, word), self.tid)

        dic = []
        for word in words:
            for i in range(len(word)):
                prefix = word[:i + 1]
                dic.append(prefix)
                # print prefix.encode('utf-8')
            # One extra entry for the complete word, marked with a '*'
            prefix = '%s%s' % (word, '*')
            dic.append(prefix)
        # self.r.zrem('compl:%s' % (self.field), *dic)
        self.pipeline.zrem('compl:%s' % (self.field), *dic)

    self.pipeline.execute()
    return True
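# Illustrative sketch (not from the original project): the Redis key layout that
# add()/delete() read and write, shown for made-up values. 'question', '42' and
# u'redis' are hypothetical field/tid/token values used only for this print-out.
field, tid, token = 'question', '42', u'redis'
print field                          # HASH: tid -> JSON of the raw record
print '%s:%s' % (field, token)       # SET per token: members are tids (inverted index)
print '%s:score:%s' % (field, tid)   # STRING: score used by SORT ... BY
print 'compl:%s' % field             # ZSET: title prefixes; 'word*' marks a complete word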
def GenWordSegString(self, ss, FID):
    rawText = ss
    segRes = mmseg.Algorithm(rawText)
    self.segFID = FID
    self.segLst = list()
    for w in segRes:
        if not w.text.isalnum():
            self.segLst.append(w.text)
    self.segInMemo = True
    return self
def segment(text):
    '''
    text should be either utf8 or unicode
    return a list of words in unicode
    '''
    if isinstance(text, unicode):
        text = text.encode('utf8')
    alg = mmseg.Algorithm(text)
    # for tok in alg:
    #     print '%s [%d..%d]' % (tok.text, tok.start, tok.end)
    return [tok.text.decode('utf8') for tok in alg]
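# Hedged usage sketch for segment(); assumes pymmseg is installed, a utf-8 source
# encoding, and that the default dictionaries are loaded once at start-up.
from pymmseg import mmseg
mmseg.dict_load_defaults()
for w in segment(u'研究生命起源'):
    print w.encode('utf8')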
def GenWordSeg(self, inputFile):
    with open(inputFile) as f:
        rawText = f.read()
    segRes = mmseg.Algorithm(rawText)
    self.segFID = inputFile
    self.segLst = list()
    for w in segRes:
        # Filter out purely alphanumeric tokens (e.g. numbers and ASCII words)
        if not w.text.isalnum():
            self.segLst.append(w.text)
    self.segInMemo = True
    return self
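# Hedged usage sketch for GenWordSegString/GenWordSeg above. Their owning class is
# not shown in these snippets, so an empty stand-in object is passed as self; the
# input string and FID are made up, and a utf-8 source encoding is assumed.
from pymmseg import mmseg
mmseg.dict_load_defaults()

class _Doc(object):
    pass

doc = _Doc()
GenWordSegString(doc, 'mmseg 中文分词 2013', 'doc-1')
print doc.segFID, '/'.join(doc.segLst)   # purely alphanumeric tokens are filtered out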
def post(self):
    keyword = self.get_argument('keyword')
    algor = mmseg.Algorithm(keyword.encode('utf-8'))
    ret = []
    for tok in algor:
        ret.append({'start': tok.start})
        ret.append({'end': tok.end})
        ret.append({'length': tok.length})
        ret.append({'text': tok.text.decode('utf-8')})
    self.write(tornado.escape.json_encode(ret))
    self.finish()
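# Minimal sketch of serving the post() handler above with Tornado. SegmentHandler
# is a hypothetical tornado.web.RequestHandler subclass whose post() is the method
# shown above; the /segment route and port are also made-up assumptions.
import tornado.ioloop
import tornado.web
from pymmseg import mmseg

mmseg.dict_load_defaults()

application = tornado.web.Application([
    (r'/segment', SegmentHandler),   # hypothetical handler class defining post() above
])

if __name__ == '__main__':
    application.listen(8888)
    tornado.ioloop.IOLoop.instance().start()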
def build_index_database(key, fic, pos):
    """build index

    Args:
        key: the content which is going to be segmented
        fic: fiction object which contains the key
        pos: position of the key word in fiction
    Return:
        None
    """
    try:
        words = mmseg.Algorithm(key)
    except Exception, e:
        print e
        return
def word_split(sentence, shall_print=0):
    # The word split is incorrect for floats: for example, "16.5 中午吃饭" is
    # split as "16 5 中午吃饭" instead of "16.5 中午 吃饭", so '.' is temporarily
    # replaced with a placeholder character before segmentation.
    SPECIAL_CHARACTER_FOR_FLOAT = 'a'
    sentence = sentence.replace('。', SPECIAL_CHARACTER_FOR_FLOAT)
    sentence = sentence.replace('.', SPECIAL_CHARACTER_FOR_FLOAT)
    algor = mmseg.Algorithm(sentence)
    token_list = []
    for tok in algor:
        if tok.text.replace(SPECIAL_CHARACTER_FOR_FLOAT, '0').isdigit():
            # Restore the decimal point in numeric tokens
            token_list.append((tok.text.replace(SPECIAL_CHARACTER_FOR_FLOAT, '.'),
                               tok.start, tok.end))
        else:
            token_list.append((tok.text, tok.start, tok.end))
    # temporarily print for debugging
    for text, start, end in token_list:
        if shall_print == 1:
            log.info("%s, %d, %d" % (text, start, end))  # 'log' is assumed to be a logger defined elsewhere
        elif shall_print == 2:
            print "%s, %d, %d" % (text, start, end)
    return token_list
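# Hedged example of the float workaround above, assuming a utf-8 source encoding;
# exact token boundaries depend on the loaded dictionaries, so the printed output
# is only indicative.
from pymmseg import mmseg
mmseg.dict_load_defaults()
for text, start, end in word_split('16.5 中午吃饭'):
    print text, start, end    # '16.5' survives because '.' was masked before segmentation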
# -*- coding: utf8 -*-
from pymmseg import mmseg

mmseg.dict_load_defaults()

text = '工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作'
algor = mmseg.Algorithm(text)
for tok in algor:
    print '%s [%d..%d]' % (tok.text.decode('utf8'), tok.start, tok.end)
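# Follow-on sketch (not in the original): collect the tokens into the lowercase
# unicode word list that the indexing code elsewhere on this page works with.
words = [tok.text.decode('utf8').lower() for tok in mmseg.Algorithm(text)]
print ' / '.join(words).encode('utf8')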
def test(text):
    mmseg.dict_load_defaults()
    algor = mmseg.Algorithm(text)
    for tok in algor:
        print '%s [%d..%d]' % (tok.text, tok.start, tok.end)
def index(self, path, field):
    with open(path, 'r') as fp:
        i = 0
        data = {}
        inverted_index = defaultdict(list)
        scores = {}
        prefix_index = []
        for line in fp:
            i = i + 1
            if i % 20000 == 0:
                print i
                # Flush the accumulated batch every 20000 lines:
                # save the raw records into the Hash
                self.r.hmset(field, data)
                #self.pipeline.hmset(field, data)
                # Build the inverted index keyed by token
                for w in inverted_index:
                    self.pipeline.sadd('%s:%s' % (field, w), *inverted_index[w])
                self.pipeline.execute()
                # Scores used to sort search results
                self.r.mset(scores)
                #self.pipeline.mset(scores)
                # Prefix index
                self.r.zadd('compl:%s' % (field), *prefix_index)
                #self.pipeline.zadd('compl:%s' % (field), *prefix_index)
                #self.pipeline.execute()
                data.clear()
                inverted_index.clear()
                scores.clear()
                del prefix_index[:]
            tid, uid, title, attachments = line.strip().split('\t')
            score = 0
            data[tid] = sj.dumps({'tid': tid, 'title': title, 'field': field})
            # Word segmentation
            algor = mmseg.Algorithm(title)
            words = []
            for tok in algor:
                # Case-insensitive
                word = tok.text.decode('utf-8').lower()
                words.append(word)
                inverted_index[word].append(tid)
            scores['%s:score:%s' % (field, tid)] = score
            # Prefix index
            if self.config.prefix_index_enable is True:
                # Case-insensitive
                word = title.decode('utf-8').lower()
                # The prefix index does not include the segmented tokens
                del words[:]
                words.append(word)
                inverted_index[word].append(tid)
                for w in words:
                    for j in range(len(w)):
                        prefix = w[:j + 1]
                        prefix_index.append(prefix)
                        prefix_index.append(0.0)
                    # One extra entry for the complete word, marked with a '*'
                    prefix = '%s%s' % (w, '*')
                    prefix_index.append(prefix)
                    prefix_index.append(0.0)
        # Flush whatever remains after the loop:
        # save the raw records into the Hash
        self.r.hmset(field, data)
        #self.pipeline.hmset(field, data)
        # Build the inverted index keyed by token
        for w in inverted_index:
            self.pipeline.sadd('%s:%s' % (field, w), *inverted_index[w])
        self.pipeline.execute()
        # Scores used to sort search results
        self.r.mset(scores)
        #self.pipeline.mset(scores)
        # Prefix index
        self.r.zadd('compl:%s' % (field), *prefix_index)
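# Hedged note on the input index() expects: one record per line with four
# tab-separated columns (tid, uid, title, attachments). The values below are
# made up purely to illustrate the parsing step.
sample_line = '1001\tu42\tRedis search demo\t0\n'
tid, uid, title, attachments = sample_line.strip().split('\t')
print tid, uid, title, attachments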
def run(self):
    """thread method"""
    # get all the fresh information
    _headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
                "Accept": "text/plain"}
    request = urllib2.Request(self.newest_url, headers=_headers)
    html_page = urllib2.urlopen(request).read()
    try:
        # transparently handle gzip-compressed responses
        import gzip, StringIO
        data = html_page
        data = StringIO.StringIO(data)
        gzipper = gzip.GzipFile(fileobj=data)
        html = gzipper.read()
        html_page = html
    except:
        pass
    print '抓取%s网站更新线程启动...' % self.thread_name
    print '获取更新部分....',
    html_page = BeautifulSoup(html_page)
    content = html_page.findAll(self.content_tag, self.content_dict)
    contents = ''.join([str(item) for item in content])
    chapter_infor = BeautifulSoup(contents)
    content = chapter_infor.findAll(self.chapter_tag, self.chapter_dict)
    print ' Done.'
    indexs = 1
    for item in content:
        print '获取第%d个基本信息' % indexs,
        indexs += 1
        contents = str(item)
        types = ''.join(re.findall(self.types_pattern, contents))
        title = ''.join(re.findall(self.title_pattern, contents))
        chapter = ''.join(re.findall(self.chapter_pattern, contents))
        author = ''.join(re.findall(self.author_pattern, contents))
        fiction_url = ''.join(re.findall(self.fiction_url_pattern, contents))
        chapter_url = ''.join(re.findall(self.chapter_url_pattern, contents))
        if not types or not title or \
                not chapter or not author or not fiction_url or not chapter_url:
            print 'Failed.'
            continue
        print chapter_url
        newest_chapter_url = chapter_url
        print '标题:%s, 作者:%s, 小说主页%s' % (title, author, fiction_url),
        print 'Done.'
        host = self.host
        if self.host[len(self.host) - 1] == '/':
            host = self.host[:len(self.host) - 1]
        if chapter_url[0] == '/':
            chapter_url = host + chapter_url
        if fiction_url[0] == '/':
            fiction_url = host + fiction_url
        try:
            web_site = FictionWebSite.objects.get(url=self.host)
        except:
            web_site = FictionWebSite(title=self.thread_name, url=self.host)
            web_site.save()
        try:
            hash_url = HashUrl.objects.get(urls=fiction_url)
            is_exit = True
            fic = Fiction.objects.get(fiction_title=title, author=author)
        except:
            is_exit = False
        if not is_exit:
            try:
                hash_url = HashUrl(urls=fiction_url)
                hash_url.save()
            except:
                continue
            # the fiction found by the crawler is a new one,
            # so fetch the book info
            print '获取小说%s详细信息' % title,
            book_infor = get_book_infor(self.host, self.thread_name, fiction_url, True)
            print 'Done.'
            ids = re.findall(ALL_PATTERN[web_site.title]['ids_pattern'], fiction_url)
            types = '4' if not STYLE[self.thread_name].has_key(book_infor['types']) else \
                STYLE[self.thread_name][(book_infor['types'])]
            try:
                fic = Fiction(fiction_title=title,
                              fiction_avatar_url=book_infor['avatar'],
                              fiction_intro=book_infor['intro'],
                              fiction_id=ids[0],
                              fiction_style=types,
                              total_word=book_infor['total_word'],
                              com_word="",
                              source_site=web_site,
                              click_time=book_infor['click_time'],
                              rec_time=book_infor['rec_time'],
                              author=author,
                              stock_time=0,
                              author_url="",
                              )
                fic.save()
                fic.fiction_nid = create_nid(fic.id)
                fic.save()
                member = MemberShip(fiction=fic,
                                    website=web_site,
                                    fiction_url=fiction_url)
                member.save()
                del member
            except:
                continue
            # search only by fiction title
            for item in mmseg.Algorithm(title):
                try:
                    index = Index.objects.get(key=item.text)
                except:
                    index = Index(key=item.text)
                    index.save()
                IndexFictionRelationship.objects.create(key=index,
                                                        fiction=fic,
                                                        position=','.join([str(item.start), str(item.end)]),
                                                        bit='2',  # chapter
                                                        )
            # get all chapters
            if book_infor.has_key('read_url'):
                chapter_url = book_infor['read_url']
            else:
                chapter_url = build_url_fiction(ids[0], web_site.title)
            print '获取所有章节.',
            get_chapters_thread = threading.Thread(target=chapter_func[web_site.title],
                                                   args=(chapter_url, fic, web_site))
            get_chapters_thread.start()
            get_chapters_thread.join()
            print 'done.'
        # the fiction has been inserted into the database before
        else:
            # get the max index of its chapters
            try:
                chapter_index = ChapterIndex.objects.get(fiction=fic.id, web_site=web_site.title)
            except:
                continue
            chapter_index.id += 1
            chapter_index.save()
            # get the chapter
            try:
                chap = Chapter.objects.get(fiction=fic, index=chapter_index.id)
            except:
                chap = Chapter(chapter_title=chapter,
                               charpter_url=newest_chapter_url,
                               fiction=fic,
                               source=web_site,
                               index=chapter_index.id,
                               through='0',  # from update thread
                               )
                chap.save()
            try:
                chapter_url = ChapterUrl.objects.get(url=chapter_url)
            except:
                chapter_url = ChapterUrl(url=chapter_url,
                                         chapter=chap,
                                         fiction=fic,
                                         index=chapter_index.id,
                                         name=web_site.title)
                chapter_url.save()
            # save into the newest-chapter table
            try:
                NewestChapter.objects.create(chapter_title=chapter,
                                             charpter_url=newest_chapter_url,
                                             fiction=fic,
                                             source=web_site,
                                             index=0,
                                             )
            except:
                continue
def add(self, tid, title, field, score=0):
    '''Build the index for a record'''
    # Validate arguments
    if tid is None or tid == '' \
            or title is None or title == '' \
            or field is None or field == '':
        return False

    # Original data
    self.tid = tid
    self.title = title
    self.field = field
    self.data = {'tid': self.tid, 'title': self.title, 'field': self.field}
    self.score = score

    # Save the raw record into a Hash
    # self.r.hset(self.field, self.tid, sj.dumps(self.data))
    self.pipeline.hset(self.field, self.tid, sj.dumps(self.data))
    # The alternative below lets SORT fetch the data directly through its get option
    # self.pipeline.set('%s:%s' % (self.field, self.tid), sj.dumps(self.data))

    # Word segmentation
    algor = mmseg.Algorithm(self.title)
    words = []
    for tok in algor:
        # Case-insensitive
        word = tok.text.decode('utf-8').lower()
        words.append(word)
        # Build the inverted index keyed by each token
        # self.r.sadd('%s:%s' % (self.field, word), self.tid)
        self.pipeline.sadd('%s:%s' % (self.field, word), self.tid)

    # Score used to sort search results
    # self.r.set('%s:score:%s' % (self.field, self.tid), self.score)
    self.pipeline.set('%s:score:%s' % (self.field, self.tid), self.score)

    # Prefix index
    if self.config.prefix_index_enable is True:
        # Case-insensitive
        word = self.title.decode('utf-8').lower()
        # The prefix index does not include the segmented tokens
        del words[:]
        words.append(word)
        # Build the inverted index keyed by the full title
        # self.r.sadd('%s:%s' % (self.field, word), self.tid)
        self.pipeline.sadd('%s:%s' % (self.field, word), self.tid)

        dic = []
        for w in words:
            for i in range(len(w)):
                prefix = w[:i + 1]
                dic.append(prefix)
                dic.append(0.0)
                # print prefix.encode('utf-8')
            # One extra entry for the complete word, marked with a '*'
            prefix = '%s%s' % (w, '*')
            dic.append(prefix)
            dic.append(0.0)
        # self.r.zadd('compl:%s' % (self.field), *dic)
        self.pipeline.zadd('compl:%s' % (self.field), *dic)

    self.pipeline.execute()
    return True
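# Standalone sketch of the prefix entries add() builds for one (made-up) word.
# Members alternate with 0.0 scores because the whole list is splatted into a
# single pipeline.zadd('compl:<field>', *dic) call.
word = u'redis'
dic = []
for i in range(len(word)):
    dic.append(word[:i + 1])
    dic.append(0.0)
dic.append(word + u'*')   # the complete word gets an extra entry marked with '*'
dic.append(0.0)
print dic                 # [u'r', 0.0, u're', 0.0, ..., u'redis', 0.0, u'redis*', 0.0]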
class Search(object):
    '''Search class'''

    def __init__(self, *args, **kwargs):
        # Argument check
        if args:
            if len(args) % 2 != 0:
                raise ParameterError(
                    "Config requires an equal number of values and scores")
            # Dynamically initialise instance attributes
            for i in range(len(args) / 2):
                setattr(self, args[i * 2], args[i * 2 + 1])
        for key in kwargs:
            setattr(self, key, kwargs[key])

        # redis
        pool = redis.ConnectionPool(host=self.config.redis['host'],
                                    port=self.config.redis['port'],
                                    db=self.config.redis['db'])
        self.r = redis.Redis(connection_pool=pool)
        # self.r = redis.StrictRedis(host=self.config.redis['host'], port=self.config.redis['port'], db=self.config.redis['db'])
        self.pipeline = self.r.pipeline()

        # Load the segmentation dictionaries
        mmseg.dict_load_defaults()

    def query(self, field, keyword, count):
        '''Keyword search'''
        results = []
        if keyword is None or keyword == '':
            return results

        # Convert to UTF-8
        try:
            keyword = keyword.encode('utf-8')
        except UnicodeDecodeError, e:
            pass

        max_length = max(50, count)
        keyword = keyword.lower()

        # Word segmentation
        algor = mmseg.Algorithm(keyword)
        for tok in algor:
            # Case-insensitive
            word = tok.text.decode('utf-8').lower()
            results.append(word)

        # Key holding the intersection of all keyword result sets
        temp_store_key = 'tmpinter:%s:%s' % (field, '+'.join(results))

        # The cached intersection does not exist yet
        if self.r.exists(temp_store_key) is False:
            # Cache the intersection of the keyword result sets
            cnt = self.r.sinterstore(temp_store_key,
                                     ['%s:%s' % (field, result) for result in results])
            if cnt == 0:
                return []
            # Cache lifetime
            self.r.expire(temp_store_key, 60 * 5)

        # With the right index in place, SORT could return the data directly:
        # return self.r.sort(temp_store_key, by='%s:score:%s' % (field, '*'), get='%s:%s' % (field, '*'), start=0, num=count, desc=True)

        # Fetch the ids
        ids = self.r.sort(temp_store_key,
                          by='%s:score:%s' % (field, '*'),
                          start=0, num=count, desc=True)
        # Fetch the data
        return self.r.hmget(field, ids)
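# Hedged usage sketch for Search.query(). Config is a stand-in for the project's
# real configuration object; only the attributes __init__ and query() read are
# provided, and a local Redis holding a 'question' index already built by the
# add()/index() snippets above is assumed.
class Config(object):
    redis = {'host': 'localhost', 'port': 6379, 'db': 0}
    prefix_index_enable = True

s = Search(config=Config())
for row in s.query('question', u'交换机', 10):   # rows are the JSON blobs stored in the Hash
    print row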
def search(request):
    hot_keys = get_hot_keys(40)
    r.incr('visit:search:count')
    visit_counts()
    suggestion = search_suggestion(db['product'])
    mmseg_keys = []
    collection = db['product']
    access_token = request.session.get('access_token', None)
    expires_in = request.session.get('expires_in', None)
    uid = request.session.get('uid', None)
    profile = db['user'].find({"_id": uid})
    if profile.count() == 0:
        profile = None
    else:
        profile = profile[0]
    # Chinese word segmentation engine
    mmseg.Dictionary.load_dictionaries()
    db_size = collection.count()
    template = loader.get_template('search.html')
    keywords = request.GET['keys'].encode('utf-8')
    current_url = request.path + '?keys=' + keywords
    seg_keys = []
    patterns = []
    search_str = ''
    mmseg_keys_temp = mmseg.Algorithm(keywords)
    for tok in mmseg_keys_temp:
        mmseg_keys.append(tok.text)
    if len(keywords) > 30:
        algor = mmseg.Algorithm(keywords)
        for tok in algor:
            seg_keys.append(tok.text)
            patterns.append(re.compile('.*%s.*' % tok.text))
            search_str = search_str + '.*' + tok.text + '.*|'
    else:
        algor = keywords.split(' ')
        for tok in algor:
            # also recorded in the Redis search statistics below
            seg_keys.append(tok.strip())
            patterns.append(re.compile('.*%s.*' % tok.strip()))
            search_str = search_str + '.*' + tok + '.*|'
    # Strict search: every pattern must match
    result_list = collection.find({"ProductName": {"$all": patterns}})
    if result_list.count() == 0:
        # The strict search returned nothing, so fall back to a loose search
        search_str = search_str.rstrip('|')
        pattern = re.compile(search_str)
        result_list = collection.find({"ProductName": pattern})
    if keywords.strip() == '':
        result_list = None
    if result_list and result_list.count() >= 1:
        algor = keywords.split(' ')
        for tok in algor:
            try:
                if tok.strip() != '':
                    r.zincrby('search_keywords', tok)
            except:
                print 'error redis search static'
        after_range_num = 3
        befor_range_num = 4
        try:
            page = int(request.GET.get("page", 1))
            if page < 1:
                page = 1
        except ValueError:
            page = 1
        paginator = Paginator(result_list, 10)
        try:
            search_result = paginator.page(page)
        except (EmptyPage, InvalidPage, PageNotAnInteger):
            search_result = paginator.page(paginator.num_pages)
        if page >= after_range_num:
            page_range = paginator.page_range[page - after_range_num:page + befor_range_num]
        else:
            page_range = paginator.page_range[0:int(page) + befor_range_num]
    else:
        algor = keywords.split(' ')
        for tok in algor:
            r.zincrby('search_keywords_not_exist', tok)
        search_result = None
        page_range = None
    most_like_item = get_most_like_items()
    MostLikeList = []
    for mll in most_like_item:
        try:
            rresult = db['product'].find({"ProductID": mll})[0]
        except:
            rresult = None
        if rresult:
            recommend = {
                "pid": rresult['ProductID'],
                "cover": rresult['MorePhotos'],
                "title": rresult['ProductName'],
                "price": rresult['ProductPrice']
            }
            MostLikeList.append(recommend)
    if len(MostLikeList) == 0:
        MostLikeList = None
    params = Context({
        "MostLikeList": MostLikeList,
        "mmseg_keys": mmseg_keys,
        "hotkeys": hot_keys,
        "current_url": current_url,
        'page_range': page_range,
        'userProfile': profile,
        'result_list': search_result,
        'instant_search': suggestion,
        'search_key_words': seg_keys,
        'system_version': version,
        'database_size': db_size
    })
    return HttpResponse(template.render(params))
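# Isolated sketch (made-up keywords, utf-8 source encoding assumed) of how the
# view builds the regex list it passes to MongoDB's $all operator on ProductName.
import re
keywords = 'iphone 手机壳'
patterns = [re.compile('.*%s.*' % tok.strip())
            for tok in keywords.split(' ') if tok.strip()]
print [p.pattern for p in patterns]
# collection.find({"ProductName": {"$all": patterns}}) then requires every
# pattern to match, which is the "strict" search in the view above.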