def processSection(self, section, parent, ignore_errors=False):
    #log.info(section.text[:128])
    try:
        if section.name == '[document]' and section.P1group:
            section = section.P1group.P1
            pre = section.parent.Title.text
            # now, regardless of how the soup is obtained or passed in, we have a common starting point.
            # we will reference parent nodes to obtain information outside the P1group.
        elif section.P1:
            section = section.P1
            pre = ''
        else:
            print section
        id = utf8(section['id'])
        # find references
        refs = section.findParent("[document]").select("Commentary CitationSubRef[SectionRef={}]".format(id))
        meta = ''
        for ref in refs:
            meta += ref.parent.text
        text = section.text
    except KeyboardInterrupt:
        raise
    except:
        if ignore_errors:
            log.info('Ignoring section in {}'.format(parent.cid))
            return None  # skip the section rather than dereference partially-parsed data below
        else:
            raise
    code = self.getChild(id, parent, depth=3)
    code.meta = utf8(meta)
    code.setPre(pre)
    code.setText(text)
    return code
def urlencode(data):
    params = []
    for key, value in data.items():
        if value is None:
            continue
        params.append("%s=%s" % (util.utf8(key), quote_plus(util.utf8(value))))
    params_string = '&'.join(params)
    return params_string
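# A hedged usage sketch for urlencode() above, assuming util.utf8 returns a UTF-8
# byte string for unicode input (that helper is referenced but not defined in this
# listing). Keys whose value is None are dropped; values are quote_plus-escaped.
#
#   >>> urlencode({u'q': u'caf\xe9 au lait', u'debug': None})
#   'q=caf%C3%A9+au+lait'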
def request(self, method, url, headers, post_data=None):
    s = util.StringIO.StringIO()
    curl = pycurl.Curl()

    if method == 'get':
        curl.setopt(pycurl.HTTPGET, 1)
    elif method == 'post':
        curl.setopt(pycurl.POST, 1)
        curl.setopt(pycurl.POSTFIELDS, post_data)
    else:
        curl.setopt(pycurl.CUSTOMREQUEST, method.upper())

    # pycurl doesn't like unicode URLs
    curl.setopt(pycurl.URL, util.utf8(url))
    curl.setopt(pycurl.WRITEFUNCTION, s.write)
    curl.setopt(pycurl.NOSIGNAL, 1)
    curl.setopt(pycurl.CONNECTTIMEOUT, 30)
    curl.setopt(pycurl.TIMEOUT, 80)
    curl.setopt(pycurl.HTTPHEADER, ['%s: %s' % (k, v) for k, v in headers.iteritems()])

    if self._verify_ssl_certs:
        curl.setopt(pycurl.CAINFO,
                    os.path.join(os.path.dirname(__file__), 'data/ca-certificates.crt'))
    else:
        curl.setopt(pycurl.SSL_VERIFYHOST, False)

    try:
        curl.perform()
    except pycurl.error, e:
        self._handle_request_error(e)
def processSection(self, section, parent):
    if 'identifier' not in section.attrs:
        log.debug('skipping section without identifier...')
    elif section.has_attr('status') and section['status'] in ('omitted', 'repealed'):
        log.debug('Skipping section %s because of status: %s' % (section['identifier'], section['status']))
    else:
        code = self.findOrCreateCode(section['identifier'])
        code.parent = parent
        code.depth = 3
        if section.sourcecredit:
            credits = section.select('sourcecredit')
            code.meta = utf8('; '.join(c.text for c in credits))
        # rather than iterating through every child node, clear the contents of the nodes
        # we would like to ignore
        cleartags = ('ref', 'table', 'notes', 'note')
        for tag in cleartags:
            ctags = section.select(tag)
            for ctag in ctags:
                ctag.clear()
        code.setContent(section.num.text + section.heading.text, section.text, '')
    self.progress()
def __init__(self, method, url, headers=None, data=None, files=None,
             debug=False, cookies=None, auto_redirect=False):
    assert url.startswith('http')
    url = util.utf8(url)
    self.url = url
    self.method = method
    self.data = data or {}
    self.files = files
    self.body = None
    self.auto_redirect = auto_redirect

    cookies = cookies or {}
    for name, value in cookies.items():
        cookie_manager.set_cookie(name, value)

    _split_url = httplib.urlsplit(url)
    self.host = _split_url.netloc
    self.uri = _split_url.path
    if _split_url.query:
        self.uri += '?' + _split_url.query

    if _split_url.scheme == 'https':
        Connection = httplib.HTTPSConnection
    else:
        Connection = httplib.HTTPConnection

    self.__conn = Connection(host=self.host)
    self.__conn.set_debuglevel(debug and logging.DEBUG or 0)

    self.headers = headers or {}
    self.generate_header(headers)
def processAct(self, act, root, bar=None, ignore_errors=False):
    # load the contents of each act first; if it has parts, collect each part for processing.
    # if it only has sections, process each section individually
    contentsurl = 'http://www.legislation.gov.uk/ukpga/{}/contents/data.xml'
    soup = self.getsoup(contentsurl.format(act))
    parts = soup.select('Legislation > Contents > ContentsPart')
    sections = soup.select('Legislation > Contents ContentsItem[ContentRef^=section-]')
    pre = soup.Primary.text if soup.Primary else ''

    if not any((parts, sections)):
        # no data available for act - we're outta here
        log.warn('No data available for {}'.format(act))
        return None

    act = self.getChild(act, root)
    act.rev = root.rev
    act.setPre(pre)
    meta = utf8(soup.title.text) if soup.title else ''

    schedurl = ''
    parturls = []
    secturls = []

    # now we get the schedules - these are included in the post because ... reasons
    # really it's the only place for them unless we modify the code to accommodate
    # different code structural items at the same depth
    if soup.ContentsSchedules:
        schedurl = 'http://www.legislation.gov.uk/ukpga/{}/schedules/data.xml'.format(act.cid)

    if parts:
        for part in parts:
            parturls.append('{}/data.xml'.format(part['DocumentURI']))
    elif sections:
        for section in sections:
            secturls.append('{}/data.xml'.format(section['DocumentURI']))

    urls = parturls + secturls
    if schedurl:
        urls.append(schedurl)
    soups = self.get_bulk_soup(urls, bar=bar)

    if not bar:
        bar = progress.Bar()
    bar.label = act.cid
    bar.expected_size = len(soups)
    bar.show(0)

    if schedurl:
        schedsoup = soups[schedurl]
        act.setPost(schedsoup.Schedules.text)
        bar.show(bar.last_progress + 1)

    if parturls:
        for parturl in parturls:
            partsoup = soups[parturl]
            self.processPart(partsoup, act, ignore_errors=ignore_errors)
            bar.show(bar.last_progress + 1)
    elif secturls:
        for secturl in secturls:
            sectionsoup = soups[secturl]
            self.processSection(sectionsoup, act, ignore_errors=ignore_errors)
            bar.show(bar.last_progress + 1)

    return act
def __contains__(self, i):
    if type(i) is unicode:
        i = utf8(i)
    if self.children().find(Code.cid == i).count():
        return True
    else:
        childids = Select(Code.id, Code.parent_id == self.id)
        if Code.find(Code.cid == i, Code.parent_id.is_in(childids)).count():
            return True
    return False
def __init__(self, name, id, title, score="id", condition_fields=None, prefix_index_enable=True, exts=None, **kwargs): if isinstance(exts, dict): kwargs.update(exts) self.name = name self.title = utf8(title) self.id = id self.score = score self.exts = kwargs self.condition_fields = condition_fields if condition_fields and isinstance(condition_fields, list) else [] self.prefix_index_enable = prefix_index_enable
def __request(self):
    def utf8_headers(headers):
        _headers = {}
        for key, value in headers.items():
            _headers[util.utf8(key)] = util.utf8(value)
        return _headers

    conn = self.__conn
    conn.request(util.utf8(self.method), util.utf8(self.uri),
                 body=util.utf8(self.body),
                 headers=utf8_headers(self.headers))
    global LAST_URL
    LAST_URL = self.url
    response = conn.getresponse()
    cookie_manager.set_cookie(response.getheader('set-cookie'))
    if self.auto_redirect and response.getheader('location'):
        return self.redirect(response.getheader('location'))
    else:
        return Response(response)
def findOrCreateCode(self, cid):
    cid = utf8(cid)
    #log.debug("Looking for code identified as %s" % cid)
    c = store.find(Code, Code.rp == self.rp, Code.cid == cid)
    if c.is_empty():
        c = store.add(Code())
        c.nation = self.nation
        c.rp = self.rp
        c.cid = cid
        c.stored = datetime.now()
    else:
        c = c[0]
    return c
def save(self):
    """Write this item and its search indexes to redis."""
    if not self.title:
        return False
    data = {'name': self.name, 'id': self.id, 'title': self.title}
    if self.exts:
        data.update(self.exts)
    pipe = util.redis.pipeline()
    # store the raw data in a hash
    res = pipe.hset(self.name, self.id, json.dumps(data))
    # build the sets index keyed by each segmented word; each set stores ids for later searches
    words = self.split_words_for_index(self.title)
    if not words:
        logging.info("no words")
        return False
    for word in words:
        key = mk_sets_key(self.name, word)  # word index for item id
        pipe.sadd(key, self.id)
    if self.score == 'id':
        self.score = self.id
    # score for search sort
    pipe.set(mk_score_key(self.name, self.id), self.score)
    # add the current id to the indexes created for the condition fields
    for field in self.condition_fields:
        pipe.sadd(mk_condition_key(self.name, field, utf8(data[field])), self.id)
    # commit
    pipe.execute()
    if self.prefix_index_enable:
        self.save_prefix_index()
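# A hypothetical indexing call tying the search-item constructor earlier in this listing
# to save(); the class name `Item` and all field values are illustrative assumptions,
# not taken from this listing.
#
#   item = Item(name="books", id=42, title=u"Redis in Action",
#               condition_fields=["category"], category="tech")
#   item.save()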
def processPart(self, part, parent, ignore_errors=False):
    try:
        part = part.Part
        id = utf8(part['id'])
        sections = part.select('P1group P1[id^=section-]')
        if not sections:
            log.info('ERROR: No sections in {} of {}'.format(part['id'] if part.has_attr('id') else '', parent.cid))
            return None
        title = ((part.Number.text if part.Number else '') + ' ' +
                 (part.Title.text if part.Title else '')).replace('\n', '')
    except:
        log.error('ERROR: Unable to get all data for {}'.format(id))
        raise

    code = self.getChild(id, parent)
    code.setPre(title)
    for section in sections:
        self.processSection(section.parent, code, ignore_errors=ignore_errors)
    return code
def handle(self):
    request = self._request
    path = request.path
    handler = None
    args = []
    for spec in self.handlers:
        match = spec.regex.match(path)
        if match:
            handler = spec.handler_class(self, request)
            args = [unicode(urllib.unquote_plus(utf8(m)), "utf-8") for m in match.groups()]
            break
    if not handler:
        handler = ErrorHandler(self, request, status_code=404)
    handler._execute(*args)
    return handler
def __init__(self, name, id, title, score="id", condition_fields=None, prefix_index_enable=True, exts=None, **kwargs): if isinstance(exts, dict): kwargs.update(exts) self.name = name self.title = utf8(title) self.id = id self.score = score self.exts = kwargs self.condition_fields = condition_fields if condition_fields and isinstance( condition_fields, list) else [] self.prefix_index_enable = prefix_index_enable
def __getitem__(self, i):
    if type(i) is unicode:
        i = utf8(i)
    child = self.children().find(Code.cid == i).one()
    if not child:
        # first try this
        childids = Select(Code.id, Code.parent_id == self.id)
        child = Code.find(Code.cid == i, Code.parent_id.is_in(childids)).one()
        if child:
            return child
        for child in self.children():
            c = None
            try:
                c = child[i]
            except:
                pass
            if c:
                return c
        raise IndexError("Child code not found with cid {}".format(i))
    else:
        return child
def utf8_join_flatten(items):
    return "".join(utf8(item) for item in flatten(items))
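# The utf8()/util.utf8() helper is used throughout this listing but never defined in it.
# A minimal Python 2 sketch of the conventional implementation (an assumption, not
# necessarily what these projects actually ship):
#
#   def utf8(value):
#       """Encode unicode to a UTF-8 byte string; pass other values through unchanged."""
#       if isinstance(value, unicode):
#           return value.encode("utf-8")
#       return value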
def processAct(self, actcid, parent):
    log.info('Processing act: %s' % actcid)
    soup = self.getActSoup(actcid)
    act = self.findOrCreateAct(parent.released, actcid, parent.rev)
    act.parent = parent
    act.cid = actcid
    act.released = parent.released
    act.rev = parent.rev
    act.depth = 1
    act.pre = Text.make(soup.title.text)
    act.text = Text.make(soup.select("section.intro")[0].text)
    act.meta = utf8(soup.select("p#assentedDate")[0].text.rpartition('.')[0])
    doc = soup.select("div.docContents div")[0]

    # so much easier to use the CSS selector
    #sections = [i['id'] for i in doc.select("[id]") if i['id'].startswith('h-')]
    id_prefix = 'h-'
    sections = [i['id'] for i in doc.select('[id^={}]'.format(id_prefix))]
    classAndTag = lambda o: isinstance(o, Tag) and o.has_attr('class')

    if sections:
        for secid in progress.bar(sections, label=act.cid):
            sec = self.findOrCreateSection(act.released, secid, act)
            soup = doc.select("[id=%s]" % secid)[0]
            sec.pre = Text.make(soup.text)
            sec.cid = secid
            sec.depth = 2
            sec.parent = act
            sec.released = act.released
            sec.rev = act.rev
            stop = False
            sib = soup.nextSibling
            content = ""
            for t in soup.select(".wb-invisible"):
                t.clear()
            while not stop:
                if classAndTag(sib):
                    if sib.has_attr('id') and sib['id'].startswith('h-'):
                        stop = True
                    elif sib.name == 'section':
                        stop = True
                    elif any(c in ['Definition', 'Section', 'MarginalNote', 'ProvisionList',
                                   'Part', 'Subheading', 'MarginalNoteDefinedTerm',
                                   'ContinuedSectionSubsection', 'Oath'] for c in sib['class']):
                        content += sib.text
                    elif sib['class'][0].startswith('indent'):
                        content += sib.text
                    elif sib['class'][0] == 'HistoricalNote':
                        sec.meta = utf8(sib.text)
                    elif sib['class'][0] in ['PITLink', 'nif']:
                        pass
                    else:
                        log.info('Unhandled case in parsing section %s/%s' % (act.cid, secid))
                        log.debug(sib.name)
                        log.debug(sib.attrs)
                if not sib or not sib.nextSibling:
                    stop = True
                if not stop:
                    sib = sib.nextSibling
            sec.text = Text.make(content)
            sec.stored = now()
        schedules = soup.select('div [class=Schedule]')
        post = ''
        for sched in schedules:
            post += sched.text
        act.post = Text.make(post)
        act.stored = now()
    else:
        # alternative section method:
        # for this method we switch to the XML version and pull identifying information
        # out of the code= attribute. Anecdotally, this seems to need to be done for
        # very small acts
        log.info('Switching to alternate section method')
        soup = self.getActXMLSoup(act.cid)
        sections = soup.select("section[code^=se=]")
        for section in sections:
            try:
                secid = section['code'].replace('=', '-').replace('"', '')
                pre = ''
                pre = section.label.text + ' ' if section.label else pre
                pre = pre + section.marginalnote.text if section.marginalnote else pre
                text = section.select_one('text').text
            except:
                log.warn('ERROR in alternate parsing method for {}.{}'.format(act.cid, secid))
                raise
            if 'repealed' in text.lower():
                pass
            else:
                sec = self.findOrCreateSection(act.released, secid, act)
                sec.setPre(pre)
                sec.setText(text)
                sec.parent = act
                sec.depth = 2
                sec.released = act.released
                sec.rev = act.rev
                sec.cid = secid
    act.analyze()
    store.commit()
    return act
def str(length=None, maxlen=None):
    """An arbitrary string. UTF-8 encoded."""
    while True:
        yield _util.utf8(_str(length, maxlen))
def query(name, text, offset=0, limit=10, sort_field='id', conditions=None):
    """Search the index `name` for `text`, optionally filtered by `conditions`."""
    conditions = conditions if isinstance(conditions, dict) and conditions else {}
    tm = time.time()
    result = []
    # if there is neither search text nor any condition, return [] right away
    if not text.strip() and not conditions:
        return result
    text = utf8(text.strip())
    splited_words = split_words(text)
    words = [mk_sets_key(name, word) for word in splited_words]
    if conditions:
        condition_keys = [mk_condition_key(name, c, utf8(conditions[c])) for c in conditions]
        # add the condition keys to the keyword set so they take part in the sinterstore search
        words += condition_keys
    else:
        condition_keys = []
    if not words:
        return result
    temp_store_key = "tmpinterstore:%s" % "+".join(words)
    if len(words) > 1:
        if not util.redis.exists(temp_store_key):
            # intersect the word sets and store the result under a temporary key
            util.redis.sinterstore(temp_store_key, words)
            # expire the temporary result after one day
            util.redis.expire(temp_store_key, 86400)
        # pinyin search
        if util.pinyin_match:
            splited_pinyin_words = split_pinyin(text)
            pinyin_words = [mk_sets_key(name, w) for w in splited_pinyin_words]
            pinyin_words += condition_keys
            temp_sunion_key = "tmpsunionstore:%s" % "+".join(words)
            temp_pinyin_store_key = "tmpinterstore:%s" % "+".join(pinyin_words)
            # intersect the pinyin word sets
            util.redis.sinterstore(temp_pinyin_store_key, pinyin_words)
            # merge the Chinese and pinyin search results
            util.redis.sunionstore(temp_sunion_key, [temp_store_key, temp_pinyin_store_key])
            # expire the temporary results after one day
            util.redis.expire(temp_pinyin_store_key, 86400)
            util.redis.expire(temp_sunion_key, 86400)
            temp_store_key = temp_sunion_key
    else:
        temp_store_key = words[0]
    # pull out as many ids as requested
    ids = util.redis.sort(temp_store_key, start=offset, num=limit,
                          by=mk_score_key(name, "*"), desc=True)
    result = hmget(name, ids, sort_field=sort_field)
    logging.debug("{}:\"{}\" | Time spent:{}s".format(name, text, time.time() - tm))
    return result
def utf8_headers(headers):
    _headers = {}
    for key, value in headers.items():
        _headers[util.utf8(key)] = util.utf8(value)
    return _headers
def complete(name, keyword, limit=10, conditions=None):
    """Prefix-match completion for `keyword` in the index `name`."""
    conditions = conditions if isinstance(conditions, dict) and conditions else {}
    if not keyword and not conditions:
        logging.debug("no word and conditions")
        return []
    keyword = utf8(keyword.strip())
    prefix_matchs = []
    # This is not random, try to get replies < MTU size
    rangelen = util.complete_max_length
    prefix = keyword.lower()
    key = mk_complete_key(name)
    start = util.redis.zrank(key, prefix)
    if start:
        count = limit
        max_range = start + (rangelen * limit) - 1
        entries = util.redis.zrange(key, start, max_range)
        while len(prefix_matchs) <= count:
            start += rangelen
            if not entries or len(entries) == 0:
                break
            for entry in entries:
                minlen = min(len(entry), len(prefix))
                if entry[0:minlen] != prefix[0:minlen]:
                    count = len(prefix_matchs)
                    break
                if entry[-1] == "*" and len(prefix_matchs) != count:
                    match = entry[:-1]
                    if match not in prefix_matchs:
                        prefix_matchs.append(match)
            entries = entries[start:max_range]
    # build the special set key names for the matched words
    words = []
    for word in prefix_matchs:
        words.append(mk_sets_key(name, word))
    # build the condition keys; unlike query they are not appended to words, because
    # complete takes the union of the word sets, while condition_keys must be intersected with it
    condition_keys = []
    if conditions:
        for c in conditions:
            condition_keys.append(mk_condition_key(name, c, utf8(conditions[c])))
    # search by words
    temp_store_key = "tmpsunionstore:%s" % "+".join(words)
    if len(words) == 0:
        logging.info("no words")
    elif len(words) > 1:
        if not util.redis.exists(temp_store_key):
            # union the word sets and store the result under a temporary key
            util.redis.sunionstore(temp_store_key, words)
            # expire the temporary result after one day
            util.redis.expire(temp_store_key, 86400)
    else:
        temp_store_key = words[0]
    # if there are conditions, combine them with the word results here
    if condition_keys:
        if not words:
            condition_keys += temp_store_key
        temp_store_key = "tmpsinterstore:%s" % "+".join(condition_keys)
        if not util.redis.exists(temp_store_key):
            util.redis.sinterstore(temp_store_key, condition_keys)
            util.redis.expire(temp_store_key, 86400)
    # pull out as many ids as requested
    ids = util.redis.sort(temp_store_key, start=0, num=limit,
                          by=mk_score_key(name, "*"), desc=True)
    if not ids:
        return []
    return util.hmget(name, ids)
def scrape(self):
    if not self.rp:
        r = Cache.get(self.nation.cfg['entrypoint'])
        soup = BS(str(r))
        # find current release point
        log.info("No release point specified, retrieving latest...")
        # this failed fantastically - we'll get the RP from the zipurl
        #self.rp = utf8(soup.findAll('h3', attrs={'class': 'releasepointinformation'})[0].text.split()[-1])
        # find the download url
        self.zipurl = self.nation.cfg['entrypoint'].rpartition('/')[0] + '/' + soup.findAll('a', title='All USC Titles in XML')[0]['href']
        # new way to set the rp using the zipurl's filename
        self.rp = utf8(self.zipurl.rpartition('@')[-1].partition('.')[0])
        log.info("Found release point %s" % self.rp)
    else:
        log.info('Using specified release point %s...' % self.rp)
        # don't actually need this
        # rpurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/usc-rp@%s.htm' % (tuple(self.rp.split()) + (self.rp,))
        self.zipurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/xml_uscAll@%s.zip' % (tuple(self.rp.split('-')) + (self.rp,))
    log.debug("Using zipurl: %s" % self.zipurl)

    class FileNotThere(Exception):
        pass

    class XMLNotThere(Exception):
        pass

    class AllGood(Exception):
        pass

    filename = self.zipurl.rpartition('/')[-1]
    xmldir = self._workdir + os.sep + 'xml' + os.sep

    # check to see if we have xml that works
    # if we don't, check to see if we have a zip file
    # if we don't, download it
    # if we do, extract it
    # check the xml again, if it's good, proceed
    # if it's not, error out
    try:
        assert os.path.exists(xmldir + 'usc01.xml')
        soup = BS(open(xmldir + os.sep + 'usc01.xml', 'r').read())
        xmlrp = soup.find('docpublicationname').text.split('@')[-1]
        # old way to get rp, the new way is much better
        # xmlrp = soup.title.first("note", topic="miscellaneous").text.split()[-1]
        if xmlrp == self.rp:
            raise AllGood
        else:
            raise XMLNotThere
    except (XMLNotThere, AssertionError):
        # delete directory if it exists
        if os.path.exists(xmldir):
            shutil.rmtree(xmldir)
        # if there's no zip file, download it
        if not os.path.exists(self._workdir + os.sep + filename):
            log.info('No zipfile found for release point, downloading...')
            self.downloadFile(self.zipurl, filename)
        # now we should have a zipfile and no existing xmldir
        log.info('Extracting file %s...' % filename)
        zf = ZipFile(self._workdir + os.sep + filename, 'r')
        # older release points do not have an interior xml/ dir
        if not all([n.startswith('xml/') for n in zf.namelist()]):
            zf.extractall(xmldir)
        else:
            zf.extractall(self._workdir)
        # double check the xml now...
        assert os.path.exists(xmldir + 'usc01.xml')
        # it may be problematic to rely on the RP information in the XML documents provided
        # rp 113-21 (the earliest presently available) does not include this in the
        # docpublicationname meta tag
        #soup = BS(open(xmldir + os.sep + 'usc01.xml', 'r').read())
        #xmlrp = soup.find('docpublicationname').text.split('@')[-1]
        #if xmlrp != self.rp:
        #    raise XMLNotThere('XML did not check out after extraction.')
    except AllGood:
        pass
    except:
        raise

    log.info('All checks passed...')
    xf = os.listdir(xmldir)
    root = self.findOrCreateRoot()
    xf = [xmldir + f for f in xf if f.endswith('.xml')]
    xf.sort()
    log.info("Processing %i files..." % len(xf))
    self.bar = progress.Bar(label='US', expected_size=1000 * len(xf))
    self.progress(i=len(xf))
    for fn in xf:
        self.processFile(fn, root)
        self.progress(rollup=1000)
    log.info('Analyzing code...')
    self.progress(label="Analyzing")
    root.analyze(commit=True, bar=self.bar)
    store.commit()
    log.info('Scrape completed.')
def complete(name, keyword, limit=10, conditions=None):
    """complete: prefix match search keyword
    limit: max match count"""
    conditions = conditions if isinstance(conditions, dict) and conditions else {}
    if not keyword and not conditions:
        logging.debug("no word and conditions")
        return []
    keyword = utf8(keyword.strip())
    prefix_matchs = []
    # This is not random, try to get replies < MTU size
    rangelen = util.complete_max_length
    prefix = keyword.lower()
    key = mk_complete_key(name)
    start = util.redis.zrank(key, prefix)
    if start:
        count = limit
        max_range = start + (rangelen * limit) - 1
        entries = util.redis.zrange(key, start, max_range)
        while len(prefix_matchs) <= count:
            start += rangelen
            if not entries or len(entries) == 0:
                break
            # entries are sorted, so once an entry no longer shares the prefix we can stop
            for entry in entries:
                minlen = min(len(entry), len(prefix))
                # this entry breaks consistency with the prefix
                if entry[0:minlen] != prefix[0:minlen]:
                    count = len(prefix_matchs)
                    break
                # found a matched entry
                if entry[-1] == "*" and len(prefix_matchs) != count:
                    match = entry[:-1]
                    if match not in prefix_matchs:
                        prefix_matchs.append(match)
            entries = entries[start:max_range]
    # build the special set key names for the matched words
    words = [mk_sets_key(name, word) for word in prefix_matchs]
    # build the condition keys; unlike query they are not appended to words, because
    # complete unions the word sets, while condition_keys must be intersected with that union
    condition_keys = [mk_condition_key(name, c, utf8(conditions[c])) for c in conditions]
    # search by words
    temp_store_key = "tmpsunionstore:%s" % "+".join(words)
    if len(words) == 0:
        logging.info("no words")
    elif len(words) > 1:
        if not util.redis.exists(temp_store_key):
            # union the word sets and store the result under a temporary key
            util.redis.sunionstore(temp_store_key, words)
            # expire the temporary result after one day
            util.redis.expire(temp_store_key, 86400)
    else:
        temp_store_key = words[0]
    # if there are conditions, combine them with the word results here
    if condition_keys:
        if not words:
            condition_keys += temp_store_key
        temp_store_key = "tmpsinterstore:%s" % "+".join(condition_keys)
        if not util.redis.exists(temp_store_key):
            util.redis.sinterstore(temp_store_key, condition_keys)
            util.redis.expire(temp_store_key, 86400)
    # pull out as many ids as requested
    ids = util.redis.sort(temp_store_key, start=0, num=limit,
                          by=mk_score_key(name, "*"), desc=True)
    if not ids:
        return []
    return hmget(name, ids)
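# Hypothetical calls showing how the query() and complete() helpers above might be used,
# assuming an index named "books" has already been populated via save(); the index name,
# field names, and values are illustrative assumptions only.
#
#   results = query("books", u"linux kernel", offset=0, limit=10)
#   suggestions = complete("books", u"lin", limit=5, conditions={"category": "tech"})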
def write(self, chunk):
    if isinstance(chunk, dict):
        chunk = json_encode(chunk)
        self.set_header("Content-Type", "application/json; charset=UTF-8")
    chunk = utf8(chunk)
    self._write_buffer.append(chunk)
def set_header(self, key, value):
    self.headers[util.utf8(key)] = util.utf8(value)
def query(name, text, offset=0, limit=10, sort_field='id', conditions=None):
    """Search the index `name` for `text`, optionally filtered by `conditions`."""
    conditions = conditions if isinstance(conditions, dict) and conditions else {}
    tm = time.time()
    result = []
    # if there is neither search text nor any condition, return [] right away
    if not text.strip() and not conditions:
        return result
    text = utf8(text.strip())
    splited_words = split_words(text)
    words = []
    for word in splited_words:
        words.append(mk_sets_key(name, word))
    condition_keys = []
    if conditions:
        for c in conditions:
            condition_keys.append(mk_condition_key(name, c, utf8(conditions[c])))
        # add the condition keys to the keyword set so they take part in the sinterstore search
        words += condition_keys
    if not words:
        return result
    temp_store_key = "tmpinterstore:%s" % "+".join(words)
    if len(words) > 1:
        if not util.redis.exists(temp_store_key):
            # intersect the word sets and store the result under a temporary key
            util.redis.sinterstore(temp_store_key, words)
            # expire the temporary result after one day
            util.redis.expire(temp_store_key, 86400)
        # pinyin search
        if util.pinyin_match:
            splited_pinyin_words = split_pinyin(text)
            pinyin_words = []
            for w in splited_pinyin_words:
                pinyin_words.append(mk_sets_key(name, w))
            pinyin_words += condition_keys
            temp_sunion_key = "tmpsunionstore:%s" % "+".join(words)
            temp_pinyin_store_key = "tmpinterstore:%s" % "+".join(pinyin_words)
            # intersect the pinyin word sets
            util.redis.sinterstore(temp_pinyin_store_key, pinyin_words)
            # merge the Chinese and pinyin search results
            util.redis.sunionstore(temp_sunion_key, [temp_store_key, temp_pinyin_store_key])
            # expire the temporary results after one day
            util.redis.expire(temp_pinyin_store_key, 86400)
            util.redis.expire(temp_sunion_key, 86400)
            temp_store_key = temp_sunion_key
    else:
        temp_store_key = words[0]
    # pull out as many ids as requested
    ids = util.redis.sort(temp_store_key, start=offset, num=limit,
                          by=mk_score_key(name, "*"), desc=True)
    result = util.hmget(name, ids, sort_field=sort_field)
    logging.debug("%s:\"%s\" | Time spent:%ss" % (name, text, time.time() - tm))
    return result