def oninit(self, url):
    """Download a user's index page and extract the profile summary.

    ``url`` is appended to ``self.url_suf`` and fetched through
    ``self.downloader``.  The ``CONFIG['...']='...'`` assignments embedded
    in the page are scraped for the user fields, the follow/fans/weibo
    counters are parsed with ``Parser.parse_index`` and then passed to
    ``self.validater.validate_nums``.

    Side effect: ``self.crawl_info`` is set to the first value returned by
    ``validate_nums``.

    Returns:
        A dict with the keys ``uid``, ``page_id``, ``nick``, ``domain``,
        ``location``, ``follow_num``, ``fans_num`` and ``wb_num`` (the
        three counters converted with ``int``), or ``None`` when the
        download fails, a ``CONFIG`` field is missing from the page, or
        the counters cannot be parsed.
    """
    res = self.downloader.download(self.url_suf + url)
    if not res:
        return None
    cont = res.content

    # Scrape every CONFIG field up front.  A page without them (presumably
    # a login/redirect or error page -- TODO confirm) is reported as a
    # failure with None, like the other failure paths, instead of raising
    # AttributeError on `None.group(1)`.
    fields = {}
    for key in ('oid', 'onick', 'page_id', 'domain', 'location'):
        match = re.search(r"CONFIG\['%s'\]='(.*?)'" % key, cont)
        if match is None:
            return None
        fields[key] = match.group(1)

    # Only byte strings need decoding; a value that is already text must
    # be left alone.
    nick = fields['onick']
    if isinstance(nick, bytes):
        nick = nick.decode('utf-8')

    # page_id is needed for the ajax requests that fetch the weibo posts.
    page_id = fields['page_id']

    nums = Parser.parse_index(cont)
    if not nums:
        return None
    # The third element (weibo count) is superseded by the value returned
    # from validate_nums below; unpacking still enforces three elements.
    follow_num, fans_num, _ = nums

    # Check whether the user's follow / fans / weibo counts have changed.
    # If they changed the stored info is updated, and if the weibo count
    # changed the new posts have to be crawled.  Per the original author's
    # note, validate_nums returns the number of weibo posts to crawl.
    crawl_info, wb_num = self.validater.validate_nums(page_id, nums)
    self.crawl_info = crawl_info

    return {
        'uid': fields['oid'],
        'page_id': page_id,
        'nick': nick,
        'domain': fields['domain'],
        'location': fields['location'],
        'follow_num': int(follow_num),
        'fans_num': int(fans_num),
        'wb_num': int(wb_num),
    }