def parse_req(self, response):
    """Parse one JSONP page of a user's status feed.

    Yields a ``RenrenStatusItem`` holding the concatenated, tag-stripped
    status texts for this page, then requests the next page until
    ``MAX_PAGE_SIZE`` is reached.

    :param response: JSONP response; ``meta`` carries 'user_id' and 'page'.
    """
    logger.info("{} Url {}".format(get_function_name(), response.url))
    try:
        user_id = response.meta['user_id']
        page = response.meta['page']
        # Extract the JSON payload from the JSONP body: drop the
        # "callback(" prefix and the trailing ")".
        body = response.body.decode('utf-8')
        beg = body.find("(")
        dat = json.loads(body[beg + 1:-1])
        doing_array = dat['doingArray']
        # Collect the status texts, replacing HTML tags with spaces.
        # BUG FIX: removed a stray debug print(contents) that dumped the
        # growing buffer on every loop iteration.
        contents = ""
        for doing in doing_array:
            cont = re.sub(self.pt_html_tag, " ", doing['content'])
            contents += cont + "\r\n"
        # An empty page marks the end of this user's feed.
        if len(contents) == 0:
            logger.info("# END USER_ID: {} ,PAGE: {}".format(user_id, page))
            return
        # Save this page of statuses as one item.
        item = RenrenStatusItem()
        item['url'] = response.url
        item['field'] = time.strftime("%Y%m%d%H")
        item['title'] = user_id + "_" + page
        item['content'] = contents
        # BUG FIX: the old call logger.info("item : ", item) passed the
        # item as a %-format argument with no placeholder; use lazy %s.
        logger.info("item : %s", item)
        yield item
        # This page parsed successfully, so request the next page while
        # under the page cap.
        if int(page) < self.MAX_PAGE_SIZE:
            request_url = r'http://status.renren.com/GetSomeomeDoingList.do?userId={}' \
                          r'&curpage={}&_jcb=jQuery111108476907948285053_1555050827422' \
                          r'&requestToken=-1639220190&_rtk=6a0c7a7c&_=1555050827426'.format(user_id, int(page) + 1)
            yield Request(url=request_url,
                          cookies=self.default_cookie,
                          headers=self.default_headers,
                          method='GET',
                          callback=self.parse_req,
                          meta={'user_id': user_id, 'page': str(int(page) + 1)})
    except Exception as e:
        traceback.print_exc()
        print("{} request URL {} failure Reason {}".format(get_function_name(), response.url, e))
        logger.error("{} request URL {} failure Reason {}".format(get_function_name(), response.url, e))
def parse_pub_total(self, response):
    """Parse the total number of followed publishers and fan out one
    JSON-list request per 10-entry page.

    :param response: follow-list HTML page; ``meta`` carries 'user_id'.
    """
    logger.info("{} Url {}".format(get_function_name(), response.url))
    uid = response.meta['user_id']
    total_text = response.xpath(
        "//li[@class='select']/span/text()").extract_first()
    page_count = math.ceil(int(total_text) / 10.0)
    # One request per page of 10 entries: offsets 0, 10, ..., page_count*10
    # (the trailing offset pads past the last full page).
    for offset in range(0, int(page_count + 1) * 10, 10):
        list_url = self.gen_pub_list(user_id=uid,
                                     visit_id=self.default_visit_id,
                                     offset=offset)
        yield Request(url=list_url,
                      cookies=self.default_cookie,
                      headers=self.default_headers,
                      method='GET',
                      callback=self.parse_follower_page)
def parse_blog_content(self, response):
    """Parse a blog post's body text, yield it as an item, then request
    the post's comment list.

    :param response: blog page; ``meta`` carries 'user_id' and 'blog_id'.
    """
    logger.info("{} Url {}".format(get_function_name(), response.url))
    try:
        user_id = response.meta['user_id']
        blog_id = response.meta['blog_id']
        content = response.xpath("//div[@id='blogContent']/descendant-or-self::text()").extract()
        # BUG FIX: extract() returns a list (never None), so the original
        # "if content is None" could never trigger this fallback; test
        # truthiness so an empty result falls back to plain <p> text.
        if not content:
            content = response.xpath("//p/descendant-or-self::text()").extract()
        content = self.pretty.pretty_contents(content)
        content = "\r\n".join(content)
        # Nothing to save for an empty post.
        if content is None or len(content) == 0:
            return
        # Save the blog post.
        item = RenrenBlogItem()
        item['url'] = response.url
        item['spider'] = self.name
        item['field'] = time.strftime("%Y%m%d%H")
        item['title'] = user_id + "_" + blog_id
        item['content'] = content
        yield item
        # Request this post's comments.
        request_url = r'http://comment.renren.com/comment/xoa2?limit=20' \
                      r'&desc=true&offset=0&replaceUBBLarge=true&type=blog' \
                      r'&entryId={}&entryOwnerId={}' \
                      r'&&requestToken=-1639220190&_rtk=6a0c7a7c'.format(blog_id, user_id)
        yield Request(url=request_url,
                      cookies=self.default_cookie,
                      headers=self.default_headers,
                      method='GET',
                      callback=self.parse_blog_comment,
                      meta={'user_id': user_id, 'blog_id': blog_id})
    except Exception as e:
        traceback.print_exc()
        print("{} request URL {} failure Reason {}".format(get_function_name(), response.url, e))
        logger.error("{} request URL {} failure Reason {}".format(get_function_name(), response.url, e))
def parse_blog_page(self, response):
    """Parse one JSON page of a user's blog list: request each blog's
    content page, then request the next list page.

    :param response: JSON blog-list response; ``meta`` carries 'user_id'
        and 'page'.
    """
    logger.info("{} Url {}".format(get_function_name(), response.url))
    try:
        user_id = response.meta['user_id']
        page = response.meta['page']
        blog_list = json.loads(response.body)['data']
        # An empty page means we have walked past the last blog.
        if blog_list is None or len(blog_list) == 0:
            return
        # Request every blog's content page.
        for entry in blog_list:
            bid = int(entry['id'])
            content_url = 'http://blog.renren.com/blog/{}/{}?bfrom=01020110200'.format(user_id, bid)
            yield Request(url=content_url,
                          cookies=self.default_cookie,
                          headers=self.default_headers,
                          method='GET',
                          callback=self.parse_blog_content,
                          meta={'user_id': user_id,
                                'page': str(page),
                                'blog_id': str(bid)})
        # Request the following list page.
        next_page = int(page) + 1
        next_url = r'http://blog.renren.com/blog/{}/blogs?categoryId= &curpage={}&null&requestToken=-1639220190&_rtk=6a0c7a7c' \
            .format(user_id, next_page)
        yield Request(url=next_url,
                      cookies=self.default_cookie,
                      headers=self.default_headers,
                      method='GET',
                      callback=self.parse_blog_page,
                      meta={'user_id': user_id, 'page': str(next_page)})
    except Exception as e:
        traceback.print_exc()
        print("{} request URL {} failure Reason {}".format(get_function_name(), response.url, e))
        logger.error("{} request URL {} failure Reason {}".format(get_function_name(), response.url, e))
def parse_blog_comment(self, response):
    """Parse a blog post's JSON comment list and yield the comment texts
    as one item.

    :param response: JSON comment response; ``meta`` carries 'user_id'
        and 'blog_id'.
    """
    logger.info("{} Url {}".format(get_function_name(), response.url))
    try:
        dat = json.loads(response.body)
        # Strip HTML tags from each comment body.
        comments = dat['comments']
        contents = []
        for comment in comments:
            content = re.sub(self.pt_html_tag, " ", comment['content'])
            contents.append(content)
        contents = self.pretty.pretty_contents(contents)
        contents = "\r\n".join(contents)
        # Nothing to save.
        if len(contents) == 0:
            return
        # Save the comments as one item.
        user_id = response.meta['user_id']
        blog_id = response.meta['blog_id']
        item = RenrenBlogItem()
        item['spider'] = self.name
        item['url'] = response.url
        item['field'] = time.strftime("%Y%m%d%H")
        item['title'] = user_id + "_" + blog_id + "_comments"
        item['content'] = contents
        # BUG FIX: the old call logger.info("comment item: ", item) passed
        # the item as a %-format argument with no placeholder; use lazy %s.
        logger.info("comment item: %s", item)
        yield item
    except Exception as e:
        traceback.print_exc()
        print("{} request URL {} failure Reason {}".format(get_function_name(), response.url, e))
        logger.error("{} request URL {} failure Reason {}".format(get_function_name(), response.url, e))
def request_sub_list(self, *, user_id):
    """Build the Request for a user's subscription (followed) list page.

    :param user_id: renren user id (keyword-only).
    :return: Request handled by ``parse_sub_total``.
    """
    logger.info('{} sub {}'.format(get_function_name(), user_id))
    url = r'http://follow.renren.com/list/{}/sub/v7'.format(user_id)
    return Request(url=url,
                   cookies=self.default_cookie,
                   headers=self.default_headers,
                   method='GET',
                   callback=self.parse_sub_total,
                   meta={'user_id': user_id})
def parse_follower_page(self, response):
    """Parse a JSON follow-list page: dedupe user ids against a redis
    bitmap, buffer new ids to disk in batches, and fan out sub/pub list
    requests for each newly seen user.

    :param response: JSON response whose data field holds 'userList'.
    """
    logger.info("{} Url {}".format(get_function_name(), response.url))
    try:
        datas = json.loads(response.body)
        # Classify the list type (informational only; parsing is identical).
        if 'publisherCount' in datas['data']:
            logger.info("# 1 current process publish list")
        elif 'subscriberCount' in datas['data']:
            logger.info("# 2current process subscribe list")
        else:
            logger.error("#3 unexpected response type")
        users = datas['data']['userList']
        for user in users:
            user_id = user['id']
            # Skip ids we have already crawled (redis bitmap dedupe).
            if redis_helper.bitmap_contains(key=USER_ID_BITS, offset=int(user_id)):
                # BUG FIX: the old call passed user_id as a %-format
                # argument with no placeholder; use lazy %s.
                logger.info("exists user_id %s", user_id)
                continue
            redis_helper.bitmap_set(key=USER_ID_BITS, offset=int(user_id))
            # Buffer the new id; flush the buffer to a file once full.
            self.user_id_list_cache.append(str(user_id))
            if len(self.user_id_list_cache) > self.cache_size:
                file_name = self.gen_file_name()
                with open(file_name, 'a+') as f:
                    f.write("\r\n".join(self.user_id_list_cache))
                    f.write("\r\n")
                self.user_id_list_cache.clear()
            # Crawl this user's subscription and publisher lists in turn.
            sub_list_url = r'http://follow.renren.com/list/{}/sub/v7'.format(
                user_id)
            yield Request(url=sub_list_url,
                          cookies=self.default_cookie,
                          headers=self.default_headers,
                          method='GET',
                          callback=self.parse_sub_total,
                          meta={'user_id': user_id})
            pub_list_url = r'http://follow.renren.com/list/{}/pub/v7'.format(
                user_id)
            yield Request(url=pub_list_url,
                          cookies=self.default_cookie,
                          headers=self.default_headers,
                          method='GET',
                          callback=self.parse_pub_total,
                          meta={'user_id': user_id})
    except Exception as e:
        logger.error("{} request URL {} failure Reason {}".format(
            get_function_name(), response.url, e))
        # BUG FIX: traceback.print_exc() returns None, so the old line
        # logged "traceback None"; format_exc() returns the actual text.
        logger.error("traceback %s", traceback.format_exc())