Example #1
    def parse_req(self, response):
        logger.info("{} Url {}".format(get_function_name(), response.url))

        try:
            user_id = response.meta['user_id']
            page = response.meta['page']

            # Extract the JSON payload from the JSONP-wrapped body
            body = response.body
            body = body.decode('utf-8')

            beg = body.find("(")
            body = body[beg + 1:-1]

            dat = json.loads(body)
            doing_array = dat['doingArray']
            contents = ""

            # Parse the status entries
            for doing in doing_array:
                cont = doing['content']
                cont = re.sub(self.pt_html_tag, " ", cont)
                cont += "\r\n"
                contents += cont

            if len(contents) == 0:
                logger.info("# END  USER_ID:    {}  ,PAGE:  {}".format(user_id, page))
                return

            # Save the statuses as an item
            item = RenrenStatusItem()
            item['url'] = response.url
            item['field'] = time.strftime("%Y%m%d%H")
            item['title'] = user_id + "_" + page
            item['content'] = contents
            logger.info("item :     ", item)
            yield item

            # This page parsed successfully, so request the next page
            if int(page) < self.MAX_PAGE_SIZE:
                request_url = r'http://status.renren.com/GetSomeomeDoingList.do?userId={}' \
                              r'&curpage={}&_jcb=jQuery111108476907948285053_1555050827422' \
                              r'&requestToken=-1639220190&_rtk=6a0c7a7c&_=1555050827426'.format(user_id, int(page) + 1)
                yield Request(url=request_url, cookies=self.default_cookie, headers=self.default_headers, method='GET',
                              callback=self.parse_req, meta={'user_id': user_id, 'page': str(int(page) + 1)})
        except Exception as e:
            traceback.print_exc()
            print("{} request URL {} failure Reason  {}".format(get_function_name(), response.url, e))
            logger.error("{} request URL {} failure Reason  {}".format(get_function_name(), response.url, e))
Example #2
    def parse_pub_total(self, response):
        """
        Parse the follower total and request each page of the list
        :param response:
        :return:
        """
        logger.info("{} Url {}".format(get_function_name(), response.url))

        user_id = response.meta['user_id']

        pub_total = response.xpath(
            "//li[@class='select']/span/text()").extract_first()
        pub_total = int(pub_total)
        offset_total = math.ceil(pub_total / 10.0)

        for i in range(0, offset_total + 1):
            offset = int(i * 10)
            request_url = self.gen_pub_list(user_id=user_id,
                                            visit_id=self.default_visit_id,
                                            offset=offset)
            yield Request(url=request_url,
                          cookies=self.default_cookie,
                          headers=self.default_headers,
                          method='GET',
                          callback=self.parse_follower_page)
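
For reference, the paging arithmetic above turns a follower total into request offsets in steps of 10. A quick illustration with an arbitrary total:

    import math

    pub_total = 43                               # arbitrary example total
    offset_total = math.ceil(pub_total / 10.0)   # -> 5

    offsets = [i * 10 for i in range(0, offset_total + 1)]
    print(offsets)                               # [0, 10, 20, 30, 40, 50]

Note that range(0, offset_total + 1), as written in the spider, iterates one offset past the last populated page; that final request will normally come back with an empty user list.
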
Example #3
    def parse_blog_content(self, response):
        """
        Parse the blog post content
        :param response:
        :return:
        """
        logger.info("{} Url {}".format(get_function_name(), response.url))
        try:
            user_id = response.meta['user_id']
            blog_id = response.meta['blog_id']

            content = response.xpath("//div[@id='blogContent']/descendant-or-self::text()").extract()
            if not content:
                content = response.xpath("//p/descendant-or-self::text()").extract()
            content = self.pretty.pretty_contents(content)
            content = "\r\n".join(content)
            # Return early if the content is empty
            if not content:
                return

            # Save the blog post
            item = RenrenBlogItem()
            item['url'] = response.url
            item['spider'] = self.name
            item['field'] = time.strftime("%Y%m%d%H")
            item['title'] = user_id + "_" + blog_id
            item['content'] = content
            yield item

            # Request the blog post's comments
            request_url = r'http://comment.renren.com/comment/xoa2?limit=20' \
                          r'&desc=true&offset=0&replaceUBBLarge=true&type=blog' \
                          r'&entryId={}&entryOwnerId={}' \
                          r'&&requestToken=-1639220190&_rtk=6a0c7a7c'.format(blog_id, user_id)
            yield Request(url=request_url, cookies=self.default_cookie, headers=self.default_headers,
                          method='GET',
                          callback=self.parse_blog_comment, meta={'user_id': user_id, 'blog_id': blog_id})

        except Exception as e:
            traceback.print_exc()
            print("{} request URL {} failure Reason  {}".format(get_function_name(), response.url, e))
            logger.error("{} request URL {} failure Reason  {}".format(get_function_name(), response.url, e))
Example #4
    def parse_blog_page(self, response):
        """
        Parse one page of the blog list
        :param response:
        :return:
        """
        logger.info("{} Url {}".format(get_function_name(), response.url))

        try:
            user_id = response.meta['user_id']
            page = response.meta['page']
            dat = json.loads(response.body)
            dat = dat['data']
            if dat is None or len(dat) == 0:
                return

            for blog_info in dat:
                blog_id = int(blog_info['id'])
                # Request the blog post content
                request_url = 'http://blog.renren.com/blog/{}/{}?bfrom=01020110200'.format(user_id, blog_id)
                yield Request(url=request_url, cookies=self.default_cookie, headers=self.default_headers,
                              method='GET',
                              callback=self.parse_blog_content,
                              meta={'user_id': user_id,
                                    'page': str(page),
                                    'blog_id': str(blog_id)})

            # Request the next page of the blog list
            page = int(page) + 1
            request_url = r'http://blog.renren.com/blog/{}/blogs?categoryId= &curpage={}&null&requestToken=-1639220190&_rtk=6a0c7a7c' \
                .format(user_id, page)
            yield Request(url=request_url, cookies=self.default_cookie, headers=self.default_headers,
                          method='GET',
                          callback=self.parse_blog_page, meta={'user_id': user_id, 'page': str(page)})

        except Exception as e:
            traceback.print_exc()
            print("{} request URL {} failure Reason  {}".format(get_function_name(), response.url, e))
            logger.error("{} request URL {} failure Reason  {}".format(get_function_name(), response.url, e))
Example #5
    def parse_blog_comment(self, response):
        """
        Parse the blog post comments
        :param response:
        :return:
        """
        logger.info("{} Url {}".format(get_function_name(), response.url))

        try:
            dat = json.loads(response.body)

            # Parse the comments
            comments = dat['comments']
            contents = []
            for comment in comments:
                content = comment['content']
                content = re.sub(self.pt_html_tag, " ", content)
                contents.append(content)
            contents = self.pretty.pretty_contents(contents)
            contents = "\r\n".join(contents)
            if len(contents) == 0:
                return

            # Save the comments as an item
            user_id = response.meta['user_id']
            blog_id = response.meta['blog_id']
            item = RenrenBlogItem()
            item['spider'] = self.name
            item['url'] = response.url
            item['field'] = time.strftime("%Y%m%d%H")
            item['title'] = user_id + "_" + blog_id + "_comments"
            item['content'] = contents
            logger.info("comment item:  ", item)
            yield item

        except Exception as e:
            traceback.print_exc()
            print("{} request URL {} failure Reason  {}".format(get_function_name(), response.url, e))
            logger.error("{} request URL {} failure Reason  {}".format(get_function_name(), response.url, e))
Example #6
    def request_sub_list(self, *, user_id):
        """
        Request the follow (subscription) list page
        :param user_id:
        :return:
        """
        logger.info('{} sub {}'.format(get_function_name(), user_id))
        # meta = {
        #     'user_id': user_id,
        # }
        sub_list_url = r'http://follow.renren.com/list/{}/sub/v7'.format(
            user_id)

        return Request(url=sub_list_url,
                       cookies=self.default_cookie,
                       headers=self.default_headers,
                       method='GET',
                       callback=self.parse_sub_total,
                       meta={
                           'user_id': user_id,
                       })
Example #7
    def parse_follower_page(self, response):
        """
        Parse the returned JSON follow list
        :param response:
        :return:
        """
        logger.info("{} Url {}".format(get_function_name(), response.url))
        try:
            datas = json.loads(response.body)
            if 'publisherCount' in datas['data']:
                logger.info("# 1 current process publish list")
            elif 'subscriberCount' in datas['data']:
                logger.info("# 2current process subscribe list")
            else:
                logger.error("#3 unexpected response type")

            users = datas['data']['userList']
            # name_list = []
            for user in users:
                user_id = user['id']
                user_name = user['name']

                # Deduplicate user ids with a Redis bitmap
                if redis_helper.bitmap_contains(key=USER_ID_BITS,
                                                offset=int(user_id)):
                    logger.info("exists user_id ", user_id)
                    continue
                redis_helper.bitmap_set(key=USER_ID_BITS, offset=int(user_id))

                # Flush cached ids to a file once the cache is full
                self.user_id_list_cache.append(str(user_id))
                if len(self.user_id_list_cache) > self.cache_size:
                    file_name = self.gen_file_name()
                    with open(file_name, 'a+') as f:
                        f.write("\r\n".join(self.user_id_list_cache))
                        f.write("\r\n")
                        self.user_id_list_cache.clear()

                # Continue crawling from this user id
                # yield self.request_pub_list(user_id=user_id)

                # yield self.request_sub_list(user_id=user_id)

                # meta = {
                #     'user_id': user_id,
                # }
                sub_list_url = r'http://follow.renren.com/list/{}/sub/v7'.format(
                    user_id)

                yield Request(url=sub_list_url,
                              cookies=self.default_cookie,
                              headers=self.default_headers,
                              method='GET',
                              callback=self.parse_sub_total,
                              meta={
                                  'user_id': user_id,
                              })

                pub_list_url = r'http://follow.renren.com/list/{}/pub/v7'.format(
                    user_id)

                yield Request(url=pub_list_url,
                              cookies=self.default_cookie,
                              headers=self.default_headers,
                              method='GET',
                              callback=self.parse_pub_total,
                              meta={
                                  'user_id': user_id,
                              })

                # name_list.append(user_name)
            # print('get new pub new: ', name_list)

            # if len(name_list) == 0:
            #     return
        except Exception as e:
            logger.error("{} request URL {} failure Reason  {}".format(
                get_function_name(), response.url, e))
            logger.error("traceback {}".format(traceback.print_exc()))