예제 #1
0
    def get_shop_comment(self, url, shopid, offset, page):
        """Crawl one page of a shop's tags and comments, then recurse to the next page.

        Args:
            url: JSON API endpoint for the current page.
            shopid: shop identifier copied onto every yielded item.
            offset: pagination offset; advanced by 10 for the next page.
            page: 1-based page counter (used only for logging).

        Yields:
            A ShopTagItem per entry in 'tags', a ShopCommentItem per entry in
            'comments', followed by the items from all subsequent pages.
        """
        pprint('get_shop_comment offset:{} shopid:{}'.format(offset, shopid))
        self.logger.debug('get_shop_comment offset:{} shopid:{}'.format(offset, shopid))
        # Random pause between requests to avoid triggering anti-crawling.
        sleep(random.randint(1, 5))
        print('开始爬取shop_comment第{}页......,url为{}'.format(page, url))
        self.logger.debug('开始爬取shop_comment第{}页......,url为{}'.format(page, url))
        json_data = html_from_uri(url)
        print(json_data)
        self.logger.debug(json_data)
        dict_data = json.loads(json_data)
        # Tolerate a missing/empty 'tags' key instead of raising KeyError
        # (consistent with parse_shop_comment elsewhere in this file).
        for tag in dict_data.get('tags') or []:
            # Fresh item per tag: mutating and re-yielding a single item lets
            # later iterations clobber data the consumer has not processed yet.
            shop_tag = ShopTagItem()
            shop_tag['id'] = calc_md5(tag)
            shop_tag['shopId'] = shopid
            shop_tag['tag'] = tag.get('tag')
            shop_tag['count'] = tag.get('count')
            shop_tag['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield shop_tag
        if dict_data.get('comments'):
            for comment in dict_data['comments']:
                # Fresh item per comment (see note on shop_tag above).
                shop_comment = ShopCommentItem()
                shop_comment['id'] = calc_md5(comment)
                shop_comment['shopId'] = shopid
                shop_comment['userName'] = comment.get('userName')
                shop_comment['userUrl'] = comment.get('userUrl')
                shop_comment['avgPrice'] = comment.get('avgPrice')
                shop_comment['comment'] = comment.get('comment')
                shop_comment['merchantComment'] = comment.get('merchantComment')
                shop_comment['picUrls'] = comment.get('picUrls')
                shop_comment['commentTime'] = comment.get('commentTime')
                shop_comment['replyCnt'] = comment.get('replyCnt')
                shop_comment['zanCnt'] = comment.get('zanCnt')
                shop_comment['readCnt'] = comment.get('readCnt')
                shop_comment['userLevel'] = comment.get('userLevel')
                shop_comment['userId'] = comment.get('userId')
                shop_comment['uType'] = comment.get('uType')
                shop_comment['star'] = comment.get('star')
                shop_comment['quality'] = comment.get('quality')
                shop_comment['alreadyZzz'] = comment.get('alreadyZzz')
                shop_comment['reviewId'] = comment.get('reviewId')
                shop_comment['menu'] = comment.get('menu')
                shop_comment['did'] = comment.get('did')
                shop_comment['dealEndtime'] = comment.get('dealEndtime')
                shop_comment['anonymous'] = comment.get('anonymous')
                shop_comment['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                yield shop_comment

            offset += 10
            # Bug fix: was self.goods_comment_baseurl (copy-paste from the
            # goods crawler); shop pagination uses shop_comment_baseurl, as
            # parse_shop_comment does.
            next_url = self.shop_comment_baseurl.format(shopid=shopid, offset=offset)
            # Bug fix: this method is a generator, so a bare recursive call
            # only created a generator object and silently dropped every later
            # page; 'yield from' actually drains the next page's items.
            yield from self.get_shop_comment(next_url, shopid, offset, page + 1)
예제 #2
0
    def get_goods_comment(self, url, dealid, offset, page):
        """Crawl one page of a deal's tags and comments, then recurse to the next page.

        Args:
            url: JSON API endpoint for the current page.
            dealid: deal (goods) identifier copied onto every yielded item.
            offset: pagination offset; advanced by 10 for the next page.
            page: 1-based page counter (used only for logging).

        Yields:
            A GoodsTagItem per entry in 'tags', a GoodsCommentItem per entry in
            'list', followed by the items from all subsequent pages.
        """
        pprint('get_goods_comment offset:{} dealid:{}'.format(offset, dealid))
        self.logger.debug('get_goods_comment offset:{} dealid:{}'.format(offset, dealid))
        # Random pause between requests to avoid triggering anti-crawling.
        sleep(random.randint(1, 5))
        print('开始爬取goods_comment第{}页......,url为{}'.format(page, url))
        self.logger.debug('开始爬取goods_comment第{}页......,url为{}'.format(page, url))
        # requests_html already returns the payload as JSON text, so no regex
        # extraction is needed here.
        json_data = html_from_uri(url)
        pprint(json_data)
        self.logger.debug(json_data)
        dict_data = json.loads(json_data)
        # Tolerate a missing/empty 'tags' key instead of raising KeyError.
        for tag in dict_data.get('tags') or []:
            # Fresh item per tag: mutating and re-yielding a single item lets
            # later iterations clobber data the consumer has not processed yet.
            goods_tag = GoodsTagItem()
            goods_tag['id'] = calc_md5(tag)
            goods_tag['goodsId'] = dealid
            goods_tag['content'] = tag.get('content')
            goods_tag['count'] = tag.get('count')
            goods_tag['isPositive'] = tag.get('isPositive')
            goods_tag['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield goods_tag
        # Truthiness check (not just 'in'): an empty 'list' previously still
        # advanced offset and recursed, looping forever on the last page.
        if dict_data.get('list'):
            for comment in dict_data['list']:
                # Fresh item per comment (see note on goods_tag above).
                goods_comment = GoodsCommentItem()
                goods_comment['id'] = calc_md5(comment)
                goods_comment['goodsId'] = dealid
                goods_comment['content'] = comment.get('content')
                picUrls = comment.get('picUrls')
                goods_comment['picUrls'] = ', '.join(picUrls) if picUrls else None
                goods_comment['modTime'] = comment.get('modTime')
                # Guard: 'star' may be absent; dividing None by 10 would raise.
                star = comment.get('star')
                goods_comment['star'] = star / 10 if star is not None else None
                # Guard nested objects that may be missing from the payload.
                user = comment.get('user') or {}
                goods_comment['userName'] = user.get('userName')
                goods_comment['isAnonymous'] = user.get('isAnonymous')
                goods_comment['imgUrl'] = user.get('imgUrl')
                goods_comment['shopTitle'] = (comment.get('poi') or {}).get('title')
                goods_comment['recordCount'] = dict_data.get('recordCount')
                goods_comment['startIndex'] = dict_data.get('startIndex')
                goods_comment['nextStartIndex'] = dict_data.get('nextStartIndex')
                goods_comment['isEnd'] = dict_data.get('isEnd')
                goods_comment['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                yield goods_comment

            offset += 10
            next_url = self.goods_comment_baseurl.format(dealid=dealid, offset=offset)
            # Bug fix: this method is a generator, so a bare recursive call
            # only created a generator object and silently dropped every later
            # page; 'yield from' actually drains the next page's items.
            yield from self.get_goods_comment(next_url, dealid, offset, page + 1)
예제 #3
0
    def parse_shop_comment(self, response):
        """Scrapy callback: parse one page of shop tags/comments and schedule the next.

        Expects in response.meta: 'shopid', 'offset', 'page'.

        Yields:
            ShopTagItem / ShopCommentItem instances, then a scrapy.Request for
            the next page (offset advanced by 10).
        """
        pprint('parse_shop_comment response.text:{}'.format(response.text))
        self.logger.debug('parse_shop_comment response.text:{}'.format(response.text))
        shop_comment_url = response.url
        shopid = response.meta.get('shopid')
        # Default offset/page so a Request built without them does not crash
        # on 'offset += 10' / 'page + 1' below.
        offset = response.meta.get('offset', 0)
        page = response.meta.get('page', 1)
        # Bug fix: log label said 'goods_comment_url' in the shop parser.
        pprint('parse_shop_comment shop_comment_url:{}'.format(shop_comment_url))
        self.logger.debug('parse_shop_comment shop_comment_url:{}'.format(shop_comment_url))

        pprint('get_shop_comment offset:{} shopid:{}'.format(offset, shopid))
        self.logger.debug('get_shop_comment offset:{} shopid:{}'.format(offset, shopid))
        # Random pause between requests to avoid triggering anti-crawling.
        sleep(random.randint(1, 5))
        print('开始爬取shop_comment第{}页......,url为{}'.format(page, shop_comment_url))
        self.logger.debug('开始爬取shop_comment第{}页......,url为{}'.format(page, shop_comment_url))
        json_data = response.text
        print(json_data)
        self.logger.debug(json_data)
        dict_data = json.loads(json_data)
        if dict_data.get('tags'):
            for tag in dict_data['tags']:
                # Fresh item per tag: yielded Scrapy items may be processed
                # after the loop moves on, so re-mutating one shared item
                # corrupts previously yielded data.
                shop_tag = ShopTagItem()
                shop_tag['id'] = calc_md5(tag)
                shop_tag['shopId'] = shopid
                shop_tag['tag'] = tag.get('tag')
                shop_tag['count'] = tag.get('count')
                shop_tag['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                yield shop_tag
        if dict_data.get('comments'):
            for comment in dict_data['comments']:
                # Fresh item per comment (see note on shop_tag above).
                shop_comment = ShopCommentItem()
                shop_comment['id'] = calc_md5(comment)
                shop_comment['shopId'] = shopid
                shop_comment['userName'] = comment.get('userName')
                shop_comment['userUrl'] = comment.get('userUrl')
                shop_comment['avgPrice'] = comment.get('avgPrice')
                shop_comment['comment'] = comment.get('comment')
                shop_comment['merchantComment'] = comment.get('merchantComment')
                shop_comment['picUrls'] = str(comment.get('picUrls'))
                shop_comment['commentTime'] = timestamp_to_mytime(comment.get('commentTime'))
                shop_comment['replyCnt'] = comment.get('replyCnt')
                shop_comment['zanCnt'] = comment.get('zanCnt')
                shop_comment['readCnt'] = comment.get('readCnt')
                shop_comment['userLevel'] = comment.get('userLevel')
                shop_comment['userId'] = comment.get('userId')
                shop_comment['uType'] = comment.get('uType')
                shop_comment['star'] = comment.get('star')
                shop_comment['quality'] = comment.get('quality')
                shop_comment['alreadyZzz'] = comment.get('alreadyZzz')
                shop_comment['reviewId'] = comment.get('reviewId')
                shop_comment['menu'] = comment.get('menu')
                shop_comment['did'] = comment.get('did')
                shop_comment['dealEndtime'] = timestamp_to_mytime(comment.get('dealEndtime'))
                shop_comment['anonymous'] = comment.get('anonymous')
                shop_comment['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                yield shop_comment

            # Schedule the next page; recursion stops once 'comments' is empty.
            offset += 10
            next_url = self.shop_comment_baseurl.format(shopid=shopid, offset=offset)
            page = page + 1
            yield scrapy.Request(next_url, callback=self.parse_shop_comment,
                                 meta={'shopid': shopid, 'offset': offset, 'page': page})
예제 #4
0
    def parse_goods_comment(self, response):
        """Scrapy callback: parse one page of deal tags/comments and schedule the next.

        Expects in response.meta: 'dealid', 'offset', 'page'.

        Yields:
            GoodsTagItem / GoodsCommentItem instances, then a scrapy.Request
            for the next page (offset advanced by 10).
        """
        pprint('parse_goods_comment response.text:{}'.format(response.text))
        self.logger.debug('parse_goods_comment response.text:{}'.format(response.text))
        goods_comment_url = response.url
        dealid = response.meta.get('dealid')
        # Default offset/page so a Request built without them does not crash
        # on 'offset += 10' / 'page += 1' below.
        offset = response.meta.get('offset', 0)
        page = response.meta.get('page', 1)
        pprint('parse_goods_comment goods_comment_url:{}'.format(goods_comment_url))
        self.logger.debug('parse_goods_comment goods_comment_url:{}'.format(goods_comment_url))

        pprint('get_goods_comment offset:{} dealid:{}'.format(offset, dealid))
        self.logger.debug('get_goods_comment offset:{} dealid:{}'.format(offset, dealid))
        # Random pause between requests to avoid triggering anti-crawling.
        sleep(random.randint(1, 5))
        print('开始爬取goods_comment第{}页......,url为{}'.format(page, goods_comment_url))
        self.logger.debug('开始爬取goods_comment第{}页......,url为{}'.format(page, goods_comment_url))
        # response.text is already the JSON payload; no regex extraction needed.
        json_data = response.text
        pprint(json_data)
        self.logger.debug(json_data)
        dict_data = json.loads(json_data)
        if dict_data.get('tags'):
            for tag in dict_data['tags']:
                # Fresh item per tag: yielded Scrapy items may be processed
                # after the loop moves on, so re-mutating one shared item
                # corrupts previously yielded data.
                goods_tag = GoodsTagItem()
                goods_tag['id'] = calc_md5(tag)
                goods_tag['goodsId'] = dealid
                goods_tag['content'] = tag.get('content')
                goods_tag['count'] = tag.get('count')
                goods_tag['isPositive'] = tag.get('isPositive')
                goods_tag['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                yield goods_tag
        if dict_data.get('list'):
            for comment in dict_data['list']:
                # Fresh item per comment (see note on goods_tag above).
                goods_comment = GoodsCommentItem()
                goods_comment['id'] = calc_md5(comment)
                goods_comment['goodsId'] = dealid
                goods_comment['content'] = comment.get('content')
                picUrls = comment.get('picUrls')
                goods_comment['picUrls'] = ', '.join(picUrls) if picUrls else None
                # Guard: slicing/regexing a missing 'modTime' would raise.
                mod_time = comment.get('modTime')
                goods_comment['modTime'] = re.sub(r'\D', "-", mod_time[:-1]) if mod_time else None
                # Guard: 'star' may be absent; dividing None by 10 would raise.
                star = comment.get('star')
                goods_comment['star'] = star / 10 if star is not None else None
                # Guard nested objects that may be missing from the payload.
                user = comment.get('user') or {}
                goods_comment['userName'] = user.get('userName')
                goods_comment['isAnonymous'] = user.get('isAnonymous')
                goods_comment['imgUrl'] = user.get('imgUrl')
                goods_comment['shopTitle'] = (comment.get('poi') or {}).get('title')
                goods_comment['recordCount'] = dict_data.get('recordCount')
                goods_comment['startIndex'] = dict_data.get('startIndex')
                goods_comment['nextStartIndex'] = dict_data.get('nextStartIndex')
                goods_comment['isEnd'] = dict_data.get('isEnd')
                goods_comment['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                yield goods_comment

            # Schedule the next page; recursion stops once 'list' is empty.
            offset += 10
            next_url = self.goods_comment_baseurl.format(dealid=dealid, offset=offset)
            page += 1
            yield scrapy.Request(next_url, callback=self.parse_goods_comment,
                                 meta={'dealid': dealid, 'offset': offset, 'page': page})