Exemplo n.º 1
0
 def parse(self, response):
     """Yield a ``Comment`` for every sufficiently long comment on the page.

     The subject id is read from ``response.meta['douban_id']``. Comments
     whose text is missing or at most 15 characters long are skipped.
     Optional pieces that cannot be extracted (user id, comment time, star
     level, votes) are stored as ``None`` instead of aborting the whole
     page with an exception.

     Args:
         response: The response for a comment listing page; must carry
             ``douban_id`` in its ``meta``.

     Yields:
         Comment: One populated item per accepted comment.
     """
     douban_id = int(response.meta['douban_id'])
     for div in response.xpath('//div[@class="comment-item"]'):
         content = div.xpath(
             './/span[@class="short"]/text()').extract_first()
         # extract_first() gives None when the node is missing; treat that
         # like a too-short comment instead of crashing on len(None).
         if content is None or len(content) <= 15:
             continue

         douban_user_nickname = div.xpath(
             './/span[@class="comment-info"]/a/text()').extract_first()

         # The user id is taken from the avatar URL ("icon/u<id>-...").
         avatar_src = div.xpath(
             './div[@class="avatar"]/a/img/@src').extract_first()
         id_matches = re.findall(r'icon/u(.*?)-', avatar_src or '')
         douban_user_id = int(id_matches[0]) if id_matches else None

         # NOTE: the class is matched exactly, including its trailing space.
         raw_time = div.xpath(
             './/span[@class="comment-time "]/text()').extract_first()
         comment_time = raw_time.strip() if raw_time is not None else None

         span_class = div.xpath(
             './/span[@class="comment-info"]/span[2]/@class'
         ).extract_first()
         star_level = None
         if span_class and 'rating' in span_class:
             digit_runs = re.findall(r'\d+', span_class)
             if digit_runs:
                 # Only the first digit of the first number is used
                 # (presumably "allstar40" -> 4; verify against the markup).
                 star_level = int(digit_runs[0][0])

         votes_text = div.xpath(
             './/span[@class="votes"]/text()').extract_first()
         votes = int(votes_text) if votes_text is not None else None

         comment = Comment()
         comment['douban_id'] = douban_id
         comment['douban_user_nickname'] = douban_user_nickname
         comment['douban_user_id'] = douban_user_id
         comment['comment_time'] = comment_time
         comment['star_level'] = star_level
         comment['votes'] = votes
         comment['content'] = content
         comment['content_md5'] = md5(content.encode('utf-8')).hexdigest()
         yield comment
Exemplo n.º 2
0
    def parse(self, response):
        """Yield one ``Comment`` per comment text node under ``#comments``.

        Every yielded item carries the text itself and a ``book_id`` cut out
        of the response URL.
        """
        texts = response.xpath(
            'id("comments")/ul//li//p/span/text()').getall()
        # Characters 32..38 of the URL -- presumably the numeric book id;
        # verify against the URLs this spider actually requests.
        book_id = response.url[32:39]

        for text in texts:
            item = Comment()
            item['book_id'] = book_id
            item['comment'] = text
            yield item
Exemplo n.º 3
0
 def parse(self, response):
     """Turn the JSON ``interests`` list of a response into ``Comment`` items.

     The subject id is the second-to-last ``/``-separated segment of the
     response URL; every entry of ``interests`` becomes one yielded item.
     """
     subject_id = response.url.split("/")[-2]
     payload = json.loads(response.body)
     for entry in payload["interests"]:
         result = Comment()
         result["douban_id"] = subject_id
         result["douban_comment_id"] = entry["id"]
         # The author details live in the nested "user" object.
         author = entry["user"]
         result["douban_user_nickname"] = author["name"]
         result["douban_user_avatar"] = author["avatar"]
         result["douban_user_url"] = author["url"]
         result["content"] = entry["comment"]
         result["votes"] = entry["vote_count"]
         yield result
Exemplo n.º 4
0
 def parse(self, response):
     """Turn the JSON ``interests`` list of a response into ``Comment`` items.

     A response with status 302 is only reported (its URL is printed) and
     not parsed. Otherwise the subject id is the second-to-last
     ``/``-separated segment of the response URL and every entry of
     ``interests`` becomes one yielded item.

     Args:
         response: The response whose body is a JSON document with an
             ``interests`` list.

     Yields:
         Comment: One populated item per entry of ``interests``.
     """
     if 302 == response.status:
         print(response.url)
         return

     douban_id = response.url.split('/')[-2]
     items = json.loads(response.body)['interests']
     for item in items:
         comment = Comment()
         comment['douban_id'] = douban_id
         comment['douban_comment_id'] = item['id']
         comment['douban_user_nickname'] = item['user']['name']
         comment['douban_user_avatar'] = item['user']['avatar']
         comment['douban_user_url'] = item['user']['url']
         comment['content'] = item['comment']
         comment['votes'] = item['vote_count']
         yield comment
Exemplo n.º 5
0
    def parse(self, response):
        """Extract the comments of one page and follow the "next" link.

        A 404 response is only reported. Otherwise the subject id is taken
        from the URL segment following "subject", one ``Comment`` is yielded
        per comment item, and a request for the next page (handled by this
        same callback) is yielded when a "next" link is present.
        """
        page_url = response.url
        if response.status == 404:
            print("404 Not Found, url:", page_url)
            return

        douban_id = page_url.split("subject")[1].split("/")[1]

        nodes = response.xpath(
            '//div[@class="mod-bd"]/div[@class="comment-item"]')
        for node in nodes:
            comment = Comment()
            print("executing respective comment...", node)

            # Gather the raw values first, then fill the item in one go.
            author_name = node.xpath(
                './/div[@class="comment"]//span[@class="comment-info"]/a/text()'
            ).get()
            stars_title = node.xpath(
                './/div[@class="comment"]//span[@class="comment-info"]/span[contains(@class, "allstar")]/@title'
            ).get()
            body_text = node.xpath(
                './/div[@class="comment"]/p/span[@class="short"]/text()'
            ).get()
            posted_at = node.xpath(
                './/div[@class="comment"]//span[@class="comment-info"]/span[contains(@class,"comment-time")]/@title'
            ).get()
            cid = node.xpath('.//@data-cid').get()

            comment['douban_id'] = douban_id
            comment['douban_username'] = author_name
            comment['rating'] = stars_title
            comment['content'] = body_text
            comment['comment_time'] = posted_at
            comment['douban_comment_id'] = cid
            yield comment

        next_href = response.xpath('//a[@class="next"]/@href').get()
        if not next_href:
            return
        # The href is appended verbatim to the subject's comments URL.
        following_page = "https://movie.douban.com/subject/%s/comments%s" % (
            douban_id, next_href)
        yield scrapy.Request(following_page, callback=self.parse)
Exemplo n.º 6
0
    def parse(self, response):
        """Yield a ``Comment`` per comment item, then request the next page.

        The subject id is derived from ``response.meta['main_url']`` (the URL
        segment following "subject"). A 404 response is only reported.
        Fields that cannot be found in a comment item are stored as ``""``.

        Fixes relative to the earlier version:
          * pages holding exactly one comment are no longer skipped;
          * ``dont_redirect`` / ``handle_httpstatus_list`` are passed through
            ``Request.meta`` (where Scrapy reads them) instead of being sent
            as cookies, so only ``bid`` remains a cookie.

        Args:
            response: The response for a comment page; must carry
                ``main_url`` in its ``meta``.

        Yields:
            Comment: One item per comment found on the page.
            Request: A request for the next page, when a "next" link exists.
        """
        main_url = response.meta['main_url']

        if 404 == response.status:
            print("movie.meta.response.url: ", response.url)
            return

        def first_or_blank(values):
            """Return the first XPath match, or "" when there is none."""
            return values[0] if values else ""

        douban_id = main_url.split("subject")[1].split("/")[1]

        # Link to the next page (empty list when there is none).
        next_url = response.xpath('//a[@class="next"]/@href').extract()

        # Serialise every comment item so each can be re-parsed on its own.
        comment_item_list = response.xpath(
            '//div[@class="comment-item"]').extract()

        for resp_item in comment_item_list:
            print("resp_item======:", resp_item)
            # Each item is parsed as a standalone document, so the absolute
            # "//" queries below only ever see this one comment.
            resp_item = etree.HTML(resp_item)

            # User profile URL
            url_list = resp_item.xpath('//div[@class="avatar"]/a/@href')
            print("\n+++++++++++++++++++++++++url_list", url_list)

            # User name
            username_list = resp_item.xpath('//div[@class="avatar"]/a/@title')

            # Avatar image path
            avator_list = resp_item.xpath('//div[@class="avatar"]/a/img/@src')

            # Vote count
            vote_list = resp_item.xpath(
                '//div[@class="comment"]/h3/span/span[@class="votes"]/text()')
            print("\n+++++++++++++++++++++++++vote_list:", vote_list)

            # Rating (the raw class attribute of the "allstar" span)
            rating_list = resp_item.xpath(
                '//div[@class="comment"]/h3/span[@class="comment-info"]/span[contains(@class,"allstar")]/@class')
            print("\n+++++++++++++++++++++++++rating", rating_list)

            # Comment time
            comment_time_list = resp_item.xpath(
                '//div[@class="comment"]/h3/span[@class="comment-info"]/span[contains(@class,"comment-time")]/@title')
            print("\n+++++++++++++++++++++++++comment_time",
                  comment_time_list)

            # Comment text
            comment_list = resp_item.xpath(
                '//div[@class="comment"]/p/span[@class="short"]/text()')

            # Comment id
            comment_id_list = resp_item.xpath(
                '//div[@class="comment"]/h3/span/input/@value')

            comment = Comment()
            comment['douban_id'] = douban_id
            comment['douban_comment_id'] = first_or_blank(comment_id_list)
            comment['douban_user_nickname'] = first_or_blank(username_list)
            comment['douban_user_avatar'] = first_or_blank(avator_list)
            comment['douban_user_url'] = first_or_blank(url_list)
            comment['content'] = first_or_blank(comment_list)
            comment['votes'] = first_or_blank(vote_list)
            comment['rating'] = first_or_blank(rating_list)
            comment['comment_time'] = first_or_blank(comment_time_list)
            yield comment

        if next_url:
            url = "https://movie.douban.com/subject/%s/comments%s" % (
                douban_id, next_url[0])
            print("=====request Next url================:", url)
            # Fresh random 11-character "bid" cookie for every request.
            bid = ''.join(
                random.choice(string.ascii_letters + string.digits)
                for _ in range(11))
            meta = {
                'main_url': url,
                # Request.meta keys: hand 302 responses to the callback
                # instead of following the redirect.
                'dont_redirect': True,
                'handle_httpstatus_list': [302],
            }
            yield Request(url, cookies={'bid': bid}, meta=meta)