def parse(self, response):
    """Parse a Douban movie short-comment page into Comment items.

    Expects ``douban_id`` in ``response.meta``. Comments whose text is
    15 characters or shorter are skipped.

    Fixes over the original: every ``extract_first()`` result is guarded
    against ``None`` (folded/empty comment nodes previously raised
    TypeError/AttributeError), the user-id regex match is checked before
    indexing, and regex patterns are raw strings.
    """
    douban_id = int(response.meta['douban_id'])
    for div in response.xpath('//div[@class="comment-item"]'):
        content = div.xpath('.//span[@class="short"]/text()').extract_first()
        # Guard None: some comment items have no "short" span at all.
        if not content or len(content) <= 15:
            continue
        douban_user_nickname = div.xpath(
            './/span[@class="comment-info"]/a/text()').extract_first()
        avatar_src = div.xpath(
            './div[@class="avatar"]/a/img/@src').extract_first() or ''
        # Avatar URLs embed the numeric user id: .../icon/u<digits>-...
        user_id_match = re.findall(r'icon/u(.*?)-', avatar_src)
        if not user_id_match:
            continue  # cannot attribute this comment to a user; skip it
        douban_user_id = int(user_id_match[0])
        comment_time = (div.xpath(
            './/span[@class="comment-time "]/text()').extract_first() or '').strip()
        span_class = div.xpath(
            './/span[@class="comment-info"]/span[2]/@class').extract_first()
        if span_class and 'rating' in span_class:
            # Class looks like "allstar40 rating" — first digit is the star count.
            star_level = int(re.findall(r'\d+', span_class)[0][0])
        else:
            star_level = None
        votes = int(
            div.xpath('.//span[@class="votes"]/text()').extract_first() or 0)
        content_md5 = md5(content.encode('utf-8')).hexdigest()
        comment = Comment()
        comment['douban_id'] = douban_id
        comment['douban_user_nickname'] = douban_user_nickname
        comment['douban_user_id'] = douban_user_id
        comment['comment_time'] = comment_time
        comment['star_level'] = star_level
        comment['votes'] = votes
        comment['content'] = content
        comment['content_md5'] = content_md5
        yield comment
def parse(self, response):
    """Yield one Comment per short-review text node on a book's comment page."""
    # The subject id sits at a fixed offset in the comments-page URL.
    book_id = response.url[32:39]
    for text in response.xpath('id("comments")/ul//li//p/span/text()').getall():
        item = Comment()
        item['book_id'] = book_id
        item['comment'] = text
        yield item
def parse(self, response):
    """Parse the JSON "interests" payload of a comments API response into Comments."""
    subject_id = response.url.split("/")[-2]
    payload = json.loads(response.body)
    for interest in payload["interests"]:
        user = interest["user"]
        record = Comment()
        record["douban_id"] = subject_id
        record["douban_comment_id"] = interest["id"]
        record["douban_user_nickname"] = user["name"]
        record["douban_user_avatar"] = user["avatar"]
        record["douban_user_url"] = user["url"]
        record["content"] = interest["comment"]
        record["votes"] = interest["vote_count"]
        yield record
def parse(self, response):
    """Parse the JSON "interests" payload into Comment items, skipping 302s.

    BUG FIX: the original signature was ``def parse(, response):`` — the
    ``self`` parameter was missing, which is a SyntaxError and prevented
    the module from importing at all.
    """
    if 302 == response.status:
        # Redirected (likely rate-limited / login wall): just log the URL.
        print(response.url)
        return
    douban_id = response.url.split('/')[-2]
    items = json.loads(response.body)['interests']
    for item in items:
        comment = Comment()
        comment['douban_id'] = douban_id
        comment['douban_comment_id'] = item['id']
        comment['douban_user_nickname'] = item['user']['name']
        comment['douban_user_avatar'] = item['user']['avatar']
        comment['douban_user_url'] = item['user']['url']
        comment['content'] = item['comment']
        comment['votes'] = item['vote_count']
        yield comment
def parse(self, response):
    """Extract every comment on this movie-comments page, then follow the
    "next" pagination link with the same callback."""
    response_url = response.url
    if 404 == response.status:
        print("404 Not Found, url:", response_url)
    else:
        # Subject id is the path segment right after "subject".
        douban_id = response_url.split("subject")[1].split("/")[1]
        item_regx = '//div[@class="mod-bd"]/div[@class="comment-item"]'
        for node in response.xpath(item_regx):
            record = Comment()
            print("executing respective comment...", node)
            record['douban_id'] = douban_id
            record['douban_username'] = node.xpath(
                './/div[@class="comment"]//span[@class="comment-info"]/a/text()'
            ).get()
            record['rating'] = node.xpath(
                './/div[@class="comment"]//span[@class="comment-info"]/span[contains(@class, "allstar")]/@title'
            ).get()
            record['content'] = node.xpath(
                './/div[@class="comment"]/p/span[@class="short"]/text()'
            ).get()
            record['comment_time'] = node.xpath(
                './/div[@class="comment"]//span[@class="comment-info"]/span[contains(@class,"comment-time")]/@title'
            ).get()
            record['douban_comment_id'] = node.xpath('.//@data-cid').get()
            yield record
        next_url = response.xpath('//a[@class="next"]/@href').get()
        if next_url:
            page_url = "https://movie.douban.com/subject/%s/comments%s" % (
                douban_id, next_url)
            yield scrapy.Request(page_url, callback=self.parse)
def parse(self, response):
    """Parse one comments page (URL carried in ``meta['main_url']``) and
    queue the next page.

    Each comment-item HTML fragment is re-parsed with ``lxml.etree`` and
    emitted as a Comment; any field missing from the fragment defaults
    to "".

    BUG FIXES over the original:
    - ``dont_redirect`` and ``handle_httpstatus_list`` are Scrapy
      ``Request.meta`` flags, but were passed inside the ``cookies``
      dict, so the intended 302 handling never took effect.
    - ``if len(comment_item_list) > 1`` silently dropped pages that
      contained exactly one comment.
    Noisy per-item debug prints were also removed.
    """
    main_url = response.meta['main_url']
    if 404 == response.status:
        print("movie.meta.response.url: ", response.url)
        return
    # Subject id is the path segment right after "subject".
    douban_id = main_url.split("subject")[1].split("/")[1]
    # Pagination link, e.g. "?start=20&limit=20&...".
    next_url = response.xpath('//a[@class="next"]/@href').extract()
    comment_item_list = response.xpath('//div[@class="comment-item"]').extract()
    for raw_item in comment_item_list:
        doc = etree.HTML(raw_item)
        url_list = doc.xpath('//div[@class="avatar"]/a/@href')
        username_list = doc.xpath('//div[@class="avatar"]/a/@title')
        avator_list = doc.xpath('//div[@class="avatar"]/a/img/@src')
        vote_list = doc.xpath(
            '//div[@class="comment"]/h3/span/span[@class="votes"]/text()')
        rating_list = doc.xpath(
            '//div[@class="comment"]/h3/span[@class="comment-info"]/span[contains(@class,"allstar")]/@class')
        comment_time_list = doc.xpath(
            '//div[@class="comment"]/h3/span[@class="comment-info"]/span[contains(@class,"comment-time")]/@title')
        comment_list = doc.xpath(
            '//div[@class="comment"]/p/span[@class="short"]/text()')
        comment_id_list = doc.xpath(
            '//div[@class="comment"]/h3/span/input/@value')

        comment = Comment()
        comment['douban_id'] = douban_id
        comment['douban_comment_id'] = comment_id_list[0] if comment_id_list else ""
        comment['douban_user_nickname'] = username_list[0] if username_list else ""
        comment['douban_user_avatar'] = avator_list[0] if avator_list else ""
        comment['douban_user_url'] = url_list[0] if url_list else ""
        comment['content'] = comment_list[0] if comment_list else ""
        comment['votes'] = vote_list[0] if vote_list else ""
        comment['rating'] = rating_list[0] if rating_list else ""
        comment['comment_time'] = comment_time_list[0] if comment_time_list else ""
        yield comment
    if next_url:
        url = "https://movie.douban.com/subject/%s/comments%s" % (
            douban_id, next_url[0])
        # Fresh random "bid" cookie per request to look like a new browser session.
        bid = ''.join(
            random.choice(string.ascii_letters + string.digits)
            for _ in range(11))
        yield Request(
            url,
            cookies={'bid': bid},
            meta={
                'main_url': url,
                # Redirect-control flags belong in meta, not cookies.
                'dont_redirect': True,
                'handle_httpstatus_list': [302],
            })