def parse_reviews(self, response):
    """Parse a review-list page.

    Yields a Request to ``parse_replies`` for the review's detail page, and a
    Request back into this callback for the next list page when one exists.
    """
    _setDNSCache()
    for item in response.xpath('//div[@class="main review-item"]'):
        url = item.xpath('div/h2/a/@href').get()
        yield Request(url=url, callback=self.parse_replies)
        # NOTE(review): only the FIRST review of each page is followed — the
        # original breaks after one item; confirm this is intentional.
        break
    # BUG FIX: the original passed a possibly-None href straight into
    # urljoin(), which falls back to response.url and re-requests the very
    # same page. Only paginate when a "next" link actually exists.
    next_href = response.xpath('//span[@class="next"]/link/@href').get()
    if next_href:
        yield Request(url=urljoin(response.url, next_href),
                      callback=self.parse_reviews)
def parse_comments(self, response):
    """Parse a short-comment ("短评") page for one movie.

    Yields one ``DoubanCommentItem`` per comment, then a Request for the next
    comment page back into this callback.
    """
    _setDNSCache()
    for item in response.xpath('//div[@class="comment-item"]'):
        # BUG FIX: the original created ONE item before the loop and mutated
        # it for every yield; pipelines that keep references then see every
        # item overwritten with the last comment's values. Build a fresh item
        # per comment instead.
        comment = DoubanCommentItem()
        # Movie id the comment belongs to (URL shape .../subject/<id>/comments).
        comment['movie_id'] = response.url.split('/')[4]
        # Unique comment id. .get('') instead of .extract()[0] so a missing
        # node yields an empty field rather than raising IndexError.
        comment['comment_id'] = item.xpath(
            'div[@class="comment"]/h3/span[@class="comment-vote"]/input/@value'
        ).get('').strip()
        # How many people found the comment useful.
        comment['votes'] = item.xpath(
            'div[@class="comment"]/h3/span[@class="comment-vote"]/span/text()'
        ).get('').strip()
        # Rating is encoded in the span's CSS class; when the comment carries
        # no rating, span[2] is the timestamp span instead.
        comment['rating'] = item.xpath(
            'div[@class="comment"]/h3/span[@class="comment-info"]/span[2]/@class'
        ).get('').strip()
        if comment['rating'] == 'comment-time':
            comment['rating'] = 'none'
        # Comment timestamp — kept as a list, matching the original .extract().
        comment['comment_time'] = item.xpath(
            'div[@class="comment"]/h3/span[@class="comment-info"]/span[@class="comment-time "]/@title'
        ).getall()
        # Comment body text.
        comment['content'] = item.xpath(
            'div[@class="comment"]/p/span[@class="short"]/text()'
        ).get('').strip()
        # Commenter display name (unique on Douban).
        comment['user_name'] = item.xpath(
            'div[@class="avatar"]/a/@title').get('')
        # Commenter id taken from the profile URL path.
        user_href = item.xpath('div[@class="avatar"]/a/@href').get()
        comment['user_id'] = user_href.split('/')[4] if user_href else ''
        yield comment
    # Paginate only when a next link exists — replaces the original bare
    # ``except:`` which swallowed every error just to detect the last page.
    next_href = response.xpath('//a[@class="next"]/@href').get()
    if next_href:
        yield Request(url=response.urljoin(next_href),
                      callback=self.parse_comments)
def parse_review_details(self, response):
    """Parse a single review detail page into a ``DoubanReviewItem``."""
    _setDNSCache()
    review = DoubanReviewItem()
    article = response.xpath('//div[@class="article"]')
    review['review_title'] = article.xpath('h1/span/text()').get()
    # Review id from the URL path .../review/<id>/.
    review['review_id'] = response.url.split('/')[4]
    # NOTE(review): a leading '//' searches the whole document, not just
    # `article` — kept as the original behaved; the page has one main block.
    main = article.xpath('//div[@class="main"]')
    review['user_name'] = main.xpath(
        'header[@class="main-hd"]/a[1]/span/text()').get()
    # Guard the .get() → .split() chains: a missing node returned None and
    # crashed with AttributeError in the original.
    user_href = main.xpath('header[@class="main-hd"]/a[1]/@href').get()
    review['user_id'] = user_href.split('/')[4] if user_href else ''
    movie_href = main.xpath('header[@class="main-hd"]/a[2]/@href').get()
    review['movie_id'] = movie_href.split('/')[4] if movie_href else ''
    # BUG FIX (original TODO): a review may carry no rating at all; fall back
    # to 'none' (consistent with parse_comments) instead of crashing.
    rating_class = main.xpath('header[@class="main-hd"]/span[1]/@class').get()
    review['rating'] = rating_class.split(' ')[0] if rating_class else 'none'
    review['comment_time'] = main.xpath(
        'header[@class="main-hd"]/span[@class="main-meta"]/text()').get()
    # Collect the review body: plain text nodes plus any non-<br> markup.
    review['content'] = []
    for c in main.xpath('//div[@id="link-report"]/div/node()'):
        if c.xpath('text()').get():
            review['content'].append(c.xpath('text()').get().strip())
        elif c.get().strip() != '<br>' and c.get().strip() != '':
            review['content'].append(c.get().strip())
    # "useful" / "useless" vote counts, e.g. button text "有用 123".
    votes_text = main.xpath(
        '//div[@class="main-panel-useful"]/button[1]/text()').get()
    review['votes'] = votes_text.strip().split(' ')[1] if votes_text else ''
    useless_text = main.xpath(
        '//div[@class="main-panel-useful"]/button[2]/text()').get()
    review['useless_votes'] = (
        useless_text.strip().split(' ')[1] if useless_text else '')
    review['forwards'] = main.xpath('//span[@class="rec-num"]/text()').get()
    yield review
def parse_movies(self, response):
    """Parse a movie subject page into a ``DoubanMovieItem``."""
    _setDNSCache()
    movie = DoubanMovieItem()
    # Movie id from the URL path .../subject/<id>/.
    movie['movie_id'] = response.url.split('/')[4]
    # BUG FIX: //title may be absent, and .get() then returns None — the
    # original crashed on .rstrip(). The page title looks like
    # "<name> (豆瓣)"; trim whitespace then drop the 4-char "(豆瓣)" suffix
    # and the space before it.
    title = response.xpath('//title/text()').get() or ''
    movie['movie_name'] = title.rstrip().strip('\n').strip()[:-4].rstrip()
    movie_info = response.xpath('//div[@id="info"]')
    movie['director'] = movie_info.xpath(
        'span[1]/span[@class="attrs"]/a/text()').getall()
    movie['author'] = movie_info.xpath(
        'span[2]/span[@class="attrs"]/a/text()').getall()
    movie['actors'] = movie_info.xpath(
        'span[@class="actor"]/span[@class="attrs"]/a/text()').getall()
    movie['movie_type'] = movie_info.xpath(
        'span[@property="v:genre"]/text()').getall()
    movie['official_website'] = movie_info.xpath(
        'span[@class="pl" and text()="官方网站:"]/following-sibling::a/text()').get()
    movie['region_made'] = movie_info.xpath(
        'span[@class="pl" and text()="制片国家/地区:"]/following-sibling::text()').get()
    movie['language'] = movie_info.xpath(
        'span[@class="pl" and text()="语言:"]/following-sibling::text()').get()
    movie['date_published'] = movie_info.xpath(
        'span[@property="v:initialReleaseDate"]/text()').get()
    movie['movie_length'] = movie_info.xpath(
        'span[@property="v:runtime"]/text()').get()
    # NOTE(review): alias is located positionally (6th "pl" label) — fragile
    # if Douban reorders the info block; confirm against live pages.
    movie['alias'] = movie_info.xpath(
        'span[@class="pl"][6]/following-sibling::text()').get()
    movie['votes'] = response.xpath('//span[@property="v:votes"]/text()').get()
    movie['average_rating'] = response.xpath(
        '//div[@class="rating_self clearfix"]/strong/text()').get()
    # Star-rating distribution percentages, 5 stars down to 1 star.
    movie['stars5_ratings'] = response.xpath(
        '//div[@class="ratings-on-weight"]/div[@class="item"][1]/span[@class="rating_per"]/text()').get()
    movie['stars4_ratings'] = response.xpath(
        '//div[@class="ratings-on-weight"]/div[@class="item"][2]/span[@class="rating_per"]/text()').get()
    movie['stars3_ratings'] = response.xpath(
        '//div[@class="ratings-on-weight"]/div[@class="item"][3]/span[@class="rating_per"]/text()').get()
    movie['stars2_ratings'] = response.xpath(
        '//div[@class="ratings-on-weight"]/div[@class="item"][4]/span[@class="rating_per"]/text()').get()
    movie['stars1_ratings'] = response.xpath(
        '//div[@class="ratings-on-weight"]/div[@class="item"][5]/span[@class="rating_per"]/text()').get()
    movie['description'] = response.xpath(
        '//span[@property="v:summary"]/text()').getall()
    movie['recommendations'] = response.xpath(
        '//div[@class="recommendations-bd"]/dl/dd/a/text()').getall()
    movie['labels'] = response.xpath(
        '//div[@class="tags-body"]/a/text()').getall()
    # "N人看过" / "N人想看" counters.
    movie['collections'] = response.xpath(
        '//div[@class="subject-others-interests-ft"]/a[1]/text()').get()
    movie['wishes'] = response.xpath(
        '//div[@class="subject-others-interests-ft"]/a[2]/text()').get()
    yield movie
def parse_replies(self, response):
    """Render a review page with Selenium and crawl its reply threads.

    Yields a ``DoubanReplyItem`` for each top-level reply and each nested
    reply (whose ``reply_to`` points at the parent reply's id), then follows
    pagination back into this callback.
    """
    _setDNSCache()
    self.driver.get(response.url)
    self.driver.implicitly_wait(5)
    try:
        comments = self.driver.find_elements_by_xpath(
            '//div[@class="item comment-item"]')
    except Exception:
        comments = []
    for item in comments:
        # BUG FIX: the original mutated ONE shared item for every yield,
        # corrupting earlier yields for reference-holding pipelines.
        reply = DoubanReplyItem()
        # Movie id the reply belongs to (from the review page URL).
        reply['movie_id'] = response.url.split('/')[4]
        # Unique reply id.
        reply['reply_id'] = item.get_attribute('data-cid')
        reply['reply_time'] = item.find_element_by_xpath(
            'div[@class="comment-item-body"]/div[@class="comment-main"]/div[@class="meta-header"]/time').text
        reply['content'] = []
        for c in item.find_elements_by_xpath(
                'div[@class="comment-item-body"]/div[@class="comment-main"]/div[@class="comment-content"]/span'):
            # BUG FIX: original wrote ``c.text.strip != ''`` — comparing the
            # *method object* (missing parentheses), which is always True,
            # so empty spans were never filtered out.
            text = c.text.strip()
            if text != '<br>' and text != '':
                reply['content'].append(text)
        reply['user_name'] = item.find_element_by_xpath(
            'div[@class="comment-item-body"]/div[@class="comment-main"]/div[@class="meta-header"]/a').text
        reply['user_id'] = item.find_element_by_xpath(
            'div[@class="comment-item-body"]/div[@class="comment-main"]/div[@class="meta-header"]/a[1]'
        ).get_attribute("href").split('/')[4]
        # Top-level replies answer the review itself, so no parent id.
        reply['reply_to'] = ''
        yield reply
        try:
            reply_list = item.find_element_by_xpath('div[@class="reply-list"]')
            parent_id = reply['reply_id']
            # Expand the "show more replies" control when present.
            try:
                get_more = reply_list.find_element_by_xpath(
                    'div[@class="replies-list-control"]/button')
                get_more.click()
                self.driver.implicitly_wait(5)
            except Exception:
                pass
            # Distinct loop variable: the original shadowed the outer `item`.
            for sub in reply_list.find_elements_by_xpath(
                    'div[@class="item reply-item"]'):
                sub_reply = DoubanReplyItem()
                sub_reply['movie_id'] = sub.get_attribute('data-target_id')
                sub_reply['reply_id'] = sub.get_attribute('data-cid')
                sub_reply['reply_time'] = sub.find_element_by_xpath(
                    'div[@class="comment-item-body"]/div[@class="comment-main"]/div[@class="meta-header"]/time').text
                sub_reply['content'] = []
                for c in sub.find_elements_by_xpath(
                        'div[@class="comment-item-body"]/div[@class="comment-main"]/div[@class="comment-content"]/span'):
                    # Same missing-parentheses fix as the outer loop.
                    text = c.text.strip()
                    if text != '<br>' and text != '':
                        sub_reply['content'].append(text)
                sub_reply['user_name'] = sub.find_element_by_xpath(
                    'div[@class="comment-item-body"]/div[@class="comment-main"]/div[@class="meta-header"]/a').text
                sub_reply['user_id'] = sub.find_element_by_xpath(
                    'div[@class="comment-item-body"]/div[@class="comment-main"]/div[@class="meta-header"]/a[1]'
                ).get_attribute("href").split('/')[4]
                sub_reply['reply_to'] = parent_id
                yield sub_reply
        except Exception:
            # No nested reply list for this comment — best-effort, move on.
            pass
        # NOTE(review): only the first top-level comment per page is processed
        # — kept from the original; confirm this break is intentional.
        break
    # Replaces the original bare ``except:`` used to detect the last page.
    try:
        next_page = response.urljoin(
            self.driver.find_element_by_xpath(
                '//a[@class="next"]').get_attribute("href"))
    except Exception:
        next_page = ""
    if next_page:
        yield Request(url=next_page, callback=self.parse_replies)