def get_movie_page_main(self, response):
    """Yield the movie item for a detail page, then one review-list request per rating.

    ``title``, ``rate``, ``casts``, ``directors`` and ``id`` are read from
    ``response.meta``. The runtime text is taken from the
    ``v:runtime`` span inside the first ``div#info`` element; when either
    the block or the span is missing the ``length`` field is set to
    ``None`` instead of raising ``IndexError``.

    Yields:
        A ``DoubanMovieItem``, followed by five ``Request`` objects (ratings
        1 through 5) whose callback is ``self.get_one_review``.

    Raises:
        KeyError: if one of the expected keys is absent from ``response.meta``.
    """
    meta = response.meta
    # ``movie_id`` instead of ``id`` so the builtin is not shadowed.
    movie_id = meta['id']

    info_blocks = response.xpath("//div[@id='info']")
    runtime = None
    if info_blocks:
        # Relative query on the first info block; no need to re-parse its HTML.
        runtime = info_blocks[0].xpath(
            ".//span[@property='v:runtime']/text()").extract_first()

    item = DoubanMovieItem()
    item['title'] = meta['title']
    item['rate'] = meta['rate']
    item['casts'] = meta['casts']
    item['directors'] = meta['directors']
    item['length'] = runtime
    item['movie_id'] = movie_id
    yield item

    print("movie id: ", movie_id)
    for rat in range(1, 6):
        url_review_list = self.review_list_1 + movie_id + self.review_list_3 + str(rat)
        yield Request(url_review_list,
                      callback=self.get_one_review,
                      meta={'rate': rat, "movie_id": movie_id})
def parse(self, response):
    """Scrape one page of the top-250 list, download each poster, and follow pagination.

    For every ``<li>`` under ``ol.grid_view`` a fresh ``DoubanMovieItem`` is
    filled with ranking, name, score, vote count and detail URL, and the
    poster image is saved as ``pics/<movie_name>.jpg``.

    Yields:
        One ``DoubanMovieItem`` per movie, then a ``Request`` for the next
        page when a "next" link is present.

    Raises:
        IndexError: if an expected element is missing from a list entry.
    """
    imgs_path = 'pics'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(imgs_path, exist_ok=True)

    for movie in response.xpath('//ol[@class="grid_view"]/li'):
        # A new item per movie: reusing one instance would make every
        # yielded item alias the same, later-mutated object.
        item = DoubanMovieItem()
        item['ranking'] = movie.xpath(
            './/div[@class="pic"]/em/text()').extract()[0]
        movie_name = movie.xpath(
            './/div[@class="hd"]/a/span[1]/text()').extract()[0]
        item['movie_name'] = movie_name
        item['score'] = movie.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        item['score_num'] = movie.xpath(
            './/div[@class="star"]/span/text()').re(r'(\d+)人评价')[0]
        item['movie_url'] = movie.xpath(
            './/div[@class="pic"]/a/@href').extract()[0]
        movie_image_url = movie.xpath(
            './/div[@class="pic"]/a/img/@src').extract()[0]

        # Path separators inside a title would otherwise point into a
        # non-existent sub-directory and make the download fail.
        safe_name = movie_name.replace('/', '_').replace('\\', '_')
        # os.path.join keeps the target inside imgs_path on every platform
        # (a literal backslash is only a separator on Windows).
        image_file = os.path.join(imgs_path, '%s.jpg' % safe_name)
        urllib.request.urlretrieve(movie_image_url, image_file)
        yield item

    next_url = response.xpath('//span[@class="next"]/a/@href').extract()
    if next_url:
        next_url = 'https://movie.douban.com/top250' + next_url[0]
        yield Request(next_url)
def parse_film(self, response):
    """Extract the fields of a single film detail page into a ``DoubanMovieItem``.

    Multi-valued fields (directors, adaptors, starring, genre, release
    date) are joined with ``"/"``. ``country`` is only set when the
    country/language section can be located in the ``div#info`` markup.

    Returns:
        The populated ``DoubanMovieItem``.

    Raises:
        CloseSpider: when the response status is 403.
    """
    if response.status == 403:
        raise CloseSpider("403 forbidden!")

    def joined(query):
        # All text matches of ``query`` joined with "/".
        return "/".join(response.xpath(query).extract())

    item = DoubanMovieItem()
    item['title'] = response.xpath(
        "//span[@property='v:itemreviewed']/text()").extract_first()
    item['directors'] = joined("//a[@rel='v:directedBy']/text()")
    item['adaptors'] = joined("//span[@class='attrs']/a[not(@rel)]/text()")
    item['starring'] = joined("//a[@rel='v:starring']/text()")
    item['genre'] = joined("//span[@property='v:genre']/text()")

    info = response.xpath("//div[@id='info']").extract_first()
    # extract_first() returns None when the info block is absent; passing
    # None to re.search would raise TypeError, so only search real markup.
    if info:
        s = re.search(
            r'制片国家/地区:</span>(.*)<br>.*<span class="pl">语言:</span>',
            info, re.M | re.S)
        if s:
            item['country'] = s.group(1)

    item['release_date'] = joined(
        "//span[@property='v:initialReleaseDate']/text()")
    item['runtime'] = response.xpath(
        "//span[@property='v:runtime']/text()").extract_first()
    item['rate'] = response.xpath(
        "//strong[@property='v:average']/text()").extract_first()
    return item
def parse(self, response):
    """Parse one list page with BeautifulSoup and follow the "next" link.

    Each ``div.item`` yields a fresh ``DoubanMovieItem`` holding the name,
    detail URL and score. The number of movies found is printed once the
    page has been processed.

    Yields:
        One ``DoubanMovieItem`` per movie, then a ``scrapy.Request`` for the
        next page when the "next" span contains a link.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    movies = soup.find_all('div', {'class': 'item'})

    for movie in movies:
        # A new item per movie so yielded items do not alias one object.
        item = DoubanMovieItem()
        # Attribute filters must be dicts; the former set literal
        # {'class', 'title'} only matched by accident.
        item['name'] = movie.find('span', {'class': 'title'}).get_text()
        item['url'] = movie.find('a')['href']
        item['score'] = movie.find('span', {'class': 'rating_num'}).get_text()
        yield item
    print(len(movies))

    # On the last page the "next" span carries no <a>; look the link up
    # step by step instead of subscripting a possible None.
    next_span = soup.find('span', {'class': 'next'})
    next_link = next_span.find('a') if next_span is not None else None
    next_url = next_link.get('href') if next_link is not None else None
    if next_url:
        print(type(next_url), next_url)
        url = 'https://movie.douban.com/top250' + next_url
        yield scrapy.Request(url, headers=self.headers)
def parse(self, response):
    """Scrape ranking, name, score, vote count and URL for each movie on the page.

    Every populated item is logged at WARNING level before being yielded.
    Pagination is not followed by this callback.

    Yields:
        One ``DoubanMovieItem`` per ``<li>`` under ``ol.grid_view``.

    Raises:
        IndexError: if an expected element is missing from a list entry.
    """
    for movie in response.xpath('//ol[@class="grid_view"]/li'):
        # A new item per movie: a single shared instance would be mutated
        # again after it had already been yielded.
        item = DoubanMovieItem()
        item['ranking'] = movie.xpath(
            './/div[@class="pic"]/em/text()').extract()[0]
        item['movie_name'] = movie.xpath(
            './/div[@class="hd"]/a/span[1]/text()').extract()[0]
        item['score'] = movie.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        item['score_num'] = movie.xpath(
            './/div[@class="star"]/span/text()').re(r'(\d+)人评价')[0]
        item['movie_url'] = movie.xpath(
            './/div[@class="hd"]/a/@href').extract()[0]
        logger.warning(item)  # log the scraped item
        yield item

    # Pagination is currently disabled; to re-enable it:
    # next_url = response.xpath('//span[@class="next"]/a/@href').extract()
    # if next_url:
    #     next_url = 'https://movie.douban.com/top250' + next_url[0]
    #     yield Request(next_url, headers=self.headers)
def parse_movies(self, response):
    """Populate a ``DoubanMovieItem`` from a movie detail page and yield it.

    The movie id is the fifth ``/``-separated component of the response
    URL. Most descriptive fields are read relative to ``div#info``; rating
    statistics, summary, recommendations, tags and interest counters are
    read from the whole document.
    """
    _setDNSCache()

    movie = DoubanMovieItem()
    movie['movie_id'] = response.url.split('/')[4]

    # The last four characters of the trimmed <title> are dropped —
    # presumably a site-name suffix; confirm against a live page.
    page_title = response.xpath('//title/text()').get()
    movie['movie_name'] = page_title.strip()[:-4].rstrip()

    info = response.xpath('//div[@id="info"]')
    attr_links = 'span[@class="attrs"]/a/text()'
    movie['director'] = info.xpath('span[1]/' + attr_links).getall()
    movie['author'] = info.xpath('span[2]/' + attr_links).getall()
    movie['actors'] = info.xpath('span[@class="actor"]/' + attr_links).getall()
    movie['movie_type'] = info.xpath('span[@property="v:genre"]/text()').getall()
    movie['official_website'] = info.xpath(
        'span[@class="pl" and text()="官方网站:"]/following-sibling::a/text()').get()
    movie['region_made'] = info.xpath(
        'span[@class="pl" and text()="制片国家/地区:"]/following-sibling::text()').get()
    movie['language'] = info.xpath(
        'span[@class="pl" and text()="语言:"]/following-sibling::text()').get()
    movie['date_published'] = info.xpath(
        'span[@property="v:initialReleaseDate"]/text()').get()
    movie['movie_length'] = info.xpath('span[@property="v:runtime"]/text()').get()
    movie['alias'] = info.xpath(
        'span[@class="pl"][6]/following-sibling::text()').get()

    movie['votes'] = response.xpath('//span[@property="v:votes"]/text()').get()
    movie['average_rating'] = response.xpath(
        '//div[@class="rating_self clearfix"]/strong/text()').get()

    # Rows of the rating histogram are listed from five stars down to one.
    rating_row = ('//div[@class="ratings-on-weight"]/div[@class="item"][%d]'
                  '/span[@class="rating_per"]/text()')
    for row, stars in enumerate(range(5, 0, -1), start=1):
        movie['stars%d_ratings' % stars] = response.xpath(rating_row % row).get()

    movie['description'] = response.xpath(
        '//span[@property="v:summary"]/text()').getall()
    movie['recommendations'] = response.xpath(
        '//div[@class="recommendations-bd"]/dl/dd/a/text()').getall()
    movie['labels'] = response.xpath('//div[@class="tags-body"]/a/text()').getall()

    interests = '//div[@class="subject-others-interests-ft"]'
    movie['collections'] = response.xpath(interests + '/a[1]/text()').get()
    movie['wishes'] = response.xpath(interests + '/a[2]/text()').get()

    yield movie
def parse(self, response):
    """Scrape ranking, name, score and vote count per movie, then follow pagination.

    The vote count is the text of the span following the ``v:best`` span
    with every non-digit character removed.

    Yields:
        One ``DoubanMovieItem`` per ``<li>`` under ``ol.grid_view``, then a
        ``Request`` (sent with ``self.headers``) for the next page when a
        "next" link is present.

    Raises:
        IndexError: if an expected element is missing from a list entry.
    """
    for movie in response.xpath('//ol[@class="grid_view"]/li'):
        # A new item per movie so yielded items do not alias one object.
        item = DoubanMovieItem()
        item['ranking'] = movie.xpath(
            './/div[@class="pic"]/em/text()').extract()[0]
        item['movie_name'] = movie.xpath(
            './/div[@class="hd"]/a/span[1]/text()').extract()[0]
        item['score'] = movie.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        votes_text = movie.xpath(
            './/div[@class="star"]/span[@property="v:best"]/following-sibling::span/text()'
        ).extract()[0]
        # Raw string: "\D" in a normal literal is an invalid escape sequence.
        item['score_num'] = re.sub(r"\D", "", votes_text)
        yield item

    next_url = response.xpath('//span[@class="next"]/a/@href').extract()
    print("#################")
    print(next_url)
    if next_url:
        next_url = 'https://movie.douban.com/top250' + next_url[0]
        yield Request(next_url, headers=self.headers)
def parse_item(self, response):
    """Build a ``DoubanMovieItem`` from a movie detail page.

    The ``div#info`` markup is stripped of tags and read line by line as
    ``label:value`` pairs (split on the first ``:``). Labels that do not
    occur fall back to ``u'未知'``.

    Returns:
        The populated ``DoubanMovieItem``.
    """
    item = DoubanMovieItem()
    item['name'] = response.xpath(
        '//span[@property="v:itemreviewed"]/text()').extract()[0]

    info_html = response.xpath('//div[@id="info"]').extract()[0]
    plain_info = re.sub(re.compile('<.*?>'), '', info_html)

    info_dict = {}
    for raw_line in plain_info.split('\n'):
        line = raw_line.strip(' ')
        if line == '':
            continue
        label, colon, value = line.partition(':')
        if colon:
            # Later lines with the same label overwrite earlier ones.
            info_dict[label] = value

    missing = u'未知'
    for field, label in (('director', u'导演'),
                         ('actors', u'主演'),
                         ('release_date', u'上映日期'),
                         ('runtime', u'片长')):
        item[field] = info_dict.get(label, missing)

    item['score'] = response.xpath(
        '//strong[@property="v:average"]/text()').extract()[0]

    for field, label in (('screen_writter', u'编剧'),
                         ('category', u'类型'),
                         ('country', u'制片国家/地区'),
                         ('language', u'语言'),
                         ('aliases', u'又名'),
                         ('IMDB', u'IMDb链接')):
        item[field] = info_dict.get(label, missing)

    return item