Пример #1
0
    def get_movie_page_main(self, response):
        """Build the movie item for a detail page, then queue its review lists.

        Reads ``title``, ``rate``, ``casts``, ``directors`` and ``id`` from
        ``response.meta`` and the runtime text from the page's ``#info`` block.

        Yields:
            One ``DoubanMovieItem``, followed by five ``Request`` objects (one
            per rating value 1-5) whose callback is ``self.get_one_review``.
            The item's ``length`` is ``None`` when the page carries no
            ``#info`` block or no ``v:runtime`` span.
        """
        # extract_first() returns None instead of raising IndexError when the
        # node is absent, so a page without runtime data no longer aborts the
        # whole callback (and with it the review requests below).
        info_html = response.xpath("//div[@id='info']").extract_first()
        runtime = None
        if info_html is not None:
            runtime = Selector(text=info_html).xpath(
                "//span[@property='v:runtime']/text()").extract_first()

        meta = response.meta
        item = DoubanMovieItem()
        item['title'] = meta['title']
        item['rate'] = meta['rate']
        item['casts'] = meta['casts']
        item['directors'] = meta['directors']
        item['length'] = runtime
        # Local name chosen so the builtin id() is not shadowed.
        movie_id = meta['id']
        item['movie_id'] = movie_id
        yield item
        print("movie id: ", movie_id)

        # One review-list request per rating value; the rating and movie id
        # travel with the request through meta.
        for rat in range(1, 6):
            url_review_list = (
                self.review_list_1 + movie_id + self.review_list_3 + str(rat))
            yield Request(url_review_list, callback=self.get_one_review,
                          meta={'rate': rat, "movie_id": movie_id})
Пример #2
0
    def parse(self, response):
        """Parse one list page: emit an item per movie, save its poster, follow pagination.

        For every ``<li>`` under ``ol.grid_view`` a ``DoubanMovieItem`` is
        filled with ranking, name, score, vote count and detail URL, and the
        poster image is downloaded into the ``pics`` directory as
        ``<movie_name>.jpg``.

        Yields:
            One ``DoubanMovieItem`` per movie, then a ``Request`` for the next
            page when a "next" link is present.
        """
        imgs_path = 'pics'
        # exist_ok replaces the separate exists() check, which could race
        # with another process creating the directory in between.
        os.makedirs(imgs_path, exist_ok=True)

        for movie in response.xpath('//ol[@class="grid_view"]/li'):
            # A fresh item per movie: re-yielding one shared instance lets a
            # later iteration overwrite fields of an item that downstream
            # code may still be holding.
            item = DoubanMovieItem()
            item['ranking'] = movie.xpath(
                './/div[@class="pic"]/em/text()').extract()[0]
            movie_name = movie.xpath(
                './/div[@class="hd"]/a/span[1]/text()').extract()[0]
            item['movie_name'] = movie_name
            item['score'] = movie.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()[0]
            item['score_num'] = movie.xpath(
                './/div[@class="star"]/span/text()').re(r'(\d+)人评价')[0]
            item['movie_url'] = movie.xpath(
                './/div[@class="pic"]/a/@href').extract()[0]

            movie_image_url = movie.xpath(
                './/div[@class="pic"]/a/img/@src').extract()[0]
            # os.path.join picks the platform's separator; the previous
            # hard-coded backslash only produced a path inside imgs_path on
            # Windows.
            # NOTE: urlretrieve is a synchronous download performed inside
            # the callback.
            urllib.request.urlretrieve(
                movie_image_url,
                os.path.join(imgs_path, '%s.jpg' % movie_name))
            yield item

        next_url = response.xpath('//span[@class="next"]/a/@href').extract()
        if next_url:
            # The extracted href is appended to the list URL as-is.
            next_url = 'https://movie.douban.com/top250' + next_url[0]
            yield Request(next_url)
    def parse_film(self, response):
        """Extract the fields of one film detail page into a ``DoubanMovieItem``.

        Multi-valued fields (directors, adaptors, starring, genre,
        release_date) are joined with ``/``. ``country`` is set only when the
        country line can be located in the ``#info`` block.

        Returns:
            The populated ``DoubanMovieItem``.

        Raises:
            CloseSpider: when the response status is 403.
        """
        if response.status == 403:
            raise CloseSpider("403 forbidden!")

        item = DoubanMovieItem()

        item['title'] = response.xpath(
            "//span[@property='v:itemreviewed']/text()").extract_first()

        item['directors'] = "/".join(
            response.xpath("//a[@rel='v:directedBy']/text()").extract())

        # Links inside the "attrs" spans that carry no rel attribute.
        item['adaptors'] = "/".join(
            response.xpath(
                "//span[@class='attrs']/a[not(@rel)]/text()").extract())

        item['starring'] = "/".join(
            response.xpath("//a[@rel='v:starring']/text()").extract())

        item['genre'] = "/".join(
            response.xpath("//span[@property='v:genre']/text()").extract())

        info = response.xpath("//div[@id='info']").extract_first()
        # extract_first() yields None when the page has no #info block;
        # re.search() would raise TypeError on None, so search only when
        # there is markup to search.
        s = None
        if info is not None:
            # The country value sits between its label span and the <br>
            # that precedes the language label.
            s = re.search(
                r'制片国家/地区:</span>(.*)<br>.*<span class="pl">语言:</span>',
                info, re.M | re.S)
        if s:
            item['country'] = s.group(1)

        item['release_date'] = "/".join(
            response.xpath(
                "//span[@property='v:initialReleaseDate']/text()").extract())

        item['runtime'] = response.xpath(
            "//span[@property='v:runtime']/text()").extract_first()

        item['rate'] = response.xpath(
            "//strong[@property='v:average']/text()").extract_first()
        return item
Пример #4
0
    def parse(self, response):
        """Parse one list page with BeautifulSoup and follow the "next" link.

        Every ``div.item`` becomes a ``DoubanMovieItem`` with ``name``,
        ``url`` and ``score``. The number of items and the next-page href are
        printed for debugging.

        Yields:
            One ``DoubanMovieItem`` per movie, then a ``scrapy.Request`` for
            the next page (sent with ``self.headers``) when one exists.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        movies = soup.find_all('div', {'class': 'item'})

        n = 0
        for movie in movies:
            # A fresh item per movie so earlier yielded items are not
            # overwritten by later iterations.
            item = DoubanMovieItem()
            # The attribute filters were previously written as set literals
            # ({'class', 'title'}); they are meant to be class filters.
            item['name'] = movie.find('span', {'class': 'title'}).get_text()
            item['url'] = movie.find('a')['href']
            item['score'] = movie.find('span',
                                       {'class': 'rating_num'}).get_text()
            n = n + 1
            yield item

        print(n)
        # On a page without a following page the "next" span may be missing
        # or may contain no <a>; resolve each step defensively instead of
        # subscripting None.
        next_span = soup.find('span', {'class': 'next'})
        next_link = next_span.find('a') if next_span is not None else None
        next_url = next_link.get('href') if next_link is not None else None
        print(type(next_url), next_url)
        if next_url:
            url = 'https://movie.douban.com/top250' + next_url
            yield scrapy.Request(url, headers=self.headers)
Пример #5
0
    def parse(self, response):
        """Parse one list page and emit a ``DoubanMovieItem`` per movie.

        Each item carries ranking, movie name, score, vote count and detail
        URL, and is logged at warning level before being yielded. Pagination
        is currently disabled (see the commented-out code at the end).

        Yields:
            One ``DoubanMovieItem`` per ``<li>`` under ``ol.grid_view``.
        """
        for movie in response.xpath('//ol[@class="grid_view"]/li'):
            # A fresh item per movie: re-yielding one shared instance lets a
            # later iteration overwrite fields of an item that downstream
            # code may still be holding.
            item = DoubanMovieItem()
            item['ranking'] = movie.xpath(
                './/div[@class="pic"]/em/text()').extract()[0]
            item['movie_name'] = movie.xpath(
                './/div[@class="hd"]/a/span[1]/text()').extract()[0]
            item['score'] = movie.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()[0]
            item['score_num'] = movie.xpath(
                './/div[@class="star"]/span/text()').re(r'(\d+)人评价')[0]
            item['movie_url'] = movie.xpath(
                './/div[@class="hd"]/a/@href').extract()[0]
            logger.warning(item)  # write the item to the log
            #logger.debug(item)
            yield item

        # next_url = response.xpath('//span[@class="next"]/a/@href').extract()
        # if next_url:
        #     next_url = 'https://movie.douban.com/top250' + next_url[0]
        #     yield Request(next_url, headers=self.headers)
Пример #6
0
    def parse_movies(self, response):
        """Scrape a single movie detail page into one ``DoubanMovieItem``.

        ``movie_id`` is the fifth ``/``-separated segment of ``response.url``.
        ``movie_name`` is the page ``<title>`` text with surrounding
        whitespace removed and its last four characters dropped. The other
        fields are read by XPath from the ``#info`` block, the rating widgets
        and the rest of the page.

        Yields:
            The populated ``DoubanMovieItem``.
        """
        _setDNSCache()

        item = DoubanMovieItem()
        item['movie_id'] = response.url.split('/')[4]

        page_title = response.xpath('//title/text()').get()
        # Trim whitespace, cut the four-character suffix, trim what is left.
        item['movie_name'] = page_title.strip()[:-4].rstrip()

        info = response.xpath('//div[@id="info"]')
        # (item field, XPath relative to #info, collect every match?)
        info_fields = (
            ('director', 'span[1]/span[@class="attrs"]/a/text()', True),
            ('author', 'span[2]/span[@class="attrs"]/a/text()', True),
            ('actors', 'span[@class="actor"]/span[@class="attrs"]/a/text()', True),
            ('movie_type', 'span[@property="v:genre"]/text()', True),
            ('official_website', 'span[@class="pl" and text()="官方网站:"]/following-sibling::a/text()', False),
            ('region_made', 'span[@class="pl" and text()="制片国家/地区:"]/following-sibling::text()', False),
            ('language', 'span[@class="pl" and text()="语言:"]/following-sibling::text()', False),
            ('date_published', 'span[@property="v:initialReleaseDate"]/text()', False),
            ('movie_length', 'span[@property="v:runtime"]/text()', False),
            ('alias', 'span[@class="pl"][6]/following-sibling::text()', False),
        )
        for field, path, collect_all in info_fields:
            matches = info.xpath(path)
            item[field] = matches.getall() if collect_all else matches.get()

        item['votes'] = response.xpath('//span[@property="v:votes"]/text()').get()
        item['average_rating'] = response.xpath(
            '//div[@class="rating_self clearfix"]/strong/text()').get()

        # The rating rows are read top-down: row 1 feeds the 5-star field,
        # row 5 feeds the 1-star field.
        for position, stars in enumerate((5, 4, 3, 2, 1), start=1):
            item['stars%d_ratings' % stars] = response.xpath(
                '//div[@class="ratings-on-weight"]/div[@class="item"][%d]/span[@class="rating_per"]/text()'
                % position).get()

        item['description'] = response.xpath(
            '//span[@property="v:summary"]/text()').getall()
        item['recommendations'] = response.xpath(
            '//div[@class="recommendations-bd"]/dl/dd/a/text()').getall()
        item['labels'] = response.xpath(
            '//div[@class="tags-body"]/a/text()').getall()

        interests = '//div[@class="subject-others-interests-ft"]/a[%d]/text()'
        item['collections'] = response.xpath(interests % 1).get()
        item['wishes'] = response.xpath(interests % 2).get()

        yield item
Пример #7
0
    def parse(self, response):
        """Parse one list page, emit an item per movie and follow pagination.

        Each item carries ranking, movie name, score and the vote count
        reduced to its digits. The extracted next-page href list is printed
        for debugging.

        Yields:
            One ``DoubanMovieItem`` per ``<li>`` under ``ol.grid_view``, then
            a ``Request`` for the next page (sent with ``self.headers``) when
            a "next" link is present.
        """
        for movie in response.xpath('//ol[@class="grid_view"]/li'):
            # A fresh item per movie: re-yielding one shared instance lets a
            # later iteration overwrite fields of an item that downstream
            # code may still be holding.
            item = DoubanMovieItem()
            item['ranking'] = movie.xpath(
                './/div[@class="pic"]/em/text()').extract()[0]
            item['movie_name'] = movie.xpath(
                './/div[@class="hd"]/a/span[1]/text()').extract()[0]
            item['score'] = movie.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()[0]
            temp = movie.xpath(
                './/div[@class="star"]/span[@property="v:best"]/following-sibling::span/text()'
            ).extract()[0]
            # Raw string: "\D" in a normal literal is an invalid escape
            # sequence. Keeps only the digits of the vote-count text.
            item['score_num'] = re.sub(r"\D", "", temp)
            yield item

        next_url = response.xpath('//span[@class="next"]/a/@href').extract()
        print("#################")
        print(next_url)
        if next_url:
            next_url = 'https://movie.douban.com/top250' + next_url[0]
            yield Request(next_url, headers=self.headers)
Пример #8
0
 def parse_item(self, response):
     """Turn one movie detail page into a ``DoubanMovieItem``.

     ``name`` and ``score`` come straight from XPath lookups. The other
     fields are read from the ``#info`` block: its markup is stripped of
     tags and each remaining line is split once on ``:`` into a label and a
     value. A label that is not found falls back to the placeholder string.

     Returns:
         The populated ``DoubanMovieItem``.
     """
     item = DoubanMovieItem()
     item['name'] = response.xpath('//span[@property="v:itemreviewed"]/text()').extract()[0]
     info_html = response.xpath('//div[@id="info"]').extract()[0]

     # Remove every tag, then collect "label:value" lines into a lookup.
     info_dict = {}
     for raw_line in re.sub('<.*?>', '', info_html).split('\n'):
         line = raw_line.strip(' ')
         if line == '':
             continue
         parts = line.split(':', 1)
         if len(parts) == 2:
             info_dict[parts[0]] = parts[1]

     unknown = u'未知'
     leading_fields = (
         ('director', u'导演'),
         ('actors', u'主演'),
         ('release_date', u'上映日期'),
         ('runtime', u'片长'),
     )
     trailing_fields = (
         ('screen_writter', u'编剧'),
         ('category', u'类型'),
         ('country', u'制片国家/地区'),
         ('language', u'语言'),
         ('aliases', u'又名'),
         ('IMDB', u'IMDb链接'),
     )

     for field, label in leading_fields:
         item[field] = info_dict.get(label, unknown)
     # The score is filled in between the two groups to keep the item's
     # field assignment order.
     item['score'] = response.xpath('//strong[@property="v:average"]/text()').extract()[0]
     for field, label in trailing_fields:
         item[field] = info_dict.get(label, unknown)
     return item