Example #1
def __init__(self, all_reviews=False, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.all_reviews = all_reviews
    self.session = MetacriticReviewsPipeline()
    recent_review = self.session.get_most_recent_review()
    if recent_review:
        self.recent_album = recent_review.album
        self.recent_artist = recent_review.artist
        self.recent_date = recent_review.review_date
    elif not self.all_reviews:
        raise ValueError(
            "Couldn't determine where to stop searching for reviews, and all_reviews was not set."
        )
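For context, a hypothetical sketch of the interface these examples assume
MetacriticReviewsPipeline to provide; the real class is project-specific and
not shown on this page:

# Hypothetical interface only -- the actual pipeline is project-specific.
class MetacriticReviewsPipeline:
    def get_most_recent_review(self):
        """Return the newest stored review (an object with .album, .artist
        and .review_date attributes), or None if the database is empty."""

    def album_in_database(self, artist, album, review_date):
        """Return True if a review for this (artist, album, review_date)
        is already stored."""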
Example #2
import logging
from datetime import datetime
from time import strptime

import scrapy

# MetacriticItem and MetacriticReviewsPipeline come from the project's
# items and pipelines modules (project-specific imports, omitted here).

class MetacriticSpider(scrapy.Spider):
    name = 'metacritic'
    allowed_domains = ["metacritic.com"]
    start_urls = ['http://www.metacritic.com/browse/albums/release-date/available/date?view=condensed']
    custom_settings = {
        'METACRITICREVIEWSPIPELINE_ENABLED': True
    }

    def __init__(self, all_reviews=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.all_reviews = all_reviews
        self.session = MetacriticReviewsPipeline()
        recent_review = self.session.get_most_recent_review()
        if recent_review:
            self.recent_album = recent_review.album
            self.recent_artist = recent_review.artist
            self.recent_date = recent_review.review_date
        elif not self.all_reviews:
            raise ValueError(
                "Couldn't determine where to stop searching for reviews, and all_reviews was not set."
            )

    def parse(self, response):
        try:
            href = response.css('span.flipper.next > a::attr(href)').extract()[0]
            next_page = response.urljoin(href)
        except IndexError:  # if there's no Next link, we're hopefully on the last page
            next_page = None
        for album in response.css('#main ol > li > div.product_wrap'):
            # create a fresh item for each album so values from a previous
            # iteration can't leak into this one
            item = MetacriticItem()
            try:
                item['album'] = album.xpath('./div[@class="basic_stat product_title"]/a/text()').extract()[0]
                item['album'] = item['album'].strip(' \n')
                item['critic_score'] = album.xpath(
                        './div[@class="basic_stat product_score brief_metascore"]/div/text()').extract()[0]
                stats = album.xpath('./div[@class="basic_stat condensed_stats"]')
                item['artist'] = stats.xpath('.//li[@class="stat product_artist"]/span/text()').extract()[1]
                item['user_score'] = stats.xpath('.//li[@class="stat product_avguserscore"]/span/text()').extract()[1]
                date_string = stats.xpath('.//li[@class="stat release_date full_release_date"]/span/text()')\
                    .extract()[1]
            except IndexError:
                # skip this album if any of its fields are missing
                continue
            try:
                item['review_date'] = strptime(date_string, '%b %d, %Y')
                item['year'] = item['review_date'].tm_year
                item['review_date'] = datetime(*item['review_date'][:6])  # unpack the struct_time fields
            except ValueError:
                # skip albums whose date we can't parse; review_date is needed
                # for the stop check and database lookup below
                continue
            try:
                # convert critic_score first since user_score may be 'tbd' when there aren't enough reviews
                item['critic_score'] = float(item['critic_score'])
                item['user_score'] = float(item['user_score'])
            except ValueError:
                del item['user_score']  # if it's 'tbd', drop the field so the pipeline stores NaN
            if self.stop_check(item['artist'], item['album'], item['review_date']):
                logging.info('Stop check passed -- quitting')
                # raising StopIteration inside a generator is an error since PEP 479
                return
            if self.session.album_in_database(item['artist'], item['album'], item['review_date']):
                # TODO implement the update in the pipeline so we can replace 'tbd' values if they're new
                continue
            yield item
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse)

    def stop_check(self, artist, album, date):
        """
        decide if we should stop searching. might be good to move if self.all_reviews above
        so we can avoid a function call but leaving it here because it encapsulates logic even
        though it's slower.
        @param artist: artist name, will be compared to the splash_artist in the DB
        @param album: album name, will be compared to splash_album in the DB
        @param date: date in datetime, will be compared to review_date in the DB
        @return: True if we should stop, False otherwise
        """
        if self.all_reviews:
            return False
        if artist == self.recent_artist and album == self.recent_album and self.recent_date == date:
            return True
        return False
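The date handling above round-trips through time.struct_time before building a
datetime. A minimal standalone illustration of that conversion, using a made-up
date:

from time import strptime
from datetime import datetime

st = strptime('Jan 5, 2016', '%b %d, %Y')  # returns a time.struct_time
dt = datetime(*st[:6])  # the first six fields are year through second
assert dt == datetime(2016, 1, 5)

The same result is available in one step as datetime.strptime(date_string,
'%b %d, %Y'), which skips the struct_time round-trip entirely.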
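Finally, a sketch of one way to run the spider, assuming it lives in a standard
Scrapy project alongside the item and pipeline classes it imports:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(MetacriticSpider, all_reviews=True)
process.start()  # blocks until the crawl finishes

The equivalent command line, scrapy crawl metacritic -a all_reviews=1, passes
all_reviews through as the string '1' rather than a boolean, so explicit
bool handling in __init__ may be wanted.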