Пример #1
0
    def process_item(self, item, spider):
        """Persist a scraped item according to its type and hand it on.

        Each supported item class is routed to its own storage helper.
        Review detail items older than the cutoff returned by
        ``Helper.delay_forty_days()`` are dropped instead of stored.

        Args:
            item: The scraped item to store.
            spider: The spider that produced the item (unused here).

        Returns:
            The item, so later pipeline stages receive it rather than
            ``None``.

        Raises:
            DropItem: If a ``ReviewDetailItem`` is dated before the cutoff.
        """
        if isinstance(item, CateItem):
            Sql.insert_cate_log(item)
            print('save category: ' + item['title'])

        if isinstance(item, AsinBestItem):
            Sql.cache_best_asin(item)
            print('save best seller: ' + item['asin'])

        if isinstance(item, ReviewProfileItem):
            ReviewSql.insert_profile_item(item)
            return item

        if isinstance(item, ReviewDetailItem):
            delay_date = Helper.delay_forty_days()  # 40-day cutoff date
            item_date = Helper.convert_date_str(item['date'])
            # Reviews dated before the cutoff are out of range: discard them.
            if item_date < delay_date:
                raise DropItem('the review_id:[%s] has been expired' % item['review_id'])
            # review_url is prefixed with the site origin before storing.
            item['review_url'] = 'https://www.amazon.com' + item['review_url']
            item['date'] = item_date.strftime('%Y-%m-%d')
            ReviewSql.insert_detail_item(item)
            return item

        if isinstance(item, SalesRankingItem):
            RankingSql.insert_sales_ranking(item)
            return item

        if isinstance(item, KeywordRankingItem):
            RankingSql.insert_keyword_ranking(item)
            return item

        # Category / best-seller items and unrecognised item types end up
        # here; return the item instead of implicitly returning None.
        return item
    def get_detail(self, response):
        """Schedule the profile refresh and the review-page requests.

        The review total shown on the page is compared with
        ``self.last_review``. When they differ, ``self.updated`` is set, a
        request for the product-reviews page is yielded (handled by
        ``self.profile_parse``), followed by one request per review page
        (handled by ``self.parse``). When they are equal, nothing is
        requested.
        """
        # Text nodes of the pagination links.
        pager_links = response.css('ul.a-pagination li a::text')

        # Total number of reviews currently shown on the page.
        totals = response.css(
            '.AverageCustomerReviews .totalReviewCount::text').extract()
        current_total = Helper.get_num_split_comma(totals[0])
        previous_total = self.last_review
        delta = int(current_total) - int(previous_total)

        if delta == 0:
            print('there is no item to update')
            return

        # A non-zero difference means the review set changed: refresh the profile.
        self.updated = True
        yield scrapy.Request(
            'https://www.amazon.com/product-reviews/%s' % self.asin,
            callback=self.profile_parse)

        # Fewer than 3 pagination links means no pager component, i.e. a single page.
        if len(pager_links) < 3:
            yield scrapy.Request(url=response.url + '&pageNumber=1',
                                 callback=self.parse)
            return

        if self.daily:
            # presumably 10 reviews per page; only the pages holding the delta are fetched
            page_num = math.ceil(delta / 10)
            print('update item page_num is %s' % page_num)
        else:
            self.profile_update_self = True
            # The third link from the end carries the total page count.
            page_num = Helper.get_num_split_comma(pager_links[-3].extract())

        for page_no in range(1, int(page_num) + 1):
            yield scrapy.Request(url=response.url + '&pageNumber=%s' % page_no,
                                 callback=self.parse)
 def start_requests(self):
     """Yield one search request per keyword in ``self.keyword_pool``.

     Each request is handled by ``self.load_first_page`` and carries the
     keyword's pool entry in ``meta['items']``.
     """
     for keyword, poll in self.keyword_pool.items():
         # The random ``t`` value is appended to the template first, then
         # the keyword is substituted into the combined string.
         url_template = ('https://www.amazon.com/s/?field-keywords=%s&t=' +
                         Helper.random_str(10))
         search_url = url_template % keyword
         yield scrapy.Request(search_url,
                              callback=self.load_first_page,
                              meta={'items': poll})
 def parse(self, response):
     """Extract the sales rank of the product page and yield it as an item.

     The rank text is searched for in ``//div/table`` first and, when
     nothing matches there, in ``div #SalesRank``.

     Raises:
         Exception: If neither location contains a sales-rank string.
     """
     rank_pattern = r'#[0-9,]+(?:.*)in.*\(.*[Ss]ee [Tt]op.*\)'

     matches = response.xpath('//div/table').re(rank_pattern)
     if not matches:
         # Fall back to the dedicated sales-rank element.
         matches = response.css('div #SalesRank').re(rank_pattern)

     if not matches:
         raise Exception('catch asin[%s] sales ranking error' %
                         response.meta['item']['asin'])

     item = SalesRankingItem()
     rank_parts = Helper.get_rank_classify(matches[0])
     item['rank'] = Helper.get_num_split_comma(rank_parts[0])
     item['classify'] = rank_parts[1]
     item['asin'] = response.meta['item']['asin']
     yield item
Пример #5
0
 def parse(self, response):
     """Yield one ``ReviewDetailItem`` per review block on the page.

     Every field is taken from the first node matching its selector inside
     the review block; a missing node raises ``IndexError``.
     """
     for review in response.css('.review-views .review'):

         def first(selector):
             # First match of ``selector`` within the current review block.
             return review.css(selector)[0].extract()

         item = ReviewDetailItem()
         item['asin'] = self.asin
         item['review_id'] = first('div::attr(id)')
         item['reviewer'] = first('.author::text')
         item['title'] = first('.review-title::text')
         item['review_url'] = first('.review-title::attr(href)')
         item['date'] = Helper.get_date_split_str(first('.review-date::text'))
         item['star'] = Helper.get_star_split_str(
             first('.review-rating span::text'))
         # The text fragments are joined with HTML line breaks; joining an
         # empty list yields an empty string.
         fragments = review.css('.review-data .review-text::text').extract()
         item['content'] = '<br />'.join(fragments)
         yield item
Пример #6
0
    def get_detail(self, response):
        """Yield the requests needed to bring the stored reviews up to date.

        Nothing is requested when the page's review total equals
        ``self.last_review``. Otherwise ``self.updated`` is set, the
        product-reviews page is requested for ``self.profile_parse``, and
        pages ``1..N`` of the current URL are requested for ``self.parse``.
        """
        pagination = response.css('ul.a-pagination li a::text')

        # Review total as currently displayed on the page.
        total_texts = response.css(
            '.AverageCustomerReviews .totalReviewCount::text').extract()
        now_total = Helper.get_num_split_comma(total_texts[0])
        last_review = self.last_review
        new_reviews = int(now_total) - int(last_review)

        if new_reviews == 0:
            print('there is no item to update')
            return

        # The totals differ, so the profile has to be refreshed as well.
        self.updated = True
        profile_url = 'https://www.amazon.com/product-reviews/%s' % self.asin
        yield scrapy.Request(profile_url, callback=self.profile_parse)

        if len(pagination) < 3:
            # Fewer than 3 pagination links: there is only one page of data.
            last_page = 1
        elif self.daily:
            # presumably 10 reviews per page — TODO confirm
            last_page = math.ceil(new_reviews / 10)
            print('update item page_num is %s' % last_page)
        else:
            self.profile_update_self = True
            # The third link from the end holds the total page count.
            last_page = Helper.get_num_split_comma(pagination[-3].extract())

        for number in range(1, int(last_page) + 1):
            page_url = response.url + '&pageNumber=%s' % number
            yield scrapy.Request(url=page_url, callback=self.parse)
Пример #7
0
    def profile_parse(self, response):
        """Build and yield the ``ReviewProfileItem`` for ``self.asin``.

        Rating, review total, product name, brand, image and the five
        per-star percentages are read from the page. A selector that
        matches nothing raises ``IndexError``.
        """

        def first(selector):
            # First extracted match for a CSS selector on this response.
            return response.css(selector).extract()[0]

        item = ReviewProfileItem()
        item['asin'] = self.asin

        # Average star rating.
        item['review_rate'] = Helper.get_star_split_str(
            first('.averageStarRatingNumerical a span::text'))
        # Total number of reviews.
        item['review_total'] = Helper.get_num_split_comma(
            first('.AverageCustomerReviews .totalReviewCount::text'))

        item['product'] = first('.product-title h1 a::text')
        item['brand'] = first('.product-by-line a::text')
        item['image'] = first('.product-image img::attr(src)')
        # The seller field is filled with the brand value.
        item['seller'] = item['brand']

        # Histogram percentages with the trailing '%' removed; they are
        # stored in page order starting with pct_five.
        histogram = response.css(
            '.reviewNumericalSummary .histogram '
            '#histogramTable tr td:last-child').re(r'\d{1,3}\%')
        pct = [cell[:-1] for cell in histogram]

        star_fields = ('pct_five', 'pct_four', 'pct_three', 'pct_two',
                       'pct_one')
        for position, field in enumerate(star_fields):
            item[field] = pct[position]

        yield item
Пример #8
0
    def parse(self, response):
        """Scrape the review profile of a product page and yield it.

        The ASIN comes from ``response.meta['asin']`` when that key is
        present, otherwise from ``self.asin``.
        """
        profile = ReviewProfileItem()

        if 'asin' in response.meta:
            profile['asin'] = response.meta['asin']
        else:
            profile['asin'] = self.asin

        # Average rating value.
        rating_texts = response.css(
            '.averageStarRatingNumerical a span::text').extract()
        profile['review_rate'] = Helper.get_star_split_str(rating_texts[0])

        # Total number of reviews.
        total_texts = response.css(
            '.AverageCustomerReviews .totalReviewCount::text').extract()
        profile['review_total'] = Helper.get_num_split_comma(total_texts[0])

        # Product name.
        name_texts = response.css('.product-title h1 a::text').extract()
        profile['product'] = name_texts[0]

        # Product brand and image.
        brand_texts = response.css('.product-by-line a::text').extract()
        profile['brand'] = brand_texts[0]
        image_sources = response.css('.product-image img::attr(src)').extract()
        profile['image'] = image_sources[0]

        # The seller is recorded as the brand.
        profile['seller'] = profile['brand']

        # Percentage for each star level, without the trailing '%'.
        percent_cells = response.css(
            '.reviewNumericalSummary .histogram '
            '#histogramTable tr td:last-child').re(r'\d{1,3}\%')
        pct = [cell[0:-1] for cell in percent_cells]

        ordered_fields = ['pct_five', 'pct_four', 'pct_three', 'pct_two',
                          'pct_one']
        for idx in range(len(ordered_fields)):
            profile[ordered_fields[idx]] = pct[idx]

        yield profile
Пример #9
0
    def fetch_detail_from_review_page(self, response):
        """Build a ``DetailItem`` from a product review page.

        Args:
            response: Response for the review page; ``response.meta['asin']``
                must hold the product's ASIN.

        Returns:
            A ``DetailItem`` with asin, image, title, star, reviews,
            seller_price (always 0) and amazon_price filled in.

        Raises:
            IndexError: If any of the expected page elements, or the
                "N.N out of" rating text, is missing.
        """
        info = response.css('#cm_cr-product_info')[0].extract()
        item = DetailItem()
        item['asin'] = response.meta['asin']

        image_src = response.css('.product-image img::attr(src)')[0].extract()
        # 'S60' in the image URL is swapped for 'S320'
        # (presumably a size token selecting a larger image — TODO confirm).
        item['image'] = image_src.strip().replace('S60', 'S320')
        item['title'] = response.css(
            '.product-title >h1>a::text')[0].extract().strip()

        # The dot is escaped so only a literal decimal point matches; the
        # unescaped form accepted any character between the two digits.
        item['star'] = re.findall(r"([0-9]\.[0-9]) out of", info)[0]

        # Total number of reviews.
        review_count = response.css(
            '.AverageCustomerReviews .totalReviewCount::text')[0].extract().strip()
        item['reviews'] = Helper.get_num_split_comma(review_count)

        item['seller_price'] = 0
        # Price text with any leading '$' characters removed.
        price = response.css('.arp-price::text')[0].extract().strip().lstrip('$')
        item['amazon_price'] = price
        return item
Пример #10
0
 def parse(self, response):
     """Record the search-result rank of every tracked keyword item.

     For each entry in ``response.meta['items']`` the result list is
     scanned for the entry's ASIN. On the first match the rank is appended
     to ``self.store_poll`` and the current date is written to
     ``self.store_date``. When the page has no result rows at all, the
     entry is marked ``'none'`` in ``self.found`` and a warning is logged.
     """
     result_li = response.xpath('//li[@data-asin]')
     page_is_empty = len(result_li) == 0

     for item in response.meta['items']:
         if page_is_empty:
             self.found[item['id']] = 'none'
             logging.warning("[keyword none]  url: [%s] skwd_id:[%s] asin:[%s] \r\n body: %s" % (response.url, item['id'],item['asin'], response.body))
             continue

         for result in result_li:
             data_asin = result.xpath('./@data-asin').extract()[0]
             if data_asin != item['asin']:
                 continue

             self.found[item['id']] = True
             # The number after the first '_' in the row id, plus one, is
             # used as the rank.
             row_id = result.xpath('./@id').extract()[0]
             rank = int(row_id.split('_')[1]) + 1
             if item['id'] in self.store_poll:
                 self.store_poll[item['id']].append(rank)
             else:
                 self.store_poll[item['id']] = [rank]
             self.store_date[item['id']] = Helper.get_now_date()
             # Only the first matching row counts.
             break
Пример #11
0
 def get_detail(self, response):
     """Print the review total read from the page.

     NOTE(review): this is the last definition in view and may continue
     beyond what is shown here.
     """
     # Get the total number of reviews.
     total = response.css('.AverageCustomerReviews .totalReviewCount::text').extract()  # all matching text nodes
     # Only the first match is used; an empty result raises IndexError.
     # presumably get_num_split_comma strips thousands separators — TODO confirm
     now_total = Helper.get_num_split_comma(total[0])
     print(now_total)