def process_item(self, item, spider):
    """Route each scraped item to its persistence backend.

    Scrapy pipeline entry point: dispatches on the item's concrete type
    and writes it to the matching storage helper.  Always returns the
    item so downstream pipelines still receive it — the original fell
    through to a bare ``pass`` (returning None) for ``CateItem`` and
    ``AsinBestItem``, silently dropping them from the pipeline chain.

    Raises:
        DropItem: for a ReviewDetailItem older than the 40-day window.
    """
    if isinstance(item, CateItem):
        Sql.insert_cate_log(item)
        print('save category: ' + item['title'])
    elif isinstance(item, AsinBestItem):
        Sql.cache_best_asin(item)
        print('save best seller: ' + item['asin'])
    elif isinstance(item, ReviewProfileItem):
        ReviewSql.insert_profile_item(item)
    elif isinstance(item, ReviewDetailItem):
        # Reviews older than the 40-day cutoff are outside the crawl
        # window and are discarded.
        delay_date = Helper.delay_forty_days()  # cutoff: 40 days ago
        item_date = Helper.convert_date_str(item['date'])
        if item_date < delay_date:
            raise DropItem('the review_id:[%s] has been expired'
                           % item['review_id'])
        item['review_url'] = 'https://www.amazon.com' + item['review_url']
        item['date'] = item_date.strftime('%Y-%m-%d')
        ReviewSql.insert_detail_item(item)
    elif isinstance(item, SalesRankingItem):
        RankingSql.insert_sales_ranking(item)
    elif isinstance(item, KeywordRankingItem):
        RankingSql.insert_keyword_ranking(item)
    return item
def get_detail(self, response):
    """Compare the live review total against the stored count and, when
    new reviews exist, refresh the profile and schedule one request per
    review page that needs crawling."""
    pagination_links = response.css('ul.a-pagination li a::text')
    # total review count shown on the page, e.g. "1,234"
    count_text = response.css('.AverageCustomerReviews .totalReviewCount::text'
                              ).extract()
    current_total = int(Helper.get_num_split_comma(count_text[0]))
    new_reviews = current_total - int(self.last_review)
    if new_reviews == 0:
        print('there is no item to update')
        return
    # New reviews found: refresh the profile first.
    self.updated = True
    yield scrapy.Request('https://www.amazon.com/product-reviews/%s'
                         % self.asin, callback=self.profile_parse)
    if len(pagination_links) < 3:
        # fewer than 3 pager anchors means there is no pager widget,
        # i.e. a single page of reviews
        yield scrapy.Request(url=response.url + '&pageNumber=1',
                             callback=self.parse)
        return
    if self.daily:
        # daily mode: only crawl as many pages as the delta requires
        page_count = math.ceil(new_reviews / 10)
        print('update item page_num is %s' % page_count)
    else:
        self.profile_update_self = True
        # the third-from-last pager anchor carries the last page number
        page_count = Helper.get_num_split_comma(
            pagination_links[len(pagination_links) - 3].extract())
    for page_no in range(1, int(page_count) + 1):
        yield scrapy.Request(url=response.url + '&pageNumber=%s' % page_no,
                             callback=self.parse)
def start_requests(self):
    """Yield one Amazon search request per keyword in the pool.

    A fresh random 10-char suffix is generated for every request
    (presumably a cache-buster — confirm against Helper.random_str);
    the keyword's item pool rides along in the request meta.
    """
    for keyword, pool_items in self.keyword_pool.items():
        url = ('https://www.amazon.com/s/?field-keywords=%s&t='
               + Helper.random_str(10)) % keyword
        yield scrapy.Request(url, self.load_first_page,
                             meta={'items': pool_items})
def parse(self, response):
    """Extract the sales rank (e.g. "#123 in Category (See Top ...)")
    from a product page and emit a SalesRankingItem; raises when neither
    page layout contains a rank string."""
    rank_pattern = r'#[0-9,]+(?:.*)in.*\(.*[Ss]ee [Tt]op.*\)'
    matches = response.xpath('//div/table').re(rank_pattern)
    if len(matches) == 0:
        # fall back to the alternate layout with a #SalesRank container
        matches = response.css('div #SalesRank').re(rank_pattern)
    if len(matches) == 0:
        raise Exception('catch asin[%s] sales ranking error'
                        % response.meta['item']['asin'])
    parts = Helper.get_rank_classify(matches[0])
    ranking = SalesRankingItem()
    ranking['rank'] = Helper.get_num_split_comma(parts[0])
    ranking['classify'] = parts[1]
    ranking['asin'] = response.meta['item']['asin']
    yield ranking
def parse(self, response):
    """Turn every review card on a product-reviews page into a
    ReviewDetailItem."""
    for review in response.css('.review-views .review'):
        detail = ReviewDetailItem()
        detail['asin'] = self.asin
        detail['review_id'] = review.css('div::attr(id)')[0].extract()
        detail['reviewer'] = review.css('.author::text')[0].extract()
        detail['title'] = review.css('.review-title::text')[0].extract()
        detail['review_url'] = review.css(
            '.review-title::attr(href)')[0].extract()
        raw_date = review.css('.review-date::text')[0].extract()
        detail['date'] = Helper.get_date_split_str(raw_date)
        raw_star = review.css('.review-rating span::text')[0].extract()
        detail['star'] = Helper.get_star_split_str(raw_star)
        # review body may span several text nodes; join them with <br />
        body_parts = review.css('.review-data .review-text::text').extract()
        detail['content'] = '<br />'.join(body_parts) if body_parts else ''
        yield detail
def get_detail(self, response):
    """Schedule review-page crawls when the on-site review total differs
    from the stored count; otherwise report nothing to do."""
    pager = response.css('ul.a-pagination li a::text')
    totals = response.css(
        '.AverageCustomerReviews .totalReviewCount::text').extract()
    # delta between the count on the page and the last stored count
    delta = int(Helper.get_num_split_comma(totals[0])) - int(self.last_review)
    if delta != 0:
        self.updated = True
        # new reviews exist: refresh the profile first
        yield scrapy.Request('https://www.amazon.com/product-reviews/%s'
                             % self.asin, callback=self.profile_parse)
        if len(pager) < 3:
            # no pager widget rendered: all reviews fit on one page
            yield scrapy.Request(url=response.url + '&pageNumber=1',
                                 callback=self.parse)
        else:
            if self.daily:
                # daily mode: crawl just enough pages to cover the delta
                pages = math.ceil(delta / 10)
                print('update item page_num is %s' % pages)
            else:
                self.profile_update_self = True
                # third-from-last pager link holds the final page number
                pages = Helper.get_num_split_comma(pager[-3].extract())
            page_no = 1
            while page_no <= int(pages):
                yield scrapy.Request(url=response.url + '&pageNumber=%s'
                                     % page_no, callback=self.parse)
                page_no += 1
    else:
        print('there is no item to update')
def profile_parse(self, response):
    """Build a ReviewProfileItem from a product-reviews page: average
    rating, review total, product/brand info and per-star percentages."""
    profile = ReviewProfileItem()
    profile['asin'] = self.asin
    # average star rating text, e.g. "4.3 out of 5 stars"
    rate_text = response.css(
        '.averageStarRatingNumerical a span::text').extract()
    profile['review_rate'] = Helper.get_star_split_str(rate_text[0])
    # total review count, e.g. "1,234"
    count_text = response.css(
        '.AverageCustomerReviews .totalReviewCount::text').extract()
    profile['review_total'] = Helper.get_num_split_comma(count_text[0])
    profile['product'] = response.css(
        '.product-title h1 a::text').extract()[0]
    profile['brand'] = response.css('.product-by-line a::text').extract()[0]
    profile['image'] = response.css(
        '.product-image img::attr(src)').extract()[0]
    # the page exposes no separate seller field, so reuse the brand
    profile['seller'] = profile['brand']
    # histogram column, e.g. ['75%', '10%', ...] ordered five stars -> one
    histogram = response.css(
        '.reviewNumericalSummary .histogram '
        '#histogramTable tr td:last-child').re(r'\d{1,3}\%')
    percentages = [value[0:-1] for value in histogram]  # strip trailing '%'
    profile['pct_five'] = percentages[0]
    profile['pct_four'] = percentages[1]
    profile['pct_three'] = percentages[2]
    profile['pct_two'] = percentages[3]
    profile['pct_one'] = percentages[4]
    yield profile
def parse(self, response):
    """Scrape the review-profile summary for an ASIN.

    The ASIN comes from the request meta when present, otherwise from
    the spider instance itself.
    """
    item = ReviewProfileItem()
    item['asin'] = response.meta.get('asin', self.asin)
    # average star rating, e.g. "4.3 out of 5 stars"
    item['review_rate'] = Helper.get_star_split_str(
        response.css('.averageStarRatingNumerical a span::text').extract()[0])
    # total review count, e.g. "1,234"
    item['review_total'] = Helper.get_num_split_comma(
        response.css('.AverageCustomerReviews .totalReviewCount::text'
                     ).extract()[0])
    # product name, brand and main image
    item['product'] = response.css('.product-title h1 a::text').extract()[0]
    item['brand'] = response.css('.product-by-line a::text').extract()[0]
    item['image'] = response.css(
        '.product-image img::attr(src)').extract()[0]
    # no distinct seller on this page; fall back to the brand
    item['seller'] = item['brand']
    # per-star percentages from the histogram, five stars down to one
    raw_pcts = response.css(
        '.reviewNumericalSummary .histogram '
        '#histogramTable tr td:last-child').re(r'\d{1,3}\%')
    pct = [p[0:-1] for p in raw_pcts]  # strip trailing '%'
    item['pct_five'] = pct[0]
    item['pct_four'] = pct[1]
    item['pct_three'] = pct[2]
    item['pct_two'] = pct[3]
    item['pct_one'] = pct[4]
    yield item
def fetch_detail_from_review_page(self, response):
    """Build a DetailItem from a product-reviews page.

    Reads the product-info block for the star rating, plus the title,
    image, review count and Amazon price.  ``seller_price`` is not on
    this page and defaults to 0.
    """
    info = response.css('#cm_cr-product_info')[0].extract()
    item = DetailItem()
    item['asin'] = response.meta['asin']
    # request the 320px rendition instead of the 60px thumbnail
    item['image'] = response.css(
        '.product-image img::attr(src)')[0].extract().strip().replace(
        'S60', 'S320')
    item['title'] = response.css(
        '.product-title >h1>a::text')[0].extract().strip()
    # Fixed regex: the original non-raw "([0-9].[0-9]) out of" left the
    # dot unescaped, so it matched ANY character between the digits
    # (e.g. "4x5 out of"); escape it to match only a literal decimal point.
    item['star'] = re.findall(r'([0-9]\.[0-9]) out of', info)[0]
    # total review count, e.g. "1,234" -> normalized number
    reviews_text = response.css(
        '.AverageCustomerReviews .totalReviewCount::text')[0].extract().strip()
    item['reviews'] = Helper.get_num_split_comma(reviews_text)
    item['seller_price'] = 0
    # the original set amazon_price = 0 and immediately overwrote it;
    # assign the scraped price directly
    item['amazon_price'] = response.css(
        '.arp-price::text')[0].extract().strip().lstrip('$')
    return item
def parse(self, response):
    """Record the search-result rank of each tracked ASIN for a keyword.

    For every item carried in the request meta, scan the result list
    (``li[@data-asin]``) for the item's ASIN; on a hit, append its
    1-based rank to ``self.store_poll`` and stamp ``self.store_date``.
    When the result page is empty, flag the keyword id as ``'none'`` in
    ``self.found`` and log the response for debugging.

    Changes vs. original: removed commented-out dead code; replaced the
    ``key in dict.keys()`` append-or-create branch with ``setdefault``.
    """
    result_li = response.xpath('//li[@data-asin]')
    for item in response.meta['items']:
        if len(result_li) == 0:
            # empty result page: remember the miss and keep the body for debugging
            self.found[item['id']] = 'none'
            logging.warning(
                "[keyword none] url: [%s] skwd_id:[%s] asin:[%s] \r\n body: %s"
                % (response.url, item['id'], item['asin'], response.body))
        else:
            for result in result_li:
                data_asin = result.xpath('./@data-asin').extract()[0]
                if data_asin == item['asin']:
                    self.found[item['id']] = True
                    # result ids look like "<prefix>_<index>"; the index is
                    # 0-based, so the displayed rank is index + 1
                    data_id = result.xpath('./@id').extract()[0]
                    rank = int(data_id.split('_')[1]) + 1
                    # group ranks per keyword id
                    self.store_poll.setdefault(item['id'], []).append(rank)
                    self.store_date[item['id']] = Helper.get_now_date()
                    break
def get_detail(self, response):
    """Print the normalized review total scraped from the page
    (debug/inspection helper; emits nothing)."""
    count_nodes = response.css(
        '.AverageCustomerReviews .totalReviewCount::text').extract()
    print(Helper.get_num_split_comma(count_nodes[0]))