def __init__(self, *args, **kwargs):
    """Configure the spider for one ASIN and load its incremental state.

    Keyword arguments:
        asin: required; identifier of the product whose reviews are scraped.
        send_mq: optional; if present and truthy, a message will be sent to
            'load' so that it fetches the scraped reviews. Defaults to 0.

    Raises:
        KeyError: if ``asin`` is not passed as a keyword argument.
    """
    # super() already binds the instance. Passing ``self`` explicitly as well
    # would push it into the parent's first positional parameter.
    super(AmazonReviewsSpider, self).__init__(*args, **kwargs)

    self.asin = kwargs['asin']
    # If the send_mq argument is present and does not evaluate to False, a
    # message will be sent to 'load' to fetch the scraped reviews.
    self.send_mq = kwargs.get('send_mq', 0)

    # NOTE(review): mysql_manager, spider_conf, amazon_kind and project_conf
    # are not set in this method; presumably they come from the parent
    # initializer or class attributes - confirm.
    source_id = self.spider_conf['source_id']
    self.last_review_in_db = get_latest_user_review_date(
        self.mysql_manager, source_id, self.amazon_kind, self.asin)
    self.incremental = get_incremental(
        self.mysql_manager, source_id, self.amazon_kind, self.asin)
    self.update_incremental_kind = self.project_conf.getboolean(
        "OUTPUT", "update_incremental_kind")
def parse_reviews(self, response):
    """Yield the user reviews of one page and request the next page.

    Reads from ``response.meta``:
        product: the product item the reviews belong to (required).
        last_date_db: latest review date already stored; only present on
            follow-up pages requested by this method.
        bol_id / ean: id items, read on the first page only (``ean`` is
            optional).

    On the first page the product and its id items are yielded and the
    latest stored review date is looked up. Paging stops when a page has no
    reviews, or when the stored date is newer than the last review on the
    page.
    """
    reviews_xpath = "//li[@itemprop='review']"
    pros_xpath = ".//li[contains(@class, 'review-pros-and-cons__attribute--pro')]//text()"
    cons_xpath = ".//li[contains(@class, 'review-pros-and-cons__attribute--con')]//text()"

    product = response.meta['product']

    if 'last_date_db' not in response.meta:
        # First page for this product: emit the product-level items once.
        bol_id = response.meta['bol_id']
        ean = response.meta.get('ean')
        yield product
        yield bol_id
        if ean is not None:
            yield ean
        last_review_in_db = get_latest_user_review_date(
            self.mysql_manager, self.spider_conf['source_id'],
            bol_id["ID_kind"], bol_id["ID_value"])
    else:
        last_review_in_db = response.meta['last_date_db']

    review_items = get_review_items_from_microdata(
        self, 'USER', response, product,
        reviews_xpath, pros_xpath, cons_xpath)
    if not review_items:
        return

    for review in review_items:
        yield review

    # Incremental scraping: ``review`` is now the last review of the page.
    # Both dates may be missing (nothing stored yet / unparsable text), in
    # which case the comparison is skipped instead of raising TypeError.
    last_date_in_page = dateparser.parse(review['TestDateText'],
                                         date_formats=["%Y-%m-%d"])
    if (last_review_in_db and last_date_in_page
            and last_review_in_db > last_date_in_page):
        return

    offset = get_query_parameter(response.url, 'offset')
    if not offset:
        offset = self.default_offset
    offset = int(offset) + self.limit

    next_page_url = set_query_parameter(response.url, 'offset', offset)
    next_page_url = set_query_parameter(next_page_url, 'limit', self.limit)

    request = Request(next_page_url, callback=self.parse_reviews)
    request.meta['use_proxy'] = True
    request.meta['last_date_db'] = last_review_in_db
    request.meta['product'] = product
    yield request
def parse_product(self, response):
    """Yield the product, its MPN id item (if any) and the reviews request.

    Expects ``response.meta['category']`` to be a mapping with a
    ``category_path`` entry. The product and optional id item are stored
    back into ``response.meta``, which is passed on to the reviews request.
    """
    category = response.meta['category']

    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = category['category_path']
    name = self.extract(response.xpath('//h1[@class="product_title"]/text()'))
    product['PicURL'] = self.extract(response.xpath('//img[@id="mainImage"]/@src'))
    product['source_internal_id'] = self.extract(
        response.xpath('//span[@id="product_internet_number"]/text()'))
    manu = self.extract(response.xpath('//span[@itemprop="brand"]/text()'))
    mpn = self.extract(response.xpath('//span[@itemprop="model"]/text()'))

    # Only prefix the name when a manufacturer was actually extracted;
    # with a manufacturer, "<brand> <model>" wins over "<brand> <title>".
    if manu:
        product["ProductManufacturer"] = manu
        name = manu + ' ' + (mpn if mpn else name)
    product['ProductName'] = name
    yield product

    productid = None
    if mpn:
        productid = ProductIdItem()
        productid['ProductName'] = product["ProductName"]
        productid['source_internal_id'] = product['source_internal_id']
        productid['ID_kind'] = "MPN"
        productid['ID_value'] = mpn
        yield productid

    response.meta['product'] = product
    # Without an MPN there is no id to look up, so the meta entries for the
    # id and the latest stored review date are simply left out.
    if productid is not None:
        response.meta['product_id'] = productid
        last_user_review = incremental_utils.get_latest_user_review_date(
            self.mysql_manager, self.spider_conf['source_id'],
            productid["ID_kind"], productid['ID_value'])
        response.meta['last_user_review'] = last_user_review

    yield Request(
        'http://homedepot.ugc.bazaarvoice.com/1999aa/{0}/reviews.djs?format=embeddedhtml&page=1&sort=submissionTime'.format(
            product["source_internal_id"]),
        callback=self.parse_reviews,
        meta=response.meta,
        errback=self.errback)
def start_requests(self):
    """Yield one reviews request per ASIN in ``self.asins``.

    Each request carries the ASIN and the latest review date returned by
    ``get_latest_user_review_date`` for it in its ``meta``.
    """
    sql = (
        "select pi.id_value from review.product_id pi "
        "join review.products p on pi.prdid = p.id "
        "join mamboinput.alascore a on p.al_id = a.al_id "
        "where pi.kind = 7 and p.source_id = %s "
        "and TIMESTAMPDIFF(MONTH, p.updatetime,now()) < 4 "
        "order by a.alascore desc, a.rank"
    )
    # NOTE(review): the result of this select is not used here, and
    # ``self.asins`` is populated somewhere outside this method - confirm
    # whether the query result was meant to feed the loop below.
    self.mysql_manager.execute_select(sql, self.spider_conf['source_id'])

    for asin in self.asins:
        reviews_request = Request(url=self.start_url_format % asin,
                                  callback=self.parse_reviews)
        meta = reviews_request.meta
        meta['asin'] = asin
        meta['last_review_in_db'] = get_latest_user_review_date(
            self.mysql_manager,
            self.spider_conf['source_id'],
            self.amazon_kind,
            asin)
        yield reviews_request
def parse_product(self, response):
    """Yield the product, its MPN id item and the first reviews request.

    Expects ``response.meta['category']`` to be a mapping with a
    ``category_path`` entry. The product, the id item, the latest stored
    review date and ``incremental=True`` are written into ``response.meta``
    and handed to the reviews request.
    """
    category = response.meta['category']
    xpaths_by_field = {
        "ProductName": "//*[contains(@class,'pdp-prod-name')]//text()",
        "PicURL": "//img[@class='primary-image']/@src",
        "source_internal_id": "//*[@itemprop='productID']/text()",
    }
    fallback_pic_xpath = "//img[@class='primary-image']/@data-src"

    item = self.init_item_by_xpaths(response, "product", xpaths_by_field)
    item['OriginalCategoryName'] = category['category_path']
    item["ProductManufacturer"] = "Panasonic"
    item["TestUrl"] = response.url
    # Fall back to the alternative image attribute when @src gave nothing.
    if not item["PicURL"]:
        item["PicURL"] = self.extract(response.xpath(fallback_pic_xpath))
    yield item

    # The site's internal id doubles as the MPN value.
    ident = ProductIdItem()
    ident['source_internal_id'] = item["source_internal_id"]
    ident['ProductName'] = item["ProductName"]
    ident['ID_kind'] = "MPN"
    ident['ID_value'] = item["source_internal_id"]
    yield ident

    meta = response.meta
    meta['product'] = item
    meta['product_id'] = ident
    meta['last_user_review'] = incremental_utils.get_latest_user_review_date(
        self.mysql_manager,
        self.spider_conf['source_id'],
        ident["ID_kind"],
        ident['ID_value'])
    meta['incremental'] = True

    reviews_url = 'http://panasonic.reviews.bazaarvoice.com/9203-en_us/{0}/reviews.djs?format=embeddedhtml&page=1&sort=submissionTime'.format(
        item["source_internal_id"])
    yield Request(reviews_url, callback=self.parse_reviews, meta=response.meta)
def parse_product(self, response):
    """Yield the product, its MPN/UPC id items and its new user reviews.

    The latest stored review date is looked up with the last id found (UPC
    if present, otherwise MPN). Reviews on the page are yielded only when
    their date is newer than that stored date. When no id was found or no
    date is stored, every review with a parsable date is yielded.
    """
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = self.extract_all(
        response.xpath('//div[@class="product-breadcrumbs"]//li//text()'),
        ' > ')
    product['ProductName'] = self.extract(
        response.xpath('//h1[@id]/text()'))
    product['source_internal_id'] = self.extract(
        response.xpath('//th[contains(text(),"SKU")]/parent::tr/td/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//img[@id="productmain"]/@src'))
    product["ProductManufacturer"] = self.extract(
        response.xpath(
            '//th[contains(text(),"Manufacturer")]/parent::tr/td//text()'))
    yield product

    # Order matters: the last id found is the one used for the lookup.
    id_xpaths = (
        ("MPN", '//th[contains(text(),"Mfg")]/parent::tr/td/text()'),
        ("UPC", '//th[text()="UPC"]/parent::tr/td/text()'),
    )
    product_id = None
    for id_kind, id_xpath in id_xpaths:
        id_value = self.extract(response.xpath(id_xpath))
        if not id_value:
            continue
        product_id = ProductIdItem()
        product_id['source_internal_id'] = product["source_internal_id"]
        product_id['ProductName'] = product["ProductName"]
        product_id['ID_kind'] = id_kind
        product_id['ID_value'] = id_value
        yield product_id

    # Neither id may be present on the page; in that case there is nothing
    # to look up and subscripting None would raise TypeError.
    last_user_review = None
    if product_id is not None:
        last_user_review = incremental_utils.get_latest_user_review_date(
            self.mysql_manager, self.spider_conf['source_id'],
            product_id["ID_kind"], product_id['ID_value'])

    for review in response.xpath('//article[@id]'):
        # Keep only the date part of the creation attribute.
        date_raw = self.extract(
            review.xpath('.//@data-created')).split(' ')[0]
        date_formatted = date_format(date_raw, "%m/%d/%Y")
        if not date_formatted:
            continue
        date_parsed = dateparser.parse(date_formatted,
                                       date_formats=['%Y-%m-%d'])
        if not date_parsed:
            continue
        # Skip reviews that are not newer than what is already stored.
        if last_user_review and date_parsed <= last_user_review:
            continue

        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        user_review['TestDateText'] = date_formatted
        user_review['SourceTestRating'] = self.extract(
            review.xpath('./@class')).strip('s')
        user_review['Author'] = self.extract(
            review.xpath('.//div/span/em/text()'))
        user_review['TestTitle'] = self.extract(
            review.xpath('.//h4/text()'))
        user_review['TestSummary'] = self.extract_all(
            review.xpath('.//p[@id]/text()'))
        user_review['source_internal_id'] = product['source_internal_id']
        yield user_review
def parse_reviews(self, response):
    """Yield the reviews of one BazaarVoice page and request the next page.

    Reads from ``response.meta`` (all optional):
        product / product_id: items already built by the caller.
        brand: passed to ``self._parse_product`` when the product is parsed
            here.
        parse_product: per-request override of ``self.parse_BV_product``.
        last_user_review: latest stored review date; looked up from
            ``product_id`` when absent.
        incremental: when true (the default), stop as soon as a review
            older than ``last_user_review`` is seen.

    Raises:
        Exception: if product parsing is enabled but ``self.default_kind``
            is not set.
    """
    product = response.meta.get('product', None)
    product_id = response.meta.get('product_id', None)
    brand = response.meta.get('brand', None)
    request_parse_product = response.meta.get('parse_product', None)

    parse_product = self.parse_BV_product
    if request_parse_product is not None:
        parse_product = request_parse_product

    if parse_product and not self.default_kind:
        raise Exception(
            "Parsing product from template but kind not defined")

    if not product and parse_product:
        product = self._parse_product(response, brand=brand)
        product_id = self.product_id(product)
        product_id["ID_kind"] = self.default_kind
        product_id['ID_value'] = product['source_internal_id']
        response.meta['product'] = product
        yield product
        yield product_id

    next_page_xpath = '(//*[contains(@class,"BVRRNextPage")])[1]/a/@href'

    last_user_review = response.meta.get('last_user_review', None)
    incremental = response.meta.get('incremental', True)
    if not last_user_review and product_id:
        last_user_review = incremental_utils.get_latest_user_review_date(
            self.mysql_manager, self.spider_conf['source_id'],
            product_id["ID_kind"], product_id['ID_value'])

    review_list_xpath = '//*[contains(@class,"BVRRContentReview")]'
    from_another_product_xpath = ".//*[contains(@class,'BVDI_SU BVDI_SUAttribution')]"
    from_another_source_xpath = ".//*[contains(@class, 'BVRRSyndicatedContentAttribution')]"

    for review_selector in response.xpath(review_list_xpath):
        from_another_source = review_selector.xpath(
            from_another_source_xpath)
        from_another_product = review_selector.xpath(
            from_another_product_xpath)
        review = self.parse_review(response, review_selector)

        # Reviews carrying one of the attribution markers above are not
        # emitted, but their date still takes part in the cut-off check.
        if not from_another_product and not from_another_source:
            yield review

        if last_user_review and incremental:
            current_user_review = datetime.strptime(
                review['TestDateText'], '%Y-%m-%d')
            if current_user_review < last_user_review:
                return

    # No next-page link extracted: stop instead of building a URL from an
    # empty href.
    next_page_href = self.extract(response.xpath(next_page_xpath))
    if not next_page_href:
        return

    next_page_url = get_full_url(response.url, next_page_href)
    if next_page_url:
        request = Request(url=next_page_url, callback=self.parse_reviews)
        request.meta['last_user_review'] = last_user_review
        request.meta['product'] = product
        request.meta['product_id'] = product_id
        # Carry the caller's overrides along; otherwise every page after the
        # first silently falls back to the defaults.
        request.meta['incremental'] = incremental
        if request_parse_product is not None:
            request.meta['parse_product'] = request_parse_product
        yield request