def start_reviews(self, site_response, product, filter_other_sources=True, sort=None, dir=None, extra_review_parser=None):
    """Build and return the first review-page Request for *product*.

    Sets Host/Referer on the shared session headers, formats the review
    URL from the Bazaarvoice subdomain/locale, and stashes the product and
    parsing options on the request meta for parse_reviews.
    """
    # The review endpoint lives on the BV subdomain, not the site itself.
    self.headers['Host'] = self.bv_subdomain
    self.headers['Referer'] = site_response.url

    review_url = self.FORMAT_URL.format(
        bv_subdomain=self.bv_subdomain,
        bv_site_locale=self.bv_site_locale,
        source_internal_id=product['source_internal_id'])

    # Optional ordering; a direction is only meaningful with a sort key.
    if sort:
        review_url = set_query_parameter(review_url, 'sort', sort)
        if dir:
            review_url = set_query_parameter(review_url, 'dir', dir)

    request = Request(review_url,
                      callback=self.parse_reviews,
                      headers=self.headers)
    request.meta['product'] = product
    request.meta['filter_other_sources'] = filter_other_sources
    request.meta['extra_review_parser'] = extra_review_parser
    return request
def parse_cat_filters(self, response):
    """Yield the CategoryItem for this page, then one product-listing
    request per manufacturer filter value found in the filter-spec JSON.

    Fix: ``options`` / ``filter_key`` are initialised to ``None`` so a
    Manufacturer spec carrying neither 'options' nor 'tree' raises the
    intended Exception instead of a NameError.
    """
    category_name = response.meta['category_name']
    base_url = get_base_url(response.url)

    category = CategoryItem()
    category["category_path"] = category_name
    category["category_leaf"] = category_name
    category["category_url"] = base_url
    yield category

    response_json = json.loads(response.body_as_unicode())
    specs = response_json['specs']
    for spec in specs:
        # Only the Manufacturer spec carries the filter values we need.
        if spec['f_name'] == 'Manufacturer':
            db_name = spec['db_name']
        else:
            continue
        options = None
        filter_key = None
        if 'options' in spec:
            options = spec['options']
            filter_key = 'id'
        if 'tree' in spec:
            # Tree-shaped specs override the flat options list.
            options = spec['tree']
            filter_key = 'key'
        if not options or not filter_key:
            raise Exception("Cannot find all manufacturer values in %s" %
                            json.dumps(spec))
        for option in options:
            manufacturer = option['title']
            filter_value = option[filter_key]
            products_url = self.products_url[category_name]
            products_url = set_query_parameter(products_url, '_fil[0][field]', db_name)
            products_url = set_query_parameter(products_url, '_fil[0][operator]', '=')
            products_url = set_query_parameter(products_url, '_fil[0][value]', filter_value)
            _headers = self.get_headers(response.url)
            request = Request(products_url,
                              self.parse_product,
                              headers=_headers,
                              meta={'dont_merge_cookies': True,
                                    'dont_redirect': True},
                              cookies={})
            request.meta['category'] = category
            request.meta['manufacturer'] = manufacturer
            yield request
    return
def parse_reviews(self, response):
    """Yield user reviews parsed from page microdata, then the next-page
    request; pagination stops once a review is not newer than the latest
    review already stored in the DB (incremental scraping).

    Fixes: PEP 8 membership test (`not in`), and the EAN id item is only
    yielded when present instead of yielding None into the pipeline.
    """
    reviews_xpath = "//li[@itemprop='review']"
    pros_xpath = ".//li[contains(@class, 'review-pros-and-cons__attribute--pro')]//text()"
    cons_xpath = ".//li[contains(@class, 'review-pros-and-cons__attribute--con')]//text()"
    product = response.meta['product']
    if 'last_date_db' not in response.meta:
        # First review page for this product: emit the product/id items and
        # look up the newest stored review date once.
        bol_id = response.meta['bol_id']
        ean = response.meta.get('ean', None)
        yield product
        yield bol_id
        if ean is not None:
            yield ean
        last_review_in_db = get_latest_user_review_date(
            self.mysql_manager,
            self.spider_conf['source_id'],
            bol_id["ID_kind"],
            bol_id["ID_value"])
    else:
        last_review_in_db = response.meta['last_date_db']
    review_items = get_review_items_from_microdata(
        self, 'USER', response, product, reviews_xpath, pros_xpath, cons_xpath)
    if not review_items:
        return
    for review in review_items:
        yield review
        # Incremental scraping: stop as soon as a review is not newer than
        # what is already in the DB.
        date = review['TestDateText']
        last_date_in_page = dateparser.parse(date, ["%Y-%m-%d"])
        if last_review_in_db > last_date_in_page:
            return
    offset = get_query_parameter(response.url, 'offset')
    if not offset:
        offset = self.default_offset
    offset = int(offset) + self.limit
    next_page_url = set_query_parameter(response.url, 'offset', offset)
    next_page_url = set_query_parameter(next_page_url, 'limit', self.limit)
    request = Request(next_page_url, callback=self.parse_reviews)
    request.meta['use_proxy'] = True
    request.meta['last_date_db'] = last_review_in_db
    request.meta['product'] = product
    yield request
def parse_product(self, response):
    """Build a ProductItem from the product page, then kick off review
    scraping for it.

    Fix: the return value of set_query_parameter() was discarded, so the
    'ReviewOrdering' parameter never reached the review URL (strings are
    immutable); the result is now assigned back. Regex is a raw string.
    """
    # source_internal_id is the token between the last '-' and '.html'.
    # NOTE(review): the '.' before 'html' is unescaped and matches any
    # character — kept as-is to preserve current matching behaviour.
    sii_re = r'-([^\-]+).html'
    product = ProductItem()
    product['TestUrl'] = response.url.split('#')[0]
    product['OriginalCategoryName'] = response.meta['category'][
        'category_path']
    product['ProductName'] = self.extract(response.xpath('//h1/text()'))
    product['PicURL'] = self.extract(
        response.xpath('//a[@itemprop="image"]/@href'))
    product['ProductManufacturer'] = self.extract(
        response.xpath('//span[@itemprop="brand"]/a/span/text()'))
    match = re.search(sii_re, response.url)
    if not match:
        return
    source_internal_id = match.group(1)
    product['source_internal_id'] = source_internal_id
    yield product

    review_xpath = "//ul[@class='pagNum']/@data-action"
    total_page_xpath = ("//ul[@class='pagNum']/li[@class='next']"
                        "/preceding-sibling::li[1]/text()")
    review_url = self.extract_xpath(response, review_xpath)
    total_pages = self.extract_xpath(response, total_page_xpath)
    if not total_pages:
        # Single (unpaginated) review page.
        total_pages = 1
    latest_db_date = get_latest_user_review_date_by_sii(
        self.mysql_manager,
        self.spider_conf["source_id"],
        source_internal_id)
    if review_url:
        # BUG FIX: assign the rewritten URL; the previous bare call had no
        # effect. ReviewOrdering=2 requests newest-first ordering.
        review_url = set_query_parameter(review_url, 'ReviewOrdering', '2')
        review_url = get_full_url(response, review_url)
        request = Request(url=review_url, callback=self.parse_reviews)
        request.meta['product'] = product
        request.meta['current_page'] = 1
        # total_pages is always set (defaulted to 1 above).
        request.meta['total_pages'] = total_pages
        request.meta['latest_db_date'] = latest_db_date
        yield request
def parse_category(self, response):
    """Walk one level of the category-API tree.

    For each child category, append its name to the accumulated
    category_path and recurse with another parse_category request.
    NOTE(review): the leaf-category branch (else) of this method continues
    past this span in the original file.
    """
    body = json.loads(response.body_as_unicode())
    category_path = response.meta.get('category_path', [])
    children_categories = body.get('categories', [])
    if children_categories:
        for _category in children_categories:
            # Each child gets its own copy of the path accumulated so far.
            _path = deepcopy(category_path)
            try:
                # API is inconsistent about the key casing for the name.
                if 'name' in _category:
                    _path.append(_category['name'])
                elif 'Name' in _category:
                    _path.append(_category['Name'])
            except Exception, e:
                # Dump the offending payload before re-raising.
                print _category
                raise e
            category_url = set_query_parameter(self.category_url, 'apikey',
                                               self.open_api_key)
            category_url = set_query_parameter(category_url, 'ids',
                                               _category['id'])
            request = Request(category_url, self.parse_category)
            request.meta['category_path'] = _path
            yield request
def parse_reviews(self, response):
    """Yield one ReviewItem per review block on the page, then request the
    next page unless the page count or the incremental-scrape cutoff says
    to stop.

    Fix: removed a leftover debug ``print`` of the next page URL.
    """
    reviews = response.xpath('//div[contains(@class,"detRating")]')
    product = response.meta['product']
    date_xpath = './/div[@class="date"]/@content'
    rating_xpath = './/div[@class="rat"]/span[1]/text()'
    title_xpath = './/div[@class="title"]//text()'
    summary_xpath = './/div[@class="comm"]//text()'
    date = None
    for review in reviews:
        date = self.extract_xpath(review, date_xpath)
        rating = self.extract_xpath(review, rating_xpath)
        title = self.extract_xpath(review, title_xpath)
        summary = self.extract_all_xpath(review, summary_xpath)
        user_review = ReviewItem.from_product(
            product=product, tp='USER', date=date, rating=rating,
            title=title, summary=summary)
        yield user_review

    current_page = response.meta['current_page']
    total_pages = response.meta['total_pages']
    latest_db_date = response.meta['latest_db_date']
    # No reviews on the page -> nothing further to paginate.
    if not date:
        return
    # 'date' now holds the date of the last review seen on this page.
    latest_date_page = dateparser.parse(date, ["%Y-%m-%d"])
    if not total_pages:
        return
    if current_page == total_pages:
        return
    # Incremental scraping: stop once this page is older than the DB state.
    if latest_db_date:
        if latest_db_date > latest_date_page:
            return
    next_page = current_page + 1
    next_page_url = set_query_parameter(response.url, 'CurrentPage', next_page)
    request = Request(url=next_page_url, callback=self.parse_reviews)
    request.meta['product'] = product
    request.meta['current_page'] = next_page
    request.meta['total_pages'] = total_pages
    request.meta['latest_db_date'] = latest_db_date
    yield request
def parse(self, response):
    """Yield one parse_review request per article on the listing page,
    then a request for the next listing page when available.

    Fix: the 'page' meta used to be assigned to the last review request
    created in the loop (and raised NameError when the page had no
    reviews) — it is now set on the next-page request it belongs to.
    """
    page = response.meta.get('page', 1)
    review_selectors = response.xpath(
        "//div[@id='stream']//div[contains(@class,'article_box_wrap')]")
    review_url_xpath = "./a/@href"
    for review_selector in review_selectors:
        review_url = self.extract_all(review_selector.xpath(review_url_xpath))
        review_url = get_full_url(response, review_url)
        if review_url:
            yield Request(review_url, callback=self.parse_review)
    if self.continue_to_next_page(response):
        next_page = page + 1
        next_page_url = set_query_parameter(response.url, 'page', next_page)
        if next_page_url:
            request = Request(next_page_url, callback=self.parse)
            # BUG FIX: set the page counter on the new request.
            request.meta['page'] = next_page
            yield request
def parse_products(self, response):
    """Emit a ProductItem (plus bolcom/EAN id items) per product in the
    listing API response, chain a review request per product, then request
    the next listing page.

    Fix: ``ean`` was unbound on the first EAN-less product (NameError) and
    leaked across loop iterations (a product without an EAN inherited the
    previous product's EAN in its request meta); it is now reset per
    product.
    """
    category = response.meta['category']
    body = json.loads(response.body_as_unicode())
    if body["totalResultSize"] == 0:
        return
    products = body.get('products', [])
    for raw_product in products:
        url = ''
        pic_url = ''
        # Prefer the DESKTOP url / size-L image; otherwise the last entry
        # scanned wins.
        for _url in raw_product['urls']:
            url = _url['value']
            if _url['key'] == "DESKTOP":
                break
        for _image in raw_product['images']:
            pic_url = _image['url']
            if _image['key'] == "L":
                break
        product_name = raw_product['title']
        source_internal_id = raw_product['id']
        manufacturer = raw_product.get(
            'specsTag', None)  # specTags == manufacturer? YES! For reasons...
        ean_value = raw_product['ean']
        product = ProductItem.from_response(
            response,
            category=category,
            product_name=product_name,
            source_internal_id=source_internal_id,
            url=url,
            manufacturer=manufacturer,
            pic_url=pic_url)
        bol_id = self.product_id(product,
                                 kind='bolcom_id',
                                 value=source_internal_id)
        # BUG FIX: reset per iteration so a missing EAN does not reuse the
        # previous product's id item (or raise NameError on the first one).
        ean = None
        if ean_value:
            ean = self.product_id(product, kind='EAN', value=ean_value)
        # Go to the review page for this product.
        review_url = self.review_url % source_internal_id
        request = Request(review_url, callback=self.parse_reviews)
        request.meta['use_proxy'] = True
        request.meta['product'] = product
        request.meta['bol_id'] = bol_id
        if ean:
            request.meta['ean'] = ean
        yield request
    # Go to the "next" page of the listing.
    offset = get_query_parameter(response.url, 'offset')
    offset = int(offset) + self.limit
    if offset > body["totalResultSize"]:
        return
    next_page_url = set_query_parameter(response.url, 'offset', offset)
    request = Request(next_page_url, callback=self.parse_products)
    request.meta['category'] = category
    yield request
def start_requests(self):
    """Seed the crawl with the root category listing (API key attached)."""
    seed_url = set_query_parameter(self.category_url, 'apikey',
                                   self.open_api_key)
    return [Request(seed_url, self.parse_category)]
# NOTE(review): fragment — this span is the tail of parse_category (the
# last argument of a call split across chunks, the end of the recursion
# branch, then the leaf-category else branch) plus the opening lines of
# parse_products; tokens preserved as-is.
                                               self.open_api_key)
            category_url = set_query_parameter(category_url, 'ids',
                                               _category['id'])
            request = Request(category_url, self.parse_category)
            request.meta['category_path'] = _path
            yield request
    else:
        # Leaf category: emit the CategoryItem, then request its products.
        category = CategoryItem()
        category['category_leaf'] = body['originalRequest']['category'][
            'name']
        category['category_string'] = body['originalRequest']['category'][
            'id']
        category['category_path'] = ' | '.join(category_path)
        yield category
        product_url = set_query_parameter(self.product_url, 'apikey',
                                          self.open_api_key)
        product_url = set_query_parameter(product_url, 'ids',
                                          category['category_string'])
        product_url = set_query_parameter(product_url, 'limit', self.limit)
        product_url = set_query_parameter(product_url, 'offset', 0)
        # Skip categories the spider is configured to ignore.
        if not self.should_skip_category(category):
            request = Request(product_url, callback=self.parse_products)
            request.meta['category'] = category
            yield request

def parse_products(self, response):
    # Parse one page of the product-listing API response.
    category = response.meta['category']
    body = json.loads(response.body_as_unicode())
    if body["totalResultSize"] == 0:
        return
def parse_product(self, response):
    """Parse a product page via microdata/extruct: yield the product, its
    id items, and the first review-page request (with paging metadata when
    further review pages exist).

    Fixes: the EAN xpath result is validated before int() (previously
    int('') raised ValueError on pages without an EAN link), and the
    review request is only built when a first-page review URL was actually
    extracted (Request('') raises ValueError).
    """
    category = response.meta['category']
    items = extruct_helper.get_microdata_extruct_items(
        response.body_as_unicode())
    ean_xpath = '//a[@data-ean]/@data-ean'
    brand_alt_xpath = "//meta[@property='product:brand']/@content"
    product = list(
        extruct_helper.get_products_microdata_extruct(
            items, response, category))
    if len(product) != 1:
        # Zero or ambiguous microdata products: retry the page.
        yield self._retry(response.request)
        return
    product_dict = product[0]
    product = product_dict['product']
    if not product['ProductManufacturer']:
        # Fall back to the og-style brand meta tag.
        product['ProductManufacturer'] = self.extract_xpath(
            response, brand_alt_xpath)
    yield product
    for product_id in product_dict['product_ids']:
        yield product_id
    # BUG FIX: guard before int() — the xpath yields ''/None when the EAN
    # link is absent.
    raw_ean = self.extract_xpath(response, ean_xpath)
    if raw_ean:
        yield self.product_id(product, kind='EAN', value=int(raw_ean))

    first_page_review_xpath = "//ul[contains(@class, 'js-product-reviews-first')]/@data-href"
    next_page_review_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-href"
    reviews_per_page_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-per-page"
    total_reviews_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-all"
    initial_index_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-current-index"
    paging_parameter_xpath = "//ul[contains(@class, 'js-product-reviews-remaining')]/@data-paged-url-param"
    first_page_review_url = self.extract_xpath(response,
                                               first_page_review_xpath)
    # BUG FIX: no review widget on the page -> nothing to request
    # (previously fell through and built a Request with an empty URL).
    if not first_page_review_url:
        return
    first_page_review_url = get_full_url(response, first_page_review_url)
    first_page_review_url = set_query_parameter(
        first_page_review_url, 'sorting', 'LATEST')
    next_page_review_url = self.extract_xpath(response,
                                              next_page_review_xpath)
    paging_meta = {}
    if next_page_review_url:
        # More than one review page: carry paging state plus the
        # incremental-scrape cutoff date from the DB.
        last_review_db = get_latest_user_review_date_by_sii(
            self.mysql_manager,
            self.spider_conf['source_id'],
            product['source_internal_id'])
        next_page_review_url = get_full_url(response, next_page_review_url)
        next_page_review_url = set_query_parameter(
            next_page_review_url, 'sorting', 'LATEST')
        reviews_per_page = self.extract_xpath(response,
                                              reviews_per_page_xpath)
        total_reviews = self.extract_xpath(response, total_reviews_xpath)
        current_index = self.extract_xpath(response, initial_index_xpath)
        paging_parameter = self.extract_xpath(response,
                                              paging_parameter_xpath)
        paging_meta = {
            'next_page_review_url': next_page_review_url,
            'reviews_per_page': int(reviews_per_page),
            'total_reviews': int(total_reviews),
            'current_index': int(current_index),
            'paging_parameter': paging_parameter,
            'last_review_db': last_review_db,
        }
    meta = {'product': product}
    meta.update(paging_meta)
    # Reviews are served as an XHR fragment; mimic the browser request.
    headers = {
        'Referer': response.url,
        'X-Requested-With': 'XMLHttpRequest',
    }
    yield Request(first_page_review_url,
                  meta=meta,
                  headers=headers,
                  callback=self.parse_reviews)
def parse_reviews(self, response):
    """Yield ReviewItems from an XHR review-page fragment and chain the
    next-page request while new (not-yet-stored) reviews keep appearing.

    Fixes: pages without reviews no longer raise NameError ('date' was
    referenced after the loop without being initialised); regex patterns
    are raw strings.
    """
    product = response.meta['product']
    summary_xpath = ".//article/text()"
    rating_xpath = ".//meta[@itemprop='rating']/@content"
    title_xpath = ".//meta[@itemprop='summary']/@content"
    date_xpath = ".//meta[@itemprop='dtreviewed']/@content"
    author_xpath = ".//meta[@itemprop='reviewer']/@content"
    pros_xpath = ".//div[contains(@class, 'review-features') and " \
                 "contains(@class, 'review-pros')]/text()"
    cons_xpath = ".//div[contains(@class, 'review-features') and " \
                 "contains(@class, 'review-cons')]/text()"
    review_selectors = response.xpath('//li')
    date = None
    for review_selector in review_selectors:
        rating = self.extract_xpath(review_selector, rating_xpath)
        title = self.extract_xpath(review_selector, title_xpath)
        date = self.extract_xpath(review_selector, date_xpath)
        author = self.extract_xpath(review_selector, author_xpath)
        summary = self.extract_all_xpath(review_selector, summary_xpath)
        pros = self.extract_all_xpath(review_selector, pros_xpath)
        cons = self.extract_all_xpath(review_selector, cons_xpath)
        # Collapse whitespace runs in the pros/cons text blobs.
        pros = re.sub(r"\s+", ' ', pros)
        cons = re.sub(r"\s+", ' ', cons)
        review = ReviewItem.from_product(
            product=product, title=title, rating=rating, tp='USER',
            scale=5, date=date, summary=summary, pros=pros, cons=cons,
            author=author)
        yield review
    # BUG FIX: empty page -> no date -> nothing further to paginate.
    if not date:
        return
    # 'date' now holds the date of the last review seen on this page.
    last_date_in_page = dateparser.parse(date, ["%Y-%m-%d"])
    next_page_url = response.meta.get('next_page_review_url', None)
    if not next_page_url:
        return
    paging_parameter = response.meta['paging_parameter']
    current_index = response.meta['current_index']
    reviews_per_page = response.meta['reviews_per_page']
    total_reviews = response.meta['total_reviews']
    last_review_db = response.meta['last_review_db']
    if current_index >= total_reviews:
        # We reached the end.
        return
    if last_review_db > last_date_in_page:
        # Reached the end of new data.
        return
    next_page_url = set_query_parameter(next_page_url, paging_parameter,
                                        current_index)
    # Re-use the XHR headers the first review request was made with.
    headers = {
        'Referer': response.request.headers['Referer'],
        'X-Requested-With': response.request.headers['X-Requested-With'],
    }
    meta = {
        'next_page_review_url': next_page_url,
        'reviews_per_page': reviews_per_page,
        'total_reviews': total_reviews,
        'current_index': current_index + reviews_per_page,
        'paging_parameter': paging_parameter,
        'last_review_db': last_review_db,
        'product': product,
    }
    yield Request(next_page_url,
                  meta=meta,
                  headers=headers,
                  callback=self.parse_reviews)
def parse_product(self, response):
    """Emit ProductItems (plus UPC/ASIN id items) from a Graphiq-style
    JSON listing response, then request the next result page.

    Fixes: header-column indices are tracked with ``None`` sentinels — the
    previous ''-as-missing convention broke whenever a field sat at column
    0 (0 is falsy, so the name/UPC/ASIN at index 0 were treated as
    missing). The page count now rounds up so the final partial page of
    results is not skipped.
    """
    # self._check_if_blocked(response)
    category = response.meta['category']
    manufacturer = response.meta['manufacturer']
    base_url = get_base_url(response.url)
    json_response = json.loads(response.body_as_unicode())
    data = json_response["data"]
    image_url_format = "https://s3.graphiq.com/sites/default/files" \
                       "/%s/media/images/%s"
    product_url_format = "%s/l/%s/%s"
    # Any of these header columns may carry the product name.
    product_name_fields = [
        "manufacturer_model", "manfacture_model", "company_product", "mm"
    ]
    # BUG FIX: use None (not '') as the "missing index" sentinel so a
    # column at index 0 is not mistaken for missing.
    name_index = None
    amazon_asin_index = None
    upc_index = None
    app_id = data["app_id"]
    page = data["page"]
    results = data['recs']
    head = data["head"]
    id_index = head.index("id")
    encoded_title_index = head.index("_encoded_title")
    for name_field in product_name_fields:
        try:
            name_index = head.index(name_field)
            break
        except ValueError:
            continue
    if name_index is None:
        raise Exception("Could not find product name in %s" % response.url)
    image_index = head.index("_i_1")
    if 'amazon_asin' in head:
        amazon_asin_index = head.index("amazon_asin")
    if 'UPC' in head:
        upc_index = head.index("UPC")
    elif 'upc' in head:
        upc_index = head.index("upc")
    for product_data in data["data"]:
        image_name = ""
        if len(product_data[image_index]) > 1:
            image_name = product_data[image_index][0]
        product = ProductItem()
        product['OriginalCategoryName'] = category['category_path']
        if image_name:
            product['PicURL'] = image_url_format % (app_id, image_name)
        product['TestUrl'] = product_url_format % (
            base_url, product_data[id_index],
            product_data[encoded_title_index])
        product['ProductManufacturer'] = manufacturer
        # name_index is guaranteed set here (exception raised otherwise).
        product['ProductName'] = product_data[name_index]
        yield product
        if upc_index is not None and product_data[upc_index]:
            upc = ProductIdItem()
            upc['ProductName'] = product['ProductName']
            upc['ID_kind'] = "UPC"
            upc['ID_value'] = product_data[upc_index]
            yield upc
        if amazon_asin_index is not None and product_data[amazon_asin_index]:
            asin = ProductIdItem()
            asin['ProductName'] = product['ProductName']
            asin['ID_kind'] = "ASIN"
            asin['ID_value'] = product_data[amazon_asin_index]
            yield asin
    # BUG FIX: round up (100 results per page) so a trailing partial page
    # is still fetched; the old floor division dropped it.
    number_of_pages = (int(results) + 99) // 100
    if page < number_of_pages:
        next_page_url = set_query_parameter(response.url, 'page', page + 1)
        _headers = self.get_headers(response.url)
        request = Request(next_page_url,
                          self.parse_product,
                          headers=_headers,
                          cookies={},
                          meta={'dont_merge_cookies': True,
                                'dont_redirect': True})
        request.meta['manufacturer'] = manufacturer
        request.meta['category'] = category
        yield request