def __init__(self, *args, **kwargs):
    """Initialize the Oriental Trading spider.

    Attaches the Bazaarvoice review helper before calling the base
    spider's __init__; ``called_class=self`` hands the helper a
    back-reference to this spider instance.
    """
    self.br = BuyerReviewsBazaarApi(called_class=self)
    # site_name is passed explicitly so the base class knows which of the
    # allowed domains this spider represents.
    super(OrientaltradingProductsSpider, self).__init__(
        site_name=self.allowed_domains[0], *args, **kwargs)
def __init__(self, *args, **kwargs):
    """Initialize the Home Depot spider.

    site_name is passed explicitly because this spider declares several
    allowed_domains and the base class needs to know which one to use.
    """
    # Fix/consistency: every other spider in this file constructs the
    # review helper with called_class=self so it can reach back into the
    # spider; this one omitted the argument.
    self.br = BuyerReviewsBazaarApi(called_class=self)
    super(HomedepotProductsSpider, self).__init__(
        site_name=self.allowed_domains[0], *args, **kwargs)
def __init__(self, *args, **kwargs):
    """Initialize the CVS spider.

    Sets up pagination state used by the AJAX search endpoint and
    enables Crawlera after the base class has initialized.
    """
    self.br = BuyerReviewsBazaarApi(called_class=self)
    self.referer = None               # first SERP url, captured in parse()
    self.first_time_products = None
    self.current_page = 1             # AJAX pagination cursor
    self.products_per_page = 20       # matches navNum=20 in SEARCH_URL_AJAX
    super(CvsProductsSpider, self).__init__(
        site_name=self.allowed_domains[0], *args, **kwargs)
    # NOTE(review): override applied after super().__init__ — presumably
    # deliberate so base-class setup cannot reset it; confirm.
    settings.overrides['CRAWLERA_ENABLED'] = True
def __init__(self, sort_mode=None, *args, **kwargs):
    """Initialize the Dell spider.

    Switches the scheduler to FIFO (breadth-first) queues before the
    base class starts, so requests are crawled in submission order.
    """
    from scrapy.conf import settings
    settings.overrides['DEPTH_PRIORITY'] = 1
    settings.overrides[
        'SCHEDULER_DISK_QUEUE'] = 'scrapy.squeue.PickleFifoDiskQueue'
    settings.overrides[
        'SCHEDULER_MEMORY_QUEUE'] = 'scrapy.squeue.FifoMemoryQueue'
    self.quantity = kwargs.get('quantity', 1000)  # default is 1000
    self.br = BuyerReviewsBazaarApi(called_class=self)
    super(DellProductSpider, self).__init__(
        site_name=self.allowed_domains[0], *args, **kwargs)
def __init__(self, *args, **kwargs):
    """Initialize the Newegg spider with its pagination bookkeeping."""
    self.br = BuyerReviewsBazaarApi(called_class=self)
    self.index = 1              # current result-page index
    self.error_pagin = 0        # count of pagination failures
    self.pages_pagin = []       # pages already visited during pagination
    self.count_pagin_page = 0
    self.count_pagin_links = 0
    # NOTE(review): unlike the sibling spiders, no site_name is forwarded
    # here — presumably the base class derives it; confirm.
    super(NeweggProductSpider, self).__init__(*args, **kwargs)
def __init__(self, search_sort='recommended', *args, **kwargs):
    """Initialize the Halfords spider.

    :param search_sort: key into self._SORT_MODES used to pre-fill the
        ``sort`` placeholder of the search URL (KeyError if unknown).
    """
    self.br = BuyerReviewsBazaarApi(called_class=self)
    super(HalfordsProductSpider, self).__init__(
        site_name=self.allowed_domains[0],
        url_formatter=FormatterWithDefaults(
            sort=self._SORT_MODES[search_sort]
        ),
        *args, **kwargs)
def __init__(self, search_sort='NEWEST', *args, **kwargs):
    """Initialize the House of Fraser spider.

    :param search_sort: key into self._SORT_MODES used to pre-fill the
        ``sort_mode`` placeholder of the search URL (KeyError if unknown).
    """
    self.br = BuyerReviewsBazaarApi(called_class=self)
    super(HouseoffraserProductSpider, self).__init__(
        site_name=self.allowed_domains[0],
        url_formatter=FormatterWithDefaults(
            sort_mode=self._SORT_MODES[search_sort]),
        *args, **kwargs)
def __init__(self, sort_mode=None, *args, **kwargs):
    """Initialize the Homebase spider.

    Unknown (or omitted) sort modes silently fall back to 'default'.
    """
    if sort_mode not in self.SORT_MODES:
        sort_mode = 'default'
    self.SORT = self.SORT_MODES[sort_mode]
    self.pages = dict()  # per-search-term page counters
    self.br = BuyerReviewsBazaarApi(called_class=self)
    super(HomebaseProductSpider, self).__init__(
        site_name=self.allowed_domains[0], *args, **kwargs)
def __init__(self, *args, **kwargs): self.br = BuyerReviewsBazaarApi(called_class=self) # officedepot seems to have a bot protection, so we first get the cookies # and parse the site with them after that self.proxy = None self.timeout = 60 self.width = 1024 self.height = 768 self.selenium_cookies = {} self.user_agent = ( 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36' ' (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36') socket.setdefaulttimeout(60) self._get_selenium_cookies_for_main_page() if kwargs.get('scrape_variants_with_extra_requests'): self._extra_requests = True super(OfficedepotProductsSpider, self).__init__(site_name=self.allowed_domains[0], *args, **kwargs)
def __init__(self, sort_mode=None, *args, **kwargs):
    """Initialize the Nike spider.

    Switches the scheduler to FIFO (breadth-first) queues, enables
    Crawlera, and routes requests through the Crawlera proxy.
    """
    from scrapy.conf import settings
    settings.overrides['DEPTH_PRIORITY'] = 1
    settings.overrides[
        'SCHEDULER_DISK_QUEUE'] = 'scrapy.squeue.PickleFifoDiskQueue'
    settings.overrides[
        'SCHEDULER_MEMORY_QUEUE'] = 'scrapy.squeue.FifoMemoryQueue'
    settings.overrides['CRAWLERA_ENABLED'] = True
    self.quantity = kwargs.get('quantity', 1000)  # default is 1000
    self.proxy = 'content.crawlera.com:8010'
    self.proxy_type = 'http'
    self.user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'
    self.br = BuyerReviewsBazaarApi(called_class=self)
    super(NikeProductSpider, self).__init__(
        site_name=self.allowed_domains[0], *args, **kwargs)
def __init__(self, sort_mode=None, *args, **kwargs):
    """Initialize the JCPenney spider.

    :param sort_mode: optional key into self.SORT_MODES (case-insensitive);
        unknown values are logged and the class-level SORTING default is
        kept.
    """
    self.buyer_reviews = BuyerReviewsBazaarApi(called_class=self)
    if sort_mode:
        if sort_mode.lower() not in self.SORT_MODES:
            # Fix: the original logged the literal '"%s" not in SORT_MODES'
            # without interpolating the offending value.
            self.log('"%s" not in SORT_MODES' % sort_mode)
        else:
            self.SORTING = self.SORT_MODES[sort_mode.lower()]
    super(JcpenneyProductsSpider, self).__init__(
        url_formatter=FormatterWithDefaults(
            sort_mode=self.SORTING or self.SORT_MODES['default']),
        site_name=self.allowed_domains[0],
        *args, **kwargs)
    settings.overrides['CONCURRENT_REQUESTS'] = 1
    self.user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
class DockersProductsSpider(BaseValidator, BaseProductsSpider): name = 'dockers_products' allowed_domains = ["dockers.com", "www.dockers.com"] start_urls = [] settings = DockersValidatorSettings SEARCH_URL = "http://www.dockers.com/US/en_US/search?Ntt={search_term}" # TODO: ordering PAGINATE_URL = ('http://www.dockers.com/US/en_US/includes/searchResultsScroll/?nao={nao}' '&url=%2FUS%2Fen_US%2Fsearch%3FD%3D{search_term}%26Dx' '%3Dmode%2Bmatchall%26N%3D4294961104%2B4294961101%2B4294965619%26Ntk' '%3DAll%26Ntt%3D{search_term}%26Ntx%3Dmode%2Bmatchall') CURRENT_NAO = 0 PAGINATE_BY = 12 # 12 products TOTAL_MATCHES = None # for pagination total_matches = None REVIEW_URL = "http://dockers.ugc.bazaarvoice.com/2080-en_us/{product_id}" \ "/reviews.djs?format=embeddedhtml&page={index}&" RELATED_PRODUCT = "https://levis.tt.omtrdc.net/m2/levis/mbox/ajax?" \ "mboxHost=www.dockers.com" \ "&mboxSession=1481449902450-970396" \ "&mboxCount=1" \ "&entity.id={product_id}" \ "&entity.categoryId={product_categories}" \ "&mbox=target-global-mbox" \ "&mboxId=0" \ "&mboxURL={product_url}" \ "&mboxReferrer=http://www.dockers.com/" \ "&mboxVersion=60" use_proxies = True handle_httpstatus_list = [404] def __init__(self, *args, **kwargs): self.br = BuyerReviewsBazaarApi(called_class=self) super(DockersProductsSpider, self).__init__( site_name=self.allowed_domains[0], *args, **kwargs) def _parse_single_product(self, response): return self.parse_product(response) def _init_firefox(self, proxy): from selenium import webdriver from selenium.webdriver.remote.remote_connection import RemoteConnection RemoteConnection.set_timeout(30) profile = webdriver.FirefoxProfile() #profile.set_preference("general.useragent.override", self.user_agent) profile.set_preference('intl.accept_languages', 'en-US') profile.set_preference("network.proxy.type", 1) # manual proxy configuration if proxy: # we assume only http proxies are accepted, format: http://host:port proxy, port = proxy.replace('http://', '').split(':') 
profile.set_preference("network.proxy.http", proxy) profile.set_preference("network.proxy.http_port", int(port)) profile.update_preferences() driver = webdriver.Firefox(profile) driver.set_window_size(1280, 1024) driver.set_page_load_timeout(60) driver.set_script_timeout(60) return driver def _is_product_page(self, response): return 'is_product_page' in response.meta def _get_product_links_from_serp(self, driver): result = [] for l in driver.find_elements_by_xpath( '//li[contains(@class, "product-tile")]' '//a[contains(@rel, "product")]' ): href = l.get_attribute('href') if href: if not href.startswith('http'): href = urlparse.urljoin('http://' + self.allowed_domains[0], href) result.append(href) return result @staticmethod def last_six_digits_the_same(lst): print lst if len(lst) < 7: return return len(set(lst[-6:-1])) == 1 # if all elements are the same, set's length will be 1 def parse(self, response): proxy = response.request.meta.get('proxy', None) if not self._is_product_page(response): self.total_matches = self._scrape_total_matches(response) display = Display(visible=0, size=(1280, 1024)) display.start() product_links = [] # scrape "quantity" products driver = self._init_firefox(proxy=proxy) try: driver.get('http://www.dockers.com/US/en_US/') except Exception as e: print(str(e)) self.log(str(e)) driver.find_element_by_name('Ntt').send_keys(self.searchterms[0] + '\n') time.sleep(10) # let AJAX finish new_meta = response.meta.copy() # get all products we need (scroll down) collected_products_len = [] num_of_errors = 0 while True: try: driver.execute_script("scrollTo(0,50000)") time.sleep(10) product_links = self._get_product_links_from_serp(driver) collected_products_len.append(len(product_links)) if self.last_six_digits_the_same(collected_products_len): break # last six iterations collected equal num of products if len(product_links) > self.quantity: break print 'Collected %i product links' % len(product_links) self.log('Collected %i product links' % 
len(product_links)) self.log('Statistics: %s' % report_statistics()) except Exception as e: print str(e) self.log('Error while doing pagination %s' % str(e), WARNING) num_of_errors += 1 if num_of_errors > 10: self.log('Too many webdriver errors', ERROR) driver.quit() display.stop() return #driver.save_screenshot('/tmp/1.png') new_meta['is_product_page'] = True for i, product_link in enumerate(product_links): new_meta['_ranking'] = i+1 yield Request(product_link, meta=new_meta, callback=self.parse_product) driver.quit() display.stop() def parse_product(self, response): meta = response.meta.copy() product = meta.get('product', SiteProductItem()) if response.status == 404 or "www.dockers.com/US/en_US/error" in response.url: product.update({"not_found": True}) product.update({"no_longer_available": True}) product.update({"locale": 'en-US'}) return product else: product.update({"no_longer_available": False}) reqs = [] meta['reqs'] = reqs product['ranking'] = response.meta.get('_ranking', None) product['total_matches'] = self.total_matches product['url'] = response.url product['site'] = self.allowed_domains[0] product['search_term'] = self.searchterms[0] if self.searchterms else None product['scraped_results_per_page'] = product['results_per_page'] = self.PAGINATE_BY # product id self.product_id = is_empty(response.xpath('//meta[@itemprop="model"]/@content').extract()) # product data in json self.js_data = self.parse_data(response) # Parse locate locale = 'en_US' cond_set_value(product, 'locale', locale) # Parse model cond_set_value(product, 'model', self.product_id) reseller_id_regex = "p\/([^\/&?\.\s]+)" reseller_id = re.findall(reseller_id_regex, response.url) reseller_id = reseller_id[0] if reseller_id else None cond_set_value(product, 'reseller_id', reseller_id) # Parse title title = self.parse_title(response) cond_set(product, 'title', title) # Parse image image = self.parse_image(response) cond_set_value(product, 'image_url', image) # Parse brand brand = 
self.parse_brand(response) cond_set_value(product, 'brand', brand) # Parse upc upc = self.parse_upc(response) cond_set_value(product, 'upc', upc) # Parse sku sku = self.parse_sku(response) cond_set_value(product, 'sku', sku) # Parse description description = self.parse_description(response) cond_set_value(product, 'description', description) # Parse price price = self.parse_price(response) cond_set_value(product, 'price', price) # Parse variants variants = self._parse_variants(response) product['variants'] = variants # Parse product_categories self.product_categories = self._extract_categories(response.body_as_unicode()) response.meta['marks'] = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0} real_count = is_empty(re.findall(r'<span itemprop="reviewCount">(\d+)<\/span>', response.body_as_unicode())) response.meta['product'] = product meta = response.meta if real_count: # Parse buyer reviews if int(real_count) > 8: for index, i in enumerate(xrange(9, int(real_count) + 1, 30)): reqs.append( Request( url=self.REVIEW_URL.format(product_id=self.product_id, index=index+2), dont_filter=True, callback=self.parse_buyer_reviews, meta=meta ) ) reqs.append( Request( url=self.REVIEW_URL.format(product_id=self.product_id, index=0), dont_filter=True, callback=self.parse_buyer_reviews, meta=meta )) if reqs: return self.send_next_request(reqs, response) return product def _parse_variants(self, response): """ Parses product variants. 
""" dk = DockersVariants() dk.setupSC(response) variants = dk._variants() return variants def parse_buyer_reviews(self, response): meta = response.meta.copy() buyer_reviews_per_page = self.br.parse_buyer_reviews_per_page(response) for k, v in buyer_reviews_per_page['rating_by_star'].iteritems(): response.meta['marks'][k] += v product = response.meta['product'] reqs = meta.get('reqs', []) product['buyer_reviews'] = BuyerReviews( num_of_reviews=buyer_reviews_per_page['num_of_reviews'], average_rating=buyer_reviews_per_page['average_rating'], rating_by_star=response.meta['marks'] ) # Updated related product url, previous res-x doesn't work product_id = self.product_id + 'US' url = self.RELATED_PRODUCT.format(product_id=product_id, product_categories=self.product_categories, product_url=product.get('url')) reqs.append( Request( url=url, dont_filter=True, callback=self.parse_related_product, meta=meta )) return self.send_next_request(reqs, response) def send_next_request(self, reqs, response): """ Helps to handle several requests """ req = reqs.pop(0) new_meta = response.meta.copy() if reqs: new_meta["reqs"] = reqs return req.replace(meta=new_meta) def parse_brand(self, response): brand = is_empty(response.xpath( '//meta[@itemprop="brand"]/@content').extract()) return brand def parse_title(self, response): title = response.xpath( '//meta[contains(@property,"og:title")]/@content').extract() if not title: title = response.xpath( '//meta[contains(@name,"og:title")]/@content').extract() return title def parse_data(self, response): data = re.findall(r'var buyStackJSON = \'(.+)\'; ', response.body_as_unicode()) if data: data = re.sub(r'\\(.)', r'\g<1>', data[0]) try: js_data = json.loads(data) except: return return js_data def parse_image(self, response): if self.js_data: try: image = self.js_data['colorid'][self.product_id]['gridUrl'] except: return return image def parse_related_product(self, response): product = response.meta['product'] text = 
self._extract_related_products_json(response.body_as_unicode()) related_products = self._build_related_products_array(text, product) if related_products: product['related_products'] = {} product['related_products']['buyers_also_bought'] = related_products return product def parse_description(self, response): if self.js_data: try: description = self.js_data['colorid'][self.product_id]['name'] except: return return description def parse_upc(self, response): if self.js_data: for v in self.js_data['sku'].values(): upc = v['upc'] upc = upc[-12:] if len(upc) < 12: count = 12-len(upc) upc = '0'*count+upc return upc def parse_sku(self, response): if self.js_data: for v in self.js_data['sku'].values(): skuid = v['skuid'] return skuid def parse_price(self, response): if self.js_data: price = self.js_data['colorid'][self.product_id]['price'] for price_data in price: if price_data['il8n'] == 'now': price = price_data['amount'] currency = is_empty(re.findall(r'currency":"(\w+)"', response.body_as_unicode())) if price and currency: price = Price(price=price, priceCurrency=currency) else: price = Price(price=0.00, priceCurrency="USD") return price def _scrape_total_matches(self, response): totals = response.css('.productCount ::text').extract() if totals: totals = totals[0].replace(',', '').replace('.', '').strip() if totals.isdigit(): if not self.TOTAL_MATCHES: self.TOTAL_MATCHES = int(totals) return int(totals) def _scrape_product_links(self, response): for link in response.xpath( '//li[contains(@class, "product-tile")]' '//a[contains(@rel, "product")]/@href' ).extract(): yield link, SiteProductItem() def _get_nao(self, url): nao = re.search(r'nao=(\d+)', url) if not nao: return return int(nao.group(1)) def _replace_nao(self, url, new_nao): current_nao = self._get_nao(url) if current_nao: return re.sub(r'nao=\d+', 'nao='+str(new_nao), url) else: return url+'&nao='+str(new_nao) def _scrape_next_results_page_link(self, response): if self.TOTAL_MATCHES is None: self.log('No "next 
result page" link!') return if self.CURRENT_NAO > self.TOTAL_MATCHES+self.PAGINATE_BY: return # it's over self.CURRENT_NAO += self.PAGINATE_BY return Request( self.PAGINATE_URL.format( search_term=response.meta['search_term'], nao=str(self.CURRENT_NAO)), callback=self.parse, meta=response.meta ) @staticmethod def _extract_categories(body): pattern = re.compile('var\s+categoryIds\s*=\s*\'(.+?)\;') categories = pattern.search(body) return categories.group(1) if categories else None def _extract_related_products_json(self, body): pattern = re.compile('\_AT\.applyWhenReady\(\s*\[\s*({.+?})\s*\]\s*\)\s*;', re.DOTALL) related_products_json = pattern.search(body) data = related_products_json.group(1) if related_products_json else None try: data = json.loads(data).get('content') return data except Exception as e: self.log('{}'.format(e.message)) return None @staticmethod def _build_related_products_array(text, product): s = Selector(text=text) related_products = [] product_url = product.get('url') for element in s.xpath('//li[contains(@class, "imagegrid")]'): url = element.xpath('.//a/@href').extract() title = element.xpath('.//p[@class="name"]/text()').extract() if url and title: url = urlparse.urljoin(product_url, url[0]) title = title[0] related_products.append(RelatedProduct(url=url, title=title)) return related_products
class DellProductSpider(BaseProductsSpider): name = 'dell_products' allowed_domains = ["dell.com", "recs.richrelevance.com"] handle_httpstatus_list = [404, 403, 502, 520] SEARCH_URL = "http://pilot.search.dell.com/{search_term}" REVIEW_URL = "http://reviews.dell.com/2341_mg/{product_id}/reviews.htm?format=embedded" VARIANTS_URL = "http://www.dell.com/api/configService.svc/postmoduleoverrides/json" VARIANTS_DATA = { 'c': 'us', 'l': 'en', 's': 'dhs', 'cs': '19', 'moduleTemplate': 'products/ProductDetails/mag/config_popup_mag', 'modErrorTemplate': 'products/module_option_validation', 'resultType': 'SingleModule', 'productCode': 'undefined' } # there are two types of product pages, each of them requires different related products processing RELATED_PROD_URL_V1 = ( "http://recs.richrelevance.com/rrserver/p13n_generated.js?" "pt=|item_page.mag_syspdpoc1|item_page.mag_syspdpoc2|item_page.mag_syspdpoc3|item_page.mag_syspdpoc4|item_page.mag_syspdpoc5&" "a=usdhsa5d5af7012d61fd1&rid=us_19_en_dhs&sgs=|us_19_en_dhs:us_19_en_dhs&flv=15.0.0&" "s=undefined{date}&n={n}&chi={chi}&ts={ts}&p={p}") RELATED_PROD_URL_V2 = ( "http://recs.richrelevance.com/rrserver/p13n_generated.js?" 
"pt=|item_page.storm_snp_pdp1|item_page.storm_snp_pdp2|item_page.storm_snp_pdp3|item_page.storm_snp_pdp4|item_page.storm_snp_pdp5&" "sgs=|us_04_en_bsd:us_04_en_bsd&rid=us_04_en_bsd&flv=11.2.999&l=1&" "u=ykOA15fokzi417dpJeveUF65A0NwWJeGhQ6pvWEfbCuYOurQKpNgzVVXCdsYKqf4&" "s=ykOA15fokzi417dpJeveUF65A0NwWJeGhQ6pvWEfbCuYOurQKpNgzVVXCdsYKqf4{date}&" "a=usbsda5d5af7012d61fd1&ts={ts}&p={p}") def __init__(self, sort_mode=None, *args, **kwargs): from scrapy.conf import settings settings.overrides['DEPTH_PRIORITY'] = 1 settings.overrides[ 'SCHEDULER_DISK_QUEUE'] = 'scrapy.squeue.PickleFifoDiskQueue' settings.overrides[ 'SCHEDULER_MEMORY_QUEUE'] = 'scrapy.squeue.FifoMemoryQueue' self.quantity = kwargs.get('quantity', 1000) # default is 1000 self.br = BuyerReviewsBazaarApi(called_class=self) super(DellProductSpider, self).__init__(site_name=self.allowed_domains[0], *args, **kwargs) def start_requests(self): for st in self.searchterms: yield Request( self.url_formatter.format( self.SEARCH_URL, search_term=urllib.quote(st.encode('utf-8')), ), meta={ 'search_term': st, 'remaining': self.quantity }, ) if self.product_url: prod = SiteProductItem() prod['is_single_result'] = True yield Request(self.product_url, self._parse_single_product, meta={'product': prod}) def _parse_single_product(self, response): return self.parse_product(response) def _get_product_links_from_serp(self, driver): results = [] links = driver.find_elements_by_xpath( '//h4/../../a[contains(@href, "/")]') for l in links: href = l.get_attribute('href') if href: if not href.startswith('http'): href = urlparse.urljoin( 'http://' + self.allowed_domains[0], href) results.append(href) return results def _is_product_page(self, response): return 'is_product_page' in response.meta def _init_webdriver(self): from selenium import webdriver from selenium.webdriver.remote.remote_connection import RemoteConnection RemoteConnection.set_timeout(30) driver = webdriver.Firefox() driver.set_window_size(1280, 1024) 
driver.set_page_load_timeout(60) driver.set_script_timeout(60) return driver def parse(self, response): if not self._is_product_page(response): product_links = [] # scrape "quantity" products display = Display(visible=0, size=(1280, 1024)) display.start() driver = self._init_webdriver() driver.get(response.url) time.sleep(6) # let AJAX finish new_meta = response.meta.copy() # get all products we need (or till the "show more products" button exists) paging_button = '//button[contains(@id, "paging-button")]' num_of_errors = 0 while driver.find_elements_by_xpath(paging_button): try: button = driver.find_elements_by_xpath(paging_button) button[0].click() time.sleep(4) product_links = self._get_product_links_from_serp(driver) if len(product_links) > self.quantity: break print 'Collected %i product links' % len(product_links) self.log('Collected %i product links' % len(product_links)) except Exception as e: print str(e) self.log('Error while doing pagination: %s' % str(e), WARNING) num_of_errors += 1 if num_of_errors > 10: self.log('Too many webdriver errors', ERROR) driver.quit() display.stop() return #driver.save_screenshot('/tmp/1.png') new_meta['is_product_page'] = True for i, product_link in enumerate(product_links): new_meta['_ranking'] = i + 1 yield Request(product_link, meta=new_meta, callback=self.parse_product) driver.quit() try: display.stop() except Exception as e: self.log('Exception on display.stop(): [%s]' % str(e)) @staticmethod def _parse_price(response): dell_price = response.xpath('//*[contains(text(), "Dell Price")]') dell_price = re.search( '\$([\d,]+\.\d+)', ''.join(dell_price.xpath('./..//text()').extract())) if dell_price: dell_price = dell_price.group(1) price = Price(price=dell_price, priceCurrency='USD') return price price = response.xpath('//*[contains(@name, "pricing_sale_price")]' '[contains(text(), "$")]//text()').extract() if not price: price = response.xpath( '//*[contains(@name, "pricing_retail_price")]' '[contains(text(), 
"$")]//text()').extract() if price: price = Price(price=price[0].strip().replace('$', ''), priceCurrency='USD') return price @staticmethod def _parse_image(response): img_src = response.xpath( '//*[contains(@id, "product_main_image")]' '//img[contains(@src, ".jp")]/@src').extract() if not img_src: img_src = response.xpath( '//*[contains(@class, "oneImageUp")]' '//img[contains(@src, ".jp")]/@src').extract() if not img_src: img_src = response.xpath( '//*[contains(@class, "leftRightMainImg")]' '//img[contains(@src, ".jp")]/@src').extract() if not img_src: img_src = response.xpath( '//*[contains(@class, "oneImageUp")]' '//img[contains(@data-original, ".jp")]/@data-original' ).extract() if img_src: return img_src[0] @staticmethod def _parse_brand(response, prod_title): # <meta itemprop="brand" content = "DELL"/> brand = response.xpath( '//meta[contains(@itermprop, "brand")]/@content').extract() if not brand: brand = response.xpath( '//a[contains(@href, "/brand.aspx")]/img/@alt').extract() if brand: return brand[0].title() if prod_title: brand = guess_brand_from_first_words(prod_title) if not brand: prod_title = prod_title.replace('New ', '').strip() brand = guess_brand_from_first_words(prod_title) if brand: return brand @staticmethod def _parse_description(response): desc = response.xpath('//*[@id="cntTabsCnt"]').extract() if not desc: desc = response.xpath( './/*[@id="AnchorZone3"]' '//div[not(contains(@class, "anchored_returntotop"))]' ).extract() if desc: return desc[0] def _related_products(self, response): results = [] rps = response.xpath( '//*[contains(@class, "psItemDescription")]//' 'div[contains(@class, "psTeaser")]//a[contains(@href, "productdetail.aspx")]' ) for rp in rps: results.append( RelatedProduct( rp.xpath('text()').extract()[0].strip(), rp.xpath('@href').extract() [0].strip())) # TODO: check if it's a valid format # TODO: scrape dynamic related products return results def parse_buyer_reviews(self, response): product = response.meta['product'] 
buyer_reviews = self.br.parse_buyer_reviews_per_page(response) product['buyer_reviews'] = buyer_reviews yield product def _get_stock_status(self, response, product): oos_element = response.xpath( '//a[contains(@class, "smallBlueBodyText")]' '[contains(@href, "makeWin")]//text()').extract() if oos_element: oos_element = oos_element[0].lower() if ('temporarily out of stock' in oos_element or 'pre-order' in oos_element): product['is_out_of_stock'] = True return product if 'limited supply available' in oos_element: product['is_out_of_stock'] = False product['limited_stock'] = LimitedStock(is_limited=True, items_left=-1) return product @staticmethod def _get_product_id(response): prod_id = re.findall(':productdetails:([\da-zA-Z\-\.]{1,50})\",', response.body_as_unicode()) if prod_id: return prod_id[0] def parse_product(self, response): prod = response.meta.get('product', SiteProductItem()) prod['_subitem'] = True _ranking = response.meta.get('_ranking', None) prod['ranking'] = _ranking prod['url'] = response.url cond_set(prod, 'title', response.css('h1 ::text').extract()) prod['price'] = DellProductSpider._parse_price(response) prod['image_url'] = DellProductSpider._parse_image(response) prod['description'] = DellProductSpider._parse_description(response) prod['brand'] = DellProductSpider._parse_brand(response, prod.get('title', '')) prod['related_products'] = self._related_products(response) response.meta['product'] = prod is_links, variants = self._parse_variants(response) if is_links: yield variants.pop(0) else: cond_set_value(prod, 'variants', self._collect_variants_from_dict(variants)) if 'This product is currently unavailable.' 
in response.body_as_unicode( ): prod['is_out_of_stock'] = True else: yield self._get_stock_status(response, prod) # this should be OOS field meta = {'product': prod} prod_id = self._get_product_id(response) if prod_id: # first page type if response.css('#bazaarVoice').extract(): meta.update({'br_page_type': 1}) yield Request( # reviews request url=self.REVIEW_URL.format(product_id=prod_id), dont_filter=True, callback=self.parse_buyer_reviews, meta=meta) buyer_reviews_iframe_src = response.xpath( '//iframe[contains(@src,"reviews.htm")]/@src').extract() if buyer_reviews_iframe_src: # second page type meta.update({'br_page_type': 2}) yield Request( # reviews request url=buyer_reviews_iframe_src[0].replace('format=noscript', ''), dont_filter=True, callback=self.parse_buyer_reviews, meta=meta) try: r_url, related_data = self.RELATED_PROD_URL_V1, self._collect_related_products_data_v1( response) except Exception: r_url, related_data = self.RELATED_PROD_URL_V2, self._collect_related_products_data_v2( response) yield Request( # related products request r_url.format(**related_data), callback=self._parse_related_products, meta=meta) yield prod def _collect_common_variants_data(self, response): data = self.VARIANTS_DATA.copy() _ = is_empty( response.xpath('//meta[@name="Country"]/@content').extract()) if _: data['c'] = _ _ = is_empty( response.xpath('//meta[@name="Language"]/@content').extract()) if _: data['l'] = _ _ = is_empty( response.xpath('//meta[@name="Segment"]/@content').extract()) if _: data['s'] = _ _ = is_empty( response.xpath('//meta[@name="CustomerSet"]/@content').extract()) if _: data['cs'] = _ _ = is_empty( response.xpath('//meta[@name="currentOcId"]/@content').extract()) if _: data['oc'] = _ else: self.log('No "OC" and/or "modId data found" <%s>' % response.url, WARNING) return None return data def _collect_specific_variants_data(self, variant, common_data): data = common_data.copy() oc = data.get('oc') if not oc: self.log('No OC data', ERROR) uniq_id = 
is_empty( variant.xpath( '//input[@value="%s"][contains(@id, "OrderCode")]/@id' % oc).extract()) uniq_id = uniq_id.replace('OrderCode', '') mod_id = is_empty( variant.xpath('.//span[contains(@class,"spec~%s~")]/@class' % uniq_id).extract()) mod_id = mod_id.split('~')[-1] data['modId'] = mod_id data['uiParameters'] = 'mainModuleId=%s&uniqueId=%s' % (mod_id, uniq_id) return data def _collect_variants_from_dict(self, variants): if not variants: return None max_options = 4 _variants = OrderedDict() keys = sorted(variants.keys()[:max_options]) for tmp in keys: _variants[tmp] = variants[tmp] options = product(*_variants.values()[:max_options]) data = [] for option in options: tmp = {} for i, key in enumerate(keys): tmp[key] = option[i] data.append( dict(in_stock=None, price=None, selected=None, properties=tmp)) return data def _parse_variant_data(self, response): json_resp = hjson.loads(response.body_as_unicode()) html = json_resp['ModulesHtml'] html = Selector(text=html) add_requests = response.meta.get('additional_requests') variants = response.meta['variants'] cur_var = response.meta['cur_variant'] choices = html.css('.catContent .optDescription::text').extract() variants[cur_var] = choices if add_requests: next_request = add_requests.pop(0) return next_request vs = self._collect_variants_from_dict(variants) prod = response.meta['product'] prod['variants'] = vs return prod def _parse_variants(self, response): variants_exist = bool(response.css('#Configurations').extract()) if variants_exist: common_req_params = self._collect_common_variants_data(response) variants_names = response.xpath( '//div[contains(@class, "specContent")]') data = {} additional_requests = [] for v_n in variants_names: k = is_empty( v_n.xpath( 'normalize-space(preceding-sibling::div[@class="specTitle"][1]/h5/text())' ).extract()) v = ' '.join(v_n.xpath('span/text()').extract()) is_ajax = bool(v_n.xpath('div[@class="dropdown"]').extract()) if is_ajax: form_data = 
self._collect_specific_variants_data( v_n, common_req_params) meta = response.meta.copy() meta['variants'] = data meta['cur_variant'] = k meta['additional_requests'] = additional_requests meta['product'] = response.meta['product'] additional_requests.append( FormRequest(self.VARIANTS_URL, callback=self._parse_variant_data, formdata=form_data, meta=meta)) else: data[k] = [v] if additional_requests: return True, additional_requests else: return False, data return None, None def _collect_related_products_data_v1(self, response): data = dict() cur_date = datetime.now() js_node = response.xpath( '//div[@id="mbox_default"]/following-sibling::script[1]') js_data = js_node.xpath('following-sibling::script[1]/text()').re( 'profile = (\{.*\})') js_data = hjson.loads(js_data[0]) data['p'] = is_empty( response.css('meta[name=currentOcId]::attr(content)').extract()) data['date'] = cur_date.today().strftime('%Y%m%d') data['ts'] = '%s000' % int(time.mktime(cur_date.timetuple())) data['n'] = js_data['catid'] data['chi'] = is_empty( js_node.xpath('text()').re("'profile.catid=(.*?)'")) return data def _collect_related_products_data_v2(self, response): data = dict() js_data = response.xpath( 'normalize-space(/html/head/script[@type="text/javascript"][1]/text())' ).re('\{.*\}') js_data = hjson.loads(js_data[0]) cur_date = datetime.now() data['date'] = cur_date.today().strftime('%Y%m%d') data['ts'] = '%s000' % int(time.mktime(cur_date.timetuple())) data['p'] = js_data['CJ']['ORDERCODE'].lower() return data def _parse_related_products(self, response): prod = response.meta['product'] html = re.search(r"html:'(.+?)'\}\]\},", response.body_as_unicode()) if not html: return prod html = Selector(text=html.group(1)) key_name = is_empty(html.css('.rrStrat::text').extract()) items = html.css('.rrRecs > ul > li') rel_prods = [] for item in items: title = is_empty(item.css('.rrItemName > a ::text').extract()) url = is_empty(item.css('a.rrLinkUrl::attr(href)').extract()) url = 
urlparse.urlparse(url) qs = urlparse.parse_qs(url.query) url = is_empty(qs['ct']) rel_prods.append(RelatedProduct(title=title, url=url)) prod['related_products'] = {key_name: rel_prods} return prod
def __init__(self, *args, **kwargs):
    """Initialize the Toys"R"Us spider.

    Sets up the Bazaarvoice buyer-reviews helper and delegates to the
    base spider with ``site_name`` taken from the first allowed domain.

    :param args: positional arguments forwarded to the base spider.
    :param kwargs: keyword arguments forwarded to the base spider.
    """
    # Create the reviews helper *before* calling the base constructor,
    # matching the initialization order used by every other spider in
    # this file (so base-class hooks can rely on `self.br` existing).
    self.br = BuyerReviewsBazaarApi(called_class=self)
    super(ToysrusProductsSpider, self).__init__(
        site_name=self.allowed_domains[0], *args, **kwargs)
class CvsProductsSpider(BaseProductsSpider):
    """Scrapes product data from cvs.com search results.

    Product details come from an embedded ld+json block; prices, variant
    details and buyer reviews are fetched through follow-up requests to
    CVS front-store JSON endpoints and the Bazaarvoice API.  Follow-up
    requests are chained one at a time through ``send_next_request`` via
    a ``reqs`` list carried in ``response.meta``.
    """

    name = 'cvs_products'
    allowed_domains = ["cvs.com", "api.bazaarvoice.com"]
    start_urls = []

    # HTML search page (first page of results).
    SEARCH_URL = "https://www.cvs.com/search/N-0?searchTerm={search_term}"

    # JSON endpoint used for paginating search results past page 1.
    SEARCH_URL_AJAX = "https://www.cvs.com/" \
                      "retail/frontstore/OnlineShopService?" \
                      "apiKey=c9c4a7d0-0a3c-4e88-ae30-ab24d2064e43&" \
                      "apiSecret=4bcd4484-c9f5-4479-a5ac-9e8e2c8ad4b0&" \
                      "appName=CVS_WEB&" \
                      "channelName=WEB&" \
                      "contentZone=resultListZone&" \
                      "deviceToken=7780&" \
                      "deviceType=DESKTOP&" \
                      "lineOfBusiness=RETAIL&" \
                      "navNum=20&" \
                      "operationName=getProductResultList&" \
                      "pageNum={page_num}&" \
                      "referer={referer}&" \
                      "serviceCORS=False&" \
                      "serviceName=OnlineShopService&" \
                      "sortBy=relevance&" \
                      "version=1.0"

    # Bazaarvoice reviews endpoint, keyed by the product/SKU id.
    REVIEW_URL = "http://api.bazaarvoice.com/data/products.json?" \
                 "passkey=ll0p381luv8c3ler72m8irrwo&apiversion=5.5&" \
                 "filter=id:{product_id}&stats=reviews"

    # Per-SKU price/promotion endpoint (getSkuPricePromotions).
    PRICE_URL = "https://www.cvs.com/retail/frontstore/productDetails?" \
                "apiKey=c9c4a7d0-0a3c-4e88-ae30-ab24d2064e43&" \
                "apiSecret=4bcd4484-c9f5-4479-a5ac-9e8e2c8ad4b0&" \
                "appName=CVS_WEB&" \
                "channelName=WEB&" \
                "code={sku}&" \
                "codeType=sku&" \
                "deviceToken=2695&" \
                "deviceType=DESKTOP&" \
                "lineOfBusiness=RETAIL&" \
                "operationName=getSkuPricePromotions&" \
                "serviceCORS=True&" \
                "serviceName=productDetails&" \
                "storeId=2294&" \
                "version=1.0"

    # Per-SKU detail endpoint (getSkuDetails: size/weight/flavor/UPC).
    PRODUCT_DETAILS = "https://www.cvs.com/retail/frontstore/productDetails?" \
                      "apiKey=c9c4a7d0-0a3c-4e88-ae30-ab24d2064e43&" \
                      "apiSecret=4bcd4484-c9f5-4479-a5ac-9e8e2c8ad4b0&" \
                      "appName=CVS_WEB&" \
                      "channelName=WEB&" \
                      "code={sku}&" \
                      "codeType=sku&" \
                      "deviceToken=2695&" \
                      "deviceType=DESKTOP&" \
                      "lineOfBusiness=RETAIL&" \
                      "operationName=getSkuDetails&" \
                      "serviceCORS=True&" \
                      "serviceName=productDetails&" \
                      "version=1.0"

    def __init__(self, *args, **kwargs):
        """Set up reviews helper and pagination state; enable Crawlera."""
        self.br = BuyerReviewsBazaarApi(called_class=self)
        # First search-results URL; recorded in parse() and replayed as
        # the Referer header for the AJAX pagination requests.
        self.referer = None
        self.first_time_products = None
        self.current_page = 1
        self.products_per_page = 20
        super(CvsProductsSpider, self).__init__(site_name=self.allowed_domains[0], *args, **kwargs)
        # NOTE(review): `settings` is presumably scrapy.conf.settings
        # imported at module level (not visible in this chunk) — confirm.
        settings.overrides['CRAWLERA_ENABLED'] = True

    def _set_brand(self, product, phrase, brands):
        """Pick the longest known brand contained in `phrase` (normalized)."""
        phrase = _normalize(phrase)
        # Longest brand first so e.g. "Acme Gold" wins over "Acme".
        for brand in sorted(brands, key=len, reverse=True):
            if _normalize(brand) in phrase:
                cond_set_value(product, 'brand', brand)
                break

    def parse(self, response):
        """Remember the first search URL as referer, then delegate to base."""
        print response.url
        if self.searchterms and not self.referer:
            self.referer = response.url
        return super(CvsProductsSpider, self).parse(response)

    def parse_product(self, response):
        """Populate the product item from the product page's ld+json.

        Queues follow-up requests for per-variant prices and for
        Bazaarvoice reviews; returns either the next chained Request or
        the finished product item.
        """
        brands = response.meta.get('brands', frozenset())
        product = response.meta['product']
        reqs = []
        if 'brand' not in product:
            descs = response.css('.brandBanner > a ::attr(title)')
            if descs:
                # Tuple-unpack asserts exactly one brand banner.
                desc, = descs.extract()
                self._set_brand(product, desc, brands)
        product['locale'] = "en-US"
        reseller_id_regex = "prodid-(\d+)"
        reseller_id = re.findall(reseller_id_regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, 'reseller_id', reseller_id)
        ld_json = is_empty(
            response.xpath('//*[@type="application/ld+json" '
                           'and contains(text(),"product")]/text()').extract())
        if ld_json:
            try:
                # Strip '@' keys and stray whitespace so json.loads accepts it.
                clean_json = re.sub('([^"])\n|\t|\r', '', ld_json.replace('@', ''))
                product_data = json.loads(clean_json)
                cond_set_value(product, 'title', product_data.get('name'))
                cond_set_value(product, 'brand', product_data.get('brand'))
                ######## variants ########
                variants = product_data.get('offers')
                if len(variants) > 1:
                    # One price request per variant SKU.
                    for variant in variants:
                        try:
                            sku = variant['itemOffered']['sku']
                            price_url = self.PRICE_URL.format(sku=sku)
                            reqs.append(
                                Request(price_url, self._parse_variant_new, meta=response.meta))
                        except:
                            pass
                main_variant = variants[0]
                description = main_variant.get(
                    'itemOffered', {}).get('description') or product_data.get('description')
                cond_set_value(product, 'description', description)
                # Prefer the skuId from the URL; fall back to first offer.
                main_skuID_search = re.search("skuId=(\d+)", response.url)
                if main_skuID_search:
                    main_skuID = main_skuID_search.group(1)
                else:
                    main_skuID = variants[0].get('itemOffered', {}).get('sku', None)
                cond_set_value(product, 'image_url',
                               main_variant.get('itemOffered').get('image'))
                # Stash for the variant callbacks downstream.
                response.meta['main_skuID'] = main_skuID
                response.meta['offers_variants'] = variants
                if main_variant.get('price'):
                    cond_set_value(
                        product, 'price',
                        Price(price=main_variant.get('price'), priceCurrency='USD'))
                # elif product_data.get('productId'):
                #     price_url = self.PRICE_URL.format(
                #         price_id=product_data.get('productId'))
                #     reqs.append(Request(price_url,
                #                         self._parse_price,
                #                         meta=response.meta))
                # cond_set_value(product, 'variants',
                #                self._parse_variants(variants, main_skuID))
                ##############################
                if main_skuID:
                    review_url = self.REVIEW_URL.format(product_id=main_skuID)
                    reqs.append(
                        Request(review_url, self._parse_review, meta=response.meta))
            except:
                # NOTE(review): broad except silently tolerates malformed
                # ld+json; only the traceback is printed.
                import traceback
                print traceback.print_exc()
        size = response.xpath(
            "//form[@id='addCart']/table/tr/td[@class='col1']/"
            "text()[.='Size:']/../../td[2]/text()").extract()
        cond_set(product, 'model', size, conv=string.strip)
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def send_next_request(self, reqs, response):
        """ Helps to handle several requests """
        # Pop the next queued request and carry the remainder in meta.
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)

    def _parse_variant_new(self, response):
        """Build one variant dict from a getSkuPricePromotions response."""
        product = response.meta['product']
        reqs = response.meta.get('reqs', [])
        data = json.loads(response.body)
        sku_price_promotions = data.get('response', {}).get('getSkuPricePromotions', [])
        if sku_price_promotions:
            sku_details = sku_price_promotions[0].get('skuDetails', [])
            if sku_details:
                variants = product.get('variants', [])
                variant = {}
                skuID = sku_details[0].get('skuId', '')
                variant['url'] = product.get('url', '') + "?skuId=%s" % skuID
                price = sku_details[0].get('priceInfo', {}).get('listPrice', None)
                if price:
                    # Also promote the variant's list price to the product.
                    cond_set_value(product, 'price', Price(price=price, priceCurrency='USD'))
                    variant['price'] = price
                main_skuID = response.meta['main_skuID']
                variant['selected'] = main_skuID == skuID
                # boh* = "back of house" (warehouse) inventory fields.
                bohInventory = sku_details[0].get('statusInfo', {}).get('bohInventory', 0)
                bohStockStatus = sku_details[0].get('statusInfo', {}).get(
                    'bohStockStatus', 'NOTAVAILABLE')
                onlineOnly = sku_details[0].get('statusInfo', {}).get('onlineOnly', False)
                onlineStockStatus = sku_details[0].get('statusInfo', {}).get(
                    'onlineStockStatus', None)
                # In stock if either warehouse or online stock says so.
                in_stock = False
                if bohInventory and bohStockStatus != 'NOTAVAILABLE':
                    in_stock = True
                if onlineStockStatus == 'INSTOCK':
                    in_stock = True
                variant['in_stock'] = in_stock
                variant['sku'] = skuID
                # del product['main_skuID']
                variant['properties'] = {}
                offers_variants = response.meta['offers_variants']
                for offers_variant in offers_variants:
                    # Check that the variant is not duplicated
                    item_offered = offers_variant.get('itemOffered', {})
                    this_sku = item_offered.get('sku', None)
                    if item_offered and this_sku == skuID:
                        attr = {}
                        # Queue a detail request to fill the properties later.
                        details_url = self.PRODUCT_DETAILS.format(sku=this_sku)
                        variant['properties'] = attr
                        reqs.append(
                            Request(details_url, self._parse_properties, meta=response.meta))
                        break
                variants.append(variant)
                product['variants'] = variants
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def _parse_properties(self, response):
        """Fill a variant's Size/Flavor/Weight/UPC from getSkuDetails."""
        product = response.meta['product']
        reqs = response.meta.get('reqs', [])
        data = json.loads(response.body)
        getSkuDetails = data.get('response', {}).get('getSkuDetails', [])
        if getSkuDetails:
            sku_details = getSkuDetails[0].get('skuDetails', [])
            if len(sku_details) > 0:
                detail = sku_details[0]['detail']
                skuSize = detail['skuSize']
                weight = detail['weight']
                flavor = detail['flavor']
                upcNumber = detail['upcNumber']
                variants = product.get('variants', [])
                skuID = sku_details[0].get('skuId', '')
                for idx, variant in enumerate(variants):
                    # Check that the variant is not duplicated
                    this_sku = variant.get('sku', None)
                    if this_sku == skuID:
                        attr = {}
                        attr['Size'] = skuSize
                        attr['Flavor'] = flavor
                        attr['Weight'] = weight
                        attr['UPCNumber'] = upcNumber
                        variant['properties'] = attr
                        variants[idx] = variant
                        break
                product['variants'] = variants
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def _parse_variants(self, variants, main_skuID):
        """Convert ld+json offer entries into variant dicts (deduped by SKU)."""
        if not variants:
            return None
        parsed_variants = []
        variants_visit = set()
        for variant in variants:
            # Check that the variant is not duplicated
            item_offered = variant.get('itemOffered', {})
            this_sku = item_offered.get('sku', None)
            if this_sku in variants_visit:
                continue
            variants_visit.add(this_sku)
            # Fill the Variant data
            vr = {}
            if variant['price']:
                vr['price'] = variant['price']
            availability = variant.get('availability', None)
            vr['in_stock'] = availability == "http://schema.org/InStock"
            vr['selected'] = main_skuID == this_sku
            if item_offered:
                attr = {}
                if item_offered.get('color'):
                    attr['Color'] = item_offered.get('color')
                # NOTE(review): this second check looks like a copy/paste
                # bug — it guards Weight on 'color' instead of 'weight';
                # confirm intent before changing.
                if item_offered.get('color'):
                    attr['Weight'] = item_offered.get('weight').get('value')
                vr['properties'] = attr
                vr['url'] = item_offered.get('url')
            parsed_variants.append(vr)
        parsed_variants[0]['selected'] = True
        return parsed_variants

    def _parse_review(self, response):
        """Attach Bazaarvoice buyer reviews to the product."""
        product = response.meta['product']
        reqs = response.meta.get('reqs', [])
        product['buyer_reviews'] = self.br.parse_buyer_reviews_products_json(
            response)
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def _scrape_total_matches(self, response):
        """Read the total result count from the "View Products (N)" tab."""
        totals = response.xpath(
            '//*[@id="resultsTabs"]//'
            'a[@title="View Products"]/text()').re('\((\d+)\)')
        if len(totals) > 1:
            self.log(
                "Found more than one 'total matches' for %s" % response.url,
                ERROR)
        elif totals:
            total = totals[0].strip()
            # Cached for the pagination cutoff in _scrape_next_results_page_link.
            self.total_matches_int = int(total)
            return int(total)
        else:
            self.log("Failed to find 'total matches' for %s" % response.url,
                     WARNING)
        return None

    def _scrape_product_links(self, response):
        """Yield (product URL, empty item) pairs from the results JSON."""
        all_links_iter = re.finditer(
            'detailsLink"\s*:\s*"(.*?)(\?skuId=\d+)?",', response.body)
        # Clean the links for the different variants of a product
        links_without_dup = []
        [
            links_without_dup.append(item)
            for item in map((lambda x: x.group(1)), all_links_iter)
            if item not in links_without_dup
        ]
        for link in links_without_dup:
            yield link, SiteProductItem()

    def _scrape_results_per_page(self, response):
        """CVS always serves 20 results per page."""
        return 20

    def _scrape_next_results_page_link(self, response):
        """Build the AJAX request for the next results page, or None at end."""
        # NOTE(review): url_parts/query_string are computed but never
        # used below — likely leftovers.
        url_parts = urlparse.urlsplit(response.url)
        query_string = urlparse.parse_qs(url_parts.query)
        ajax_search_url = self.SEARCH_URL_AJAX.format(
            referer=urllib.quote_plus(self.referer, ':'),
            page_num=self.current_page)
        self.current_page += 1
        # Stop a bit past the reported total (+30 slack).
        if self.current_page * self.products_per_page > self.total_matches_int + 30:
            return
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.cvs.com',
            'Pragma': 'no-cache',
            'Referer': self.referer,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
                          ' AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/49.0.2623.110 Safari/537.36'
        }
        return Request(ajax_search_url,
                       self.parse,
                       headers=headers,
                       meta=response.meta,
                       priority=1)

    def _parse_single_product(self, response):
        """Entry point for single-product (non-search) crawls."""
        return self.parse_product(response)

    def _get_products(self, response):
        """Yield product items/requests for one results page.

        Fills per-item bookkeeping (site, search term, totals, ranking)
        and emits either the finished item, a prepared Request, or a new
        product-page Request depending on what the link scraper produced.
        """
        remaining = response.meta['remaining']
        search_term = response.meta['search_term']
        prods_per_page = response.meta.get('products_per_page')
        total_matches = response.meta.get('total_matches')
        scraped_results_per_page = response.meta.get(
            'scraped_results_per_page')
        prods = self._scrape_product_links(response)
        if not prods_per_page:
            # Materialize prods to get its size.
            prods = list(prods)
            prods_per_page = len(prods)
            response.meta['products_per_page'] = prods_per_page
        if scraped_results_per_page is None:
            scraped_results_per_page = self._scrape_results_per_page(response)
            if scraped_results_per_page:
                self.log(
                    "Found %s products at the first page" % scraped_results_per_page,
                    INFO)
            else:
                scraped_results_per_page = prods_per_page
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to scrape number of products per page",
                            ERROR)
            response.meta[
                'scraped_results_per_page'] = scraped_results_per_page
        if total_matches is None:
            total_matches = self._scrape_total_matches(response)
            if total_matches is not None:
                response.meta['total_matches'] = total_matches
                self.log("Found %d total matches." % total_matches, INFO)
            else:
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to parse total matches for %s" % response.url,
                            ERROR)
        if total_matches and not prods_per_page:
            # Parsing the page failed. Give up.
            self.log("Failed to get products for %s" % response.url, ERROR)
            return
        for i, (prod_url, prod_item) in enumerate(islice(prods, 0, remaining)):
            # Initialize the product as much as possible.
            prod_item['site'] = self.site_name
            prod_item['search_term'] = search_term
            prod_item['total_matches'] = total_matches
            prod_item['results_per_page'] = prods_per_page
            prod_item['scraped_results_per_page'] = scraped_results_per_page
            # The ranking is the position in this page plus the number of
            # products from other pages.
            prod_item['ranking'] = (i + 1) + (self.quantity - remaining)
            if self.user_agent_key not in ["desktop", "default"]:
                prod_item['is_mobile_agent'] = True
            if prod_url is None:
                # The product is complete, no need for another request.
                yield prod_item
            elif isinstance(prod_url, Request):
                cond_set_value(prod_item, 'url', prod_url.url)  # Tentative.
                yield prod_url
            else:
                # Another request is necessary to complete the product.
                url = urlparse.urljoin(response.url, prod_url)
                cond_set_value(prod_item, 'url', url)  # Tentative.
                yield Request(url,
                              callback=self.parse_product,
                              meta={'product': prod_item})
class LeviProductsSpider(BaseValidator, BaseProductsSpider):
    """Scrapes product data from levi.com.

    Product details come from a ``buyStackJSON`` blob embedded in the
    page (cached on the spider as ``self.js_data``); buyer reviews are
    paged in from Bazaarvoice and related products from the res-x
    recommendation service.  Follow-up requests are chained one at a
    time through ``send_next_request``.
    """

    name = 'levi_products'
    allowed_domains = ["levi.com", "www.levi.com"]
    start_urls = []
    settings = LeviValidatorSettings
    SEARCH_URL = "http://www.levi.com/US/en_US/search?Ntt={search_term}"  # TODO: ordering

    # Infinite-scroll pagination endpoint; `nao` is the result offset.
    PAGINATE_URL = (
        'http://www.levi.com/US/en_US/includes/searchResultsScroll/?nao={nao}'
        '&url=%2FUS%2Fen_US%2Fsearch%3FD%3D{search_term}%26Dx'
        '%3Dmode%2Bmatchall%26N%3D4294960840%2B4294961101%2B4294965619%26Ntk'
        '%3DAll%26Ntt%3Ddress%26Ntx%3Dmode%2Bmatchall')

    CURRENT_NAO = 0
    PAGINATE_BY = 12  # 12 products
    TOTAL_MATCHES = None  # for pagination

    # Bazaarvoice embedded-HTML reviews endpoint, paged via {index}.
    REVIEW_URL = "http://levistrauss.ugc.bazaarvoice.com/9090-en_us/" \
                 "{product_id}/reviews.djs?format=embeddedhtml&page={index}&"

    # res-x recommendations ("buyers also bought") endpoint.
    RELATED_PRODUCT = "http://www.res-x.com/ws/r2/Resonance.aspx?" \
                      "appid=levi01&tk=811541814822703" \
                      "&ss=544367773691192" \
                      "&sg=1&" \
                      "&vr=5.3x&bx=true" \
                      "&sc=product4_rr" \
                      "&sc=product3_rr" \
                      "&sc=product1_r" \
                      "r&sc=product2_rr" \
                      "&ev=product&ei={product_id}" \
                      "&no=20" \
                      "&language=en_US" \
                      "&cb=certonaResx.showResponse" \
                      "&ur=http%3A%2F%2Fwww.levi.com%2FUS%2Fen_US%" \
                      "2Fwomens-jeans%2Fp%2F095450043&plk=&"

    # 404s are handled in parse_product (marked not_found), not dropped.
    handle_httpstatus_list = [404]
    use_proxies = True

    def __init__(self, *args, **kwargs):
        """Set up the Bazaarvoice helper and delegate to the base spider."""
        self.br = BuyerReviewsBazaarApi(called_class=self)
        super(LeviProductsSpider, self).__init__(site_name=self.allowed_domains[0], *args, **kwargs)

    def _parse_single_product(self, response):
        """Entry point for single-product (non-search) crawls."""
        return self.parse_product(response)

    def parse_product(self, response):
        """Populate the product item from a levi.com product page.

        Caches ``self.product_id`` and ``self.js_data`` for the
        per-field parse_* helpers, then queues paged review requests.
        Returns the next chained Request or the finished item.
        """
        product = response.meta.get('product', SiteProductItem())
        # Dead/error pages: flag and stop.
        if response.status == 404 or 'This product is no longer available' in response.body_as_unicode() \
                or "www.levi.com/US/en_US/error" in response.url:
            product.update({"not_found": True})
            product.update({"no_longer_available": True})
            return product
        reqs = []
        # product id
        self.product_id = is_empty(
            response.xpath('//meta[@itemprop="model"]/@content').extract())
        # product data in json
        self.js_data = self.parse_data(response)
        # Parse locate
        locale = 'en_US'
        cond_set_value(product, 'locale', locale)
        # Parse model
        cond_set_value(product, 'model', self.product_id)
        # Parse title
        title = self.parse_title(response)
        cond_set(product, 'title', title)
        # Parse image
        image = self.parse_image(response)
        cond_set_value(product, 'image_url', image)
        # Parse brand
        brand = self.parse_brand(response)
        cond_set_value(product, 'brand', brand)
        # Parse upc
        upc = self.parse_upc(response)
        cond_set_value(product, 'upc', upc)
        # Parse sku
        sku = self.parse_sku(response)
        cond_set_value(product, 'sku', sku)
        # Parse description
        description = self.parse_description(response)
        cond_set_value(product, 'description', description)
        # Parse price
        price = self.parse_price(response)
        cond_set_value(product, 'price', price)
        reseller_id_regex = "p\/(\d+)"
        reseller_id = re.findall(reseller_id_regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, 'reseller_id', reseller_id)
        try:
            variants = self._parse_variants(response)
        except KeyError:
            product['not_found'] = True
            return product
        # set reseller_id for variants as well
        for variant in variants:
            v_url = variant.get('url')
            if v_url:
                reseller_id = re.findall(reseller_id_regex, v_url)
                reseller_id = reseller_id[0] if reseller_id else None
            else:
                reseller_id = None
            variant['reseller_id'] = reseller_id
        product['variants'] = variants
        # Star-rating accumulator shared across paged review callbacks.
        response.meta['marks'] = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0}
        real_count = is_empty(
            re.findall(r'<span itemprop="reviewCount">(\d+)<\/span>',
                       response.body_as_unicode()))
        if real_count:
            # Parse buyer reviews
            if int(real_count) > 8:
                # One extra request per 30 reviews beyond the first page.
                for index, i in enumerate(xrange(9, int(real_count) + 1, 30)):
                    reqs.append(
                        Request(url=self.REVIEW_URL.format(
                            product_id=self.product_id, index=index + 2),
                            dont_filter=True,
                            callback=self.parse_buyer_reviews))
            reqs.append(
                Request(url=self.REVIEW_URL.format(product_id=self.product_id,
                                                   index=0),
                        dont_filter=True,
                        callback=self.parse_buyer_reviews))
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def _parse_variants(self, response):
        """ Parses product variants. """
        lv = LeviVariants()
        lv.setupSC(response)
        variants = lv._variants()
        return variants

    def parse_buyer_reviews(self, response):
        """Accumulate one page of reviews into the shared star counters."""
        meta = response.meta.copy()
        buyer_reviews_per_page = self.br.parse_buyer_reviews_per_page(response)
        for k, v in buyer_reviews_per_page['rating_by_star'].iteritems():
            response.meta['marks'][k] += v
        product = response.meta['product']
        reqs = meta.get('reqs')
        product['buyer_reviews'] = BuyerReviews(
            num_of_reviews=buyer_reviews_per_page['num_of_reviews'],
            average_rating=buyer_reviews_per_page['average_rating'],
            rating_by_star=response.meta['marks'])
        if reqs:
            # After reviews, chain a related-products request.
            reqs.append(
                Request(url=self.RELATED_PRODUCT.format(
                    product_id=self.product_id, index=0),
                    dont_filter=True,
                    callback=self.parse_related_product))
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def send_next_request(self, reqs, response):
        """ Helps to handle several requests """
        # Pop the next queued request and carry the remainder in meta.
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)

    def parse_brand(self, response):
        """Brand from the itemprop meta tag."""
        brand = is_empty(
            response.xpath('//meta[@itemprop="brand"]/@content').extract())
        return brand

    def parse_title(self, response):
        """Title from og:title (trademark signs stripped), else the h1."""
        title = response.xpath(
            '//meta[contains(@property, "og:title")]/@content').extract()
        if title:
            title = [title[0].replace('™', '').replace('\u2122', '')]
        else:
            title = response.xpath(
                '//h1[contains(@class, "title")]/text()').extract()
        return title

    def parse_data(self, response):
        """Extract and decode the embedded buyStackJSON blob, or None."""
        data = re.findall(r'var buyStackJSON = \'(.+)\'; ',
                          response.body_as_unicode())
        if data:
            # Unescape backslash sequences before JSON-decoding.
            data = re.sub(r'\\(.)', r'\g<1>', data[0])
            try:
                js_data = json.loads(data)
            except:
                return
            return js_data

    def parse_image(self, response):
        """Grid image URL for the current color id, if present."""
        if self.js_data:
            try:
                image = self.js_data['colorid'][self.product_id]['gridUrl']
            except:
                return
            return image

    def parse_related_product(self, response):
        """Parse the res-x JSONP callback into related-product entries."""
        related_prods = []
        product = response.meta['product']
        sample = response.body_as_unicode()
        try:
            # Strip the JSONP wrapper: "certonaResx.showResponse(...);".
            sample = sample.replace(u'certonaResx.showResponse(', '')
            sample = sample[:-2]
            data = json.loads(sample)
            html = data['Resonance']['Response'][2]['output']
        except Exception as e:
            self.log(
                'Error during parsing related products page: {}'.format(e))
            return product
        else:
            s = Selector(text=html)
            titles = s.xpath('//h4/text()').extract()  # Title
            urls = s.xpath('//img/@src').extract()  # Img url
            for title, url in zip(titles, urls):
                if url and title:
                    related_prods.append(RelatedProduct(title=title, url=url))
            product['related_products'] = {}
            if related_prods:
                product['related_products'][
                    'buyers_also_bought'] = related_prods
            return product

    def parse_description(self, response):
        """Description (the 'name' field) for the current color id."""
        if self.js_data:
            try:
                description = self.js_data['colorid'][self.product_id]['name']
            except:
                return
            return description

    def parse_upc(self, response):
        """Last 12 digits of a SKU's UPC.

        NOTE(review): the loop keeps only the value from the dict's last
        iteration order entry — effectively an arbitrary SKU's UPC.
        """
        if self.js_data:
            for v in self.js_data['sku'].values():
                upc = v['upc']
                upc = upc[-12:]
            return upc

    def parse_sku(self, response):
        """A SKU id (arbitrary last-iterated entry, as in parse_upc)."""
        if self.js_data:
            for v in self.js_data['sku'].values():
                skuid = v['skuid']
            return skuid

    def parse_price(self, response):
        """Current ('now') price for the product's color id.

        NOTE(review): the key 'il8n' looks like a typo of 'i18n' but
        matches whatever the site's JSON actually uses — verify before
        changing.
        """
        if self.js_data:
            price = self.js_data['colorid'][self.product_id]['price']
            for price_data in price:
                if price_data['il8n'] == 'now':
                    price = price_data['amount']
            currency = is_empty(
                re.findall(r'currency":"(\w+)"', response.body_as_unicode()))
            if price and currency:
                price = Price(price=price, priceCurrency=currency)
            else:
                price = Price(price=0.00, priceCurrency="USD")
            return price

    def _scrape_total_matches(self, response):
        """Total result count from .productCount (also cached for paging)."""
        totals = response.css('.productCount ::text').extract()
        if totals:
            totals = totals[0].replace(',', '').replace('.', '').strip()
            if totals.isdigit():
                if not self.TOTAL_MATCHES:
                    self.TOTAL_MATCHES = int(totals)
                return int(totals)

    def _scrape_product_links(self, response):
        """Yield (product URL, empty item) pairs from result tiles."""
        for link in response.xpath(
                '//li[contains(@class, "product-tile")]'
                '//a[contains(@rel, "product")]/@href').extract():
            yield link, SiteProductItem()

    def _get_nao(self, url):
        """Return the integer `nao` offset from a URL, or None."""
        nao = re.search(r'nao=(\d+)', url)
        if not nao:
            return
        return int(nao.group(1))

    def _replace_nao(self, url, new_nao):
        """Set or replace the `nao` offset parameter in a URL."""
        current_nao = self._get_nao(url)
        if current_nao:
            return re.sub(r'nao=\d+', 'nao=' + str(new_nao), url)
        else:
            return url + '&nao=' + str(new_nao)

    def _scrape_next_results_page_link(self, response):
        """Advance the scroll offset and build the next page Request."""
        if self.TOTAL_MATCHES is None:
            self.log('No "next result page" link!')
            return
        if self.CURRENT_NAO > self.TOTAL_MATCHES + self.PAGINATE_BY:
            return  # it's over
        self.CURRENT_NAO += self.PAGINATE_BY
        return Request(self.PAGINATE_URL.format(
            search_term=response.meta['search_term'],
            nao=str(self.CURRENT_NAO)),
            callback=self.parse,
            meta=response.meta)
class CostcoProductsSpider(BaseProductsSpider):
    """Scrapes product data from costco.com.

    The product page is additionally re-fetched through a headless
    Chromium (via Selenium + Xvfb) because the price is rendered by
    JavaScript.  Buyer reviews come from the Bazaarvoice API; follow-up
    requests are chained one at a time through ``send_next_request``.
    """

    name = "costco_products"
    allowed_domains = ["costco.com"]
    start_urls = []
    # FIX: the original literal contained the mojibake "¤tPage=1" —
    # the HTML entity "&curren;" decoded into "¤", corrupting the
    # "&currentPage=1" query parameter.  Restored the intended parameter.
    SEARCH_URL = "http://www.costco.com/CatalogSearch?pageSize=96" \
                 "&catalogId=10701&langId=-1&storeId=10301" \
                 "&currentPage=1&keyword={search_term}"

    # How many times to retry the Selenium page fetch before giving up.
    selenium_retries = 5
    DEFAULT_CURRENCY = u'USD'
    REVIEW_URL = 'http://api.bazaarvoice.com/data/products.json?passkey=bai25xto36hkl5erybga10t99&apiversion=5.5&filter=id:{product_id}&stats=reviews'

    def __init__(self, *args, **kwargs):
        """Set up the Bazaarvoice helper and delegate to the base spider."""
        self.br = BuyerReviewsBazaarApi(called_class=self)
        super(CostcoProductsSpider, self).__init__(
            site_name=self.allowed_domains[0], *args, **kwargs)

    def _parse_single_product(self, response):
        """Entry point for single-product (non-search) crawls."""
        return self.parse_product(response)

    def parse_product(self, response):
        """Populate the product item from a costco.com product page.

        Fetches the page again with Selenium to read the JS-rendered
        price, then extracts availability, model, brand, description,
        categories, shipping and review info from the static response.
        Returns the next chained Request or the finished item.
        """
        prod = response.meta['product']
        meta = response.meta.copy()
        reqs = []
        meta['reqs'] = reqs

        # TODO since response.body is already downloaded by scrapy
        # may try to run it in selenium instead of downloading the page again
        selenium_html = self._get_page_html_selenium(response.url)
        # TODO might as well use that html to extract other data
        for x in range(self.selenium_retries - 1):
            if not selenium_html:
                selenium_html = self._get_page_html_selenium(response.url)
            else:
                break
        if selenium_html:
            price = Selector(text=selenium_html).xpath(
                './/*[contains(@class, "your-price")]/span[@class="value"]/text()').extract()
            cond_set_value(prod, 'price',
                           Price(priceCurrency=self.DEFAULT_CURRENCY,
                                 price=price))

        # not longer available
        no_longer_available = response.xpath(
            '//*[@class="server-error" and contains(text(),'
            '"out of stock and cannot be added to your cart at this time")]')
        cond_set_value(prod, 'no_longer_available',
                       1 if no_longer_available else 0)
        if not no_longer_available and response.xpath('//h1[text()="Product Not Found"]'):
            prod['not_found'] = True
            return prod

        model = response.xpath('//div[@id="product-tab1"]//text()').re(
            'Model[\W\w\s]*')
        if len(model) > 0:
            cond_set(prod, 'model', model)
            if 'model' in prod:
                # Strip the leading "Model ..." label from the match.
                prod['model'] = re.sub(r'Model\W*', '', prod['model'].strip())

        title = response.xpath('//h1[@itemprop="name"]/text()').extract()
        cond_set(prod, 'title', title)
        # Title key must be present even if it is blank
        cond_set_value(prod, 'title', "")

        # Brand: scan the specs tab for a "Brand ..." line, keeping the
        # last occurrence.
        tab2 = ''.join(
            response.xpath('//div[@id="product-tab2"]//text()').extract()
        ).strip()
        brand = ''
        for i in tab2.split('\n'):
            if 'Brand' in i.strip():
                brand = i.strip()
        brand = re.sub(r'Brand\W*', '', brand)
        if brand:
            prod['brand'] = brand
        if not prod.get("brand"):
            # Fallback: text immediately following a "Brand:" label.
            brand = response.xpath(
                './/*[contains(text(), "Brand:")]/following-sibling::text()[1]').extract()
            brand = brand[0].strip() if brand else None
            cond_set_value(prod, 'brand', brand)

        des = response.xpath('//div[@id="product-tab1"]//text()').extract()
        des = ' '.join(i.strip() for i in des)
        if '[ProductDetailsESpot_Tab1]' in des.strip():
            # Placeholder marker present: skip the first child element.
            des = response.xpath(
                "//div[@id='product-tab1']/*[position()>1]//text()").extract()
            des = ' '.join(i.strip() for i in des)
            if des.strip():
                prod['description'] = des.strip()
        elif des:
            prod['description'] = des.strip()

        img_url = response.xpath('//img[@itemprop="image"]/@src').extract()
        cond_set(prod, 'image_url', img_url)
        cond_set_value(prod, 'locale', 'en-US')
        prod['url'] = response.url

        # Categories
        categorie_filters = ['home']
        # Clean and filter categories names from breadcrumb
        categories = list(
            filter((lambda x: x.lower() not in categorie_filters),
                   map((lambda x: x.strip()),
                       response.xpath('//*[@itemprop="breadcrumb"]//a/text()').extract())))
        category = categories[-1] if categories else None
        cond_set_value(prod, 'categories', categories)
        cond_set_value(prod, 'category', category)

        # Minimum Order Quantity
        try:
            minium_order_quantity = re.search(
                'Minimum Order Quantity: (\d+)',
                response.body_as_unicode()).group(1)
            cond_set_value(prod, 'minimum_order_quantity',
                           minium_order_quantity)
        except:
            pass

        # Shipping cost: case-insensitive match (via translate()) for
        # "shipping & handling:" or "shipping and handling:".
        shipping = ''.join(response.xpath(
            '//*[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ",'
            ' "abcdefghijklmnopqrstuvwxyz"), "shipping & handling:")]'
        ).re('[\d\.\,]+')).strip().replace(',', '')
        if not shipping:
            shipping = ''.join(response.xpath(
                '//*[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ",'
                ' "abcdefghijklmnopqrstuvwxyz"), "shipping and handling:")]'
            ).re('[\d\.\,]+')).strip().replace(',', '')
        if shipping:
            cond_set_value(prod, 'shipping_cost',
                           Price(priceCurrency=self.DEFAULT_CURRENCY,
                                 price=shipping))

        # Shipping-included flag: any of three phrasings on the page.
        shipping_included = ''.join(response.xpath(
            '//*[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ",'
            ' "abcdefghijklmnopqrstuvwxyz"),"shipping & handling included")]'
        ).extract()).strip().replace(',', '') or \
            response.xpath(
                '//*[@class="merchandisingText" and '
                'contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", '
                '"abcdefghijklmnopqrstuvwxyz"), "free shipping")]') or \
            ''.join(response.xpath(
                '//p[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ",'
                ' "abcdefghijklmnopqrstuvwxyz"),"shipping and handling included")]'
            ).extract()).strip().replace(',', '')
        cond_set_value(prod, 'shipping_included',
                       1 if shipping_included or shipping == "0.00" else 0)

        available_store = re.search(
            'Item may be available in your local warehouse',
            response.body_as_unicode())
        cond_set_value(prod, 'available_store', 1 if available_store else 0)

        not_available_store = re.search(
            'Not available for purchase on Costco.com',
            response.body_as_unicode())
        cond_set_value(prod, 'available_online',
                       0 if not_available_store else 1)

        if str(prod.get('available_online', None)) == '0' and str(prod.get('available_store', None)) == '0':
            prod['is_out_of_stock'] = True

        count_review = response.xpath(
            '//meta[contains(@itemprop, "reviewCount")]/@content').extract()
        # Product id is the digit group between dots in the URL.
        product_id = re.findall(r'\.(\d+)\.', response.url)
        cond_set_value(prod, 'reseller_id',
                       product_id[0] if product_id else None)

        if product_id and count_review:
            reqs.append(
                Request(
                    url=self.REVIEW_URL.format(product_id=product_id[0],
                                               index=0),
                    dont_filter=True,
                    callback=self.parse_buyer_reviews,
                    meta=meta
                ))

        if reqs:
            return self.send_next_request(reqs, response)
        return prod

    def parse_buyer_reviews(self, response):
        """Attach Bazaarvoice buyer reviews to the product."""
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])
        product['buyer_reviews'] = self.br.parse_buyer_reviews_products_json(response)
        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product

    def send_next_request(self, reqs, response):
        """ Helps to handle several requests """
        # Pop the next queued request and carry the remainder in meta.
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)

    def _search_page_error(self, response):
        """True (and log) when the search page shows no total matches."""
        if not self._scrape_total_matches(response):
            self.log("Costco: unable to find a match", ERROR)
            return True
        return False

    def _scrape_total_matches(self, response):
        """Total result count, trying three page layouts in turn."""
        count = response.xpath(
            '//*[@id="secondary_content_wrapper"]/div/p/span/text()'
        ).re('(\d+)')
        count = int(count[-1]) if count else None
        if not count:
            count = response.xpath(
                '//*[@id="secondary_content_wrapper"]'
                '//span[contains(text(), "Showing results")]/text()'
            ).extract()
            count = int(count[0].split(' of ')[1].replace('.', '').strip()) if count else None
        if not count:
            count = response.css(".table-cell.results.hidden-xs.hidden-sm.hidden-md>span").re(
                r"Showing\s\d+-\d+\s?of\s?([\d.,]+)")
            count = int(count[0].replace('.', '').replace(',', '')) if count else None
        return count

    def _scrape_product_links(self, response):
        """Yield (product URL, empty item) pairs from result thumbnails."""
        links = response.xpath(
            '//div[contains(@class,"product-list grid")]//a[contains(@class,"thumbnail")]/@href'
        ).extract()
        for link in links:
            yield link, SiteProductItem()

    def _scrape_next_results_page_link(self, response):
        """URL of the page after the active pagination entry, or None."""
        links = response.xpath(
            "//*[@class='pagination']"
            "/ul[2]"  # [1] is for the Items Per Page section which has .active.
            "/li[@class='active']"
            "/following-sibling::li[1]"  # [1] is to get just the next sibling.
            "/a/@href"
        ).extract()
        if links:
            link = links[0]
        else:
            link = None
        return link

    def _get_page_html_selenium(self, url):
        """Fetch `url` in headless Chromium and return the rendered HTML.

        Returns None when any step fails (the caller retries).
        """
        try:
            display = Display(visible=False)
            display.start()
            driver = self._init_chromium()
            driver.set_page_load_timeout(120)
            driver.set_script_timeout(120)
            socket.setdefaulttimeout(120)
            driver.set_window_size(1280, 768)
            driver.get(url)
            # Give client-side scripts time to render the price.
            time.sleep(5)
            page_html = driver.page_source
            driver.quit()
        except Exception as e:
            self.log('Exception while getting page html with selenium: {}'.format(e), WARNING)
            self.log('### Traceback: {}'.format(traceback.format_exc()), WARNING)
        else:
            return page_html

    def _init_chromium(self, proxy=None, proxy_type=None):
        """Create a Chromium webdriver, optionally behind a proxy.

        :param proxy: host:port of the proxy, or None.
        :param proxy_type: scheme for the proxy (e.g. 'http'), or None.
        """
        # TODO use random useragent script here?
        # UA = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0"
        chrome_flags = webdriver.DesiredCapabilities.CHROME  # this is for Chrome?
        chrome_options = webdriver.ChromeOptions()  # this is for Chromium
        if proxy:
            # '%' binds tighter than '+', so this yields
            # "--proxy-server=<type>://<proxy>".
            chrome_options.add_argument(
                '--proxy-server=%s' % proxy_type+'://'+proxy)
        # chrome_flags["chrome.switches"] = ['--user-agent=%s' % UA]
        # chrome_options.add_argument('--user-agent=%s' % UA)
        executable_path = '/usr/sbin/chromedriver'
        if not os.path.exists(executable_path):
            executable_path = '/usr/local/bin/chromedriver'
        # initialize webdriver
        driver = webdriver.Chrome(desired_capabilities=chrome_flags,
                                  chrome_options=chrome_options,
                                  executable_path=executable_path)
        return driver
class OfficedepotProductsSpider(BaseProductsSpider):
    """Scrapy spider for officedepot.com product/search pages.

    Uses headless Chromium once at startup to collect anti-bot cookies,
    then crawls with those cookies. Reviews and Q&A come from Bazaarvoice
    endpoints; variants from a mobile JSON endpoint.
    """
    name = 'officedepot_products'
    allowed_domains = [
        "officedepot.com",
        "www.officedepot.com",
        'bazaarvoice.com'
    ]
    start_urls = []
    # When True, variant URLs are re-requested to fill per-variant stock.
    _extra_requests = False
    # settings = DockersValidatorSettings
    SEARCH_URL = "http://www.officedepot.com/catalog/search.do?Ntt={search_term}&searchSuggestion=true&akamai-feo=off"
    PAGINATE_URL = (
        'http://www.officedepot.com/catalog/search.do?Ntx=mode+matchpartialmax&Nty=1&Ntk=all'
        '&Ntt={search_term}&N=5&recordsPerPageNumber=24&No={nao}')
    # Pagination state kept on the class (single-search spider assumption).
    CURRENT_NAO = 0
    PAGINATE_BY = 24  # 24 products
    TOTAL_MATCHES = None  # for pagination
    REVIEW_URL = "http://officedepot.ugc.bazaarvoice.com/2563" \
        "/{product_id}/reviews.djs?format=embeddedhtml"
    VARIANTS_URL = 'http://www.officedepot.com/mobile/getSkuAvailable' \
        'Options.do?familyDescription={name}&sku={sku}&noLogin=true'
    QA_URL = "http://officedepot.ugc.bazaarvoice.com/answers/2563/product/{product_id}/questions.djs?format=embeddedhtml"
    # # RELATED_PRODUCT = "http://www.res-x.com/ws/r2/Resonance.aspx?" \
    #     "appid=dockers01&tk=187015646137297" \
    #     "&ss=182724939426407" \
    #     "&sg=1&" \
    #     "&vr=5.3x&bx=true" \
    #     "&sc=product4_rr" \
    #     "&sc=product3_rr" \
    #     "&sc=product1_r" \
    #     "r&sc=product2_rr" \
    #     "&ev=product&ei={product_id}" \
    #     "&no=20" \
    #     "&language=en_US" \
    #     "&cb=certonaResx.showResponse" \
    #     "&ur=http%3A%2F%2Fwww.levi.com%2FUS%2Fen_US%" \
    #     "2Fwomens-jeans%2Fp%2F095450043&plk=&"

    def __init__(self, *args, **kwargs):
        """Collect anti-bot cookies via selenium before normal crawling."""
        self.br = BuyerReviewsBazaarApi(called_class=self)
        # officedepot seems to have a bot protection, so we first get the cookies
        # and parse the site with them after that
        self.proxy = None
        self.timeout = 60
        self.width = 1024
        self.height = 768
        self.selenium_cookies = {}
        self.user_agent = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
            ' (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36')
        socket.setdefaulttimeout(60)
        self._get_selenium_cookies_for_main_page()
        if kwargs.get('scrape_variants_with_extra_requests'):
            self._extra_requests = True
        super(OfficedepotProductsSpider, self).__init__(site_name=self.allowed_domains[0], *args, **kwargs)

    def _prepare_driver(self, driver):
        """Apply the spider's timeout and window-size settings to a driver."""
        driver.set_page_load_timeout(int(self.timeout))
        driver.set_script_timeout(int(self.timeout))
        driver.set_window_size(int(self.width), int(self.height))

    def _get_selenium_cookies_for_main_page(self):
        """Open the homepage in headless Chromium and save its cookies."""
        from pyvirtualdisplay import Display
        display = Display(visible=False)
        display.start()
        driver = self._init_chromium()
        self._prepare_driver(driver)
        try:
            driver.get('http://' + self.allowed_domains[0])
            time.sleep(10)
            for cookie in driver.get_cookies():
                self.selenium_cookies[cookie['name']] = cookie['value']
            driver.quit()
        except Exception as e:
            # NOTE(review): retry reuses the driver after quit() — the retry
            # `driver.get` likely fails on a quit driver; confirm intent.
            driver.quit()
            time.sleep(10)
            self.log(
                'Error getting cookies from homepage, trying one more time: %s' % str(e))
            driver.get('http://' + self.allowed_domains[0])
            time.sleep(10)
            for cookie in driver.get_cookies():
                self.selenium_cookies[cookie['name']] = cookie['value']
        try:
            driver.quit()
            display.stop()
        except Exception as e:
            self.log('Error on driver & display destruction: %s' % str(e))

    def _init_chromium(self):
        """Build a Chromium webdriver with the spider's UA and optional proxy."""
        from selenium import webdriver
        from selenium.webdriver.remote.remote_connection import RemoteConnection
        RemoteConnection.set_timeout(30)
        chrome_flags = webdriver.DesiredCapabilities.CHROME  # this is for Chrome?
        chrome_options = webdriver.ChromeOptions()  # this is for Chromium
        if self.proxy:
            chrome_options.add_argument('--proxy-server=%s' % self.proxy_type + '://' + self.proxy)
        chrome_flags["chrome.switches"] = ['--user-agent=%s' % self.user_agent]
        chrome_options.add_argument('--user-agent=%s' % self.user_agent)
        executable_path = '/usr/sbin/chromedriver'
        if not os.path.exists(executable_path):
            executable_path = '/usr/local/bin/chromedriver'
        # initialize webdriver, open the page and make a screenshot
        driver = webdriver.Chrome(desired_capabilities=chrome_flags,
                                  chrome_options=chrome_options,
                                  executable_path=executable_path)
        return driver

    def _init_firefox(self):
        """Build a Firefox webdriver with the spider's UA and optional proxy."""
        from selenium import webdriver
        from selenium.webdriver.remote.remote_connection import RemoteConnection
        RemoteConnection.set_timeout(30)
        profile = webdriver.FirefoxProfile()
        profile.set_preference("general.useragent.override", self.user_agent)
        profile.set_preference("network.proxy.type", 1)  # manual proxy configuration
        if self.proxy:
            if 'socks' in self.proxy_type:
                profile.set_preference("network.proxy.socks", self.proxy.split(':')[0])
                profile.set_preference("network.proxy.socks_port", int(self.proxy.split(':')[1]))
            else:
                profile.set_preference("network.proxy.http", self.proxy.split(':')[0])
                profile.set_preference("network.proxy.http_port", int(self.proxy.split(':')[1]))
        profile.update_preferences()
        driver = webdriver.Firefox(profile)
        return driver

    def _parse_single_product(self, response):
        return self.parse_product(response)

    @staticmethod
    def _get_product_id(url):
        """Extract the numeric product id from a /products/<id>/ URL."""
        match = re.search(r'/products/(\d{2,20})/', url)
        if match:
            return match.group(1)

    def parse_product(self, response):
        """Populate the product item from its page and queue follow-up
        requests (reviews, variants, Q&A) via the meta['reqs'] chain.
        """
        meta = response.meta
        product = meta.get('product', SiteProductItem())
        reqs = []
        meta['reqs'] = reqs
        product['_subitem'] = True
        # Parse locate
        locale = 'en_US'
        cond_set_value(product, 'locale', locale)
        # Parse title
        title = self.parse_title(response)
        cond_set(product, 'title', title, conv=string.strip)
        # Parse image
        image = self.parse_image(response)
        cond_set(product, 'image_url', image)
        # Parse brand
        brand = self.parse_brand(response)
        cond_set_value(product, 'brand', brand)
        # Parse sku
        sku = self.parse_sku(response)
        cond_set_value(product, 'sku', sku)
        # Parse description
        description = self.parse_description(response)
        cond_set_value(product, 'description', description)
        # Parse price
        price = self.parse_price(response)
        cond_set_value(product, 'price', price)
        # Parse model
        model = self._parse_model(response)
        cond_set_value(product, 'model', model)
        # Parse reseller_id
        reseller_id = self.parse_reseller_id(response)
        cond_set_value(product, "reseller_id", reseller_id)
        # Parse is out of stock
        oos = self._parse_is_out_of_stock(response)
        cond_set_value(product, 'is_out_of_stock', oos)
        # Parse categories and category
        categories = self._parse_categories(response)
        cond_set_value(product, 'categories', categories)
        if categories:
            cond_set_value(product, 'category', categories[-1])
        # Parse related products
        related_product = self._parse_related_product(response)
        cond_set_value(product, 'related_products', related_product)
        # Review count scraped from inline markup; consumed later by
        # parse_buyer_reviews via meta['_br_count'].
        br_count = is_empty(
            re.findall(r'<span itemprop="reviewCount">(\d+)<\/span>',
                       response.body_as_unicode()))
        meta['_br_count'] = br_count
        meta['product'] = product
        reqs.append(
            Request(url=self.REVIEW_URL.format(
                product_id=self._get_product_id(response.url)),
                dont_filter=True,
                callback=self.parse_buyer_reviews,
                meta=meta))
        sku = is_empty(response.xpath('//input[@name="id"]/@value').extract())
        name = is_empty(
            response.xpath('//h1[@itemprop="name"]/text()').re('(.*?),'))
        if sku and name and self.scrape_variants_with_extra_requests:
            name = urllib.quote_plus(name.strip().encode('utf-8'))
            reqs.append(
                Request(url=self.VARIANTS_URL.format(name=name, sku=sku),
                        callback=self._parse_variants,
                        meta=meta))
        # parse questions & answers
        reqs.append(
            Request(url=self.QA_URL.format(
                product_id=self._get_product_id(response.url)),
                callback=self._parse_questions,
                meta=meta,
                dont_filter=True))
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def parse_reseller_id(self, response):
        """Reseller id is the first numeric path segment of the URL."""
        regex = "\/(\d+)"
        reseller_id = re.findall(regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        return reseller_id

    def _parse_questions(self, response):
        """Scrape Bazaarvoice Q&A markup (regex-based) into
        product['all_questions'], then continue the request chain.
        """
        meta = response.meta
        reqs = response.meta['reqs']
        product = response.meta['product']
        qa = []
        questions_ids_regex = """BVQAQuestionSummary.+?javascript:void.+?>([^<]+)[^"']+["']BVQAQuestionMain(\d+)(?:.+?BVQAQuestionDetails.+?div>([^<]+)?).+?BVQAElapsedTime.+?>([^<]+)"""
        questions_ids = re.findall(questions_ids_regex,
                                   response.body_as_unicode())
        for (question_summary, question_id,
             question_details, question_date) in questions_ids:
            # Convert date format
            if question_date:
                try:
                    from dateutil.relativedelta import relativedelta
                    years = re.findall("(\d+?)\s+?years", question_date)
                    years = years[0] if years else '0'
                    years = int(years) if years.isdigit() else '0'
                    months = re.findall("(\d+?)\s+?months", question_date)
                    months = months[0] if months else '0'
                    months = int(months) if months.isdigit() else '0'
                    if not months and not years:
                        converted_date = None
                    else:
                        # Relative "x years y months ago" -> absolute date.
                        converted_date = datetime.now() - relativedelta(
                            years=years, months=months)
                        converted_date = converted_date.strftime("%Y-%m-%d")
                except Exception as e:
                    converted_date = None
                    self.log(
                        'Failed to parse date, setting date to None {}'.format(
                            e))
            else:
                converted_date = None
            # regex to get part of response that contain all answers to question with given id
            text_r = "BVQAQuestion{}Answers(.+?)BVQAQuestionDivider".format(
                question_id)
            all_a_text = re.findall(text_r, response.body_as_unicode())
            all_a_text = ''.join(all_a_text[0]) if all_a_text else ''
            answers_regex = r"Answer:.+?>([^<]+)"
            answers = re.findall(answers_regex, all_a_text)
            answers = [{'answerText': a} for a in answers]
            # NOTE(review): 'qestionSmmary' key is misspelled but is a runtime
            # data key — downstream consumers may depend on it; do not rename
            # without checking them.
            question = {
                'questionDate': converted_date,
                'questionId': question_id,
                'questionDetail':
                    question_details.strip() if question_details else '',
                'qestionSmmary':
                    question_summary.strip() if question_summary else '',
                'answers': answers,
                'totalAnswersCount': len(answers)
            }
            qa.append(question)
        product['all_questions'] = qa
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def clear_text(self, str_result):
        """Strip tabs/newlines and non-breaking spaces from extracted text."""
        return str_result.replace("\t", "").replace("\n", "").replace(
            "\r", "").replace(u'\xa0', ' ').strip()

    def _parse_is_out_of_stock(self, response):
        oos = response.xpath('//*[@itemprop="availability"'
                             ' and @content="http://schema.org/OutOfStock"]')
        return bool(oos)

    def _parse_model(self, response):
        model = response.xpath(
            '//*[@id="attributemodel_namekey"]/text()').extract()
        if model:
            return model[0].strip()

    def _parse_categories(self, response):
        categories = response.xpath('//*[@id="siteBreadcrumb"]//'
                                    'span[@itemprop="name"]/text()').extract()
        return categories

    def _parse_related_product(self, response):
        """Collect RelatedProduct entries from the 'relatedItems' table."""
        results = []
        base_url = response.url
        for related_product in response.xpath(
                '//*[@id="relatedItems"]'
                '//tr[contains(@class,"hproduct")]'
                '/td[@class="description"]/a'):
            name = is_empty(related_product.xpath('text()').extract())
            url = is_empty(related_product.xpath('@href').extract())
            if name and url:
                results.append(
                    RelatedProduct(title=name,
                                   url=urlparse.urljoin(base_url, url)))
        return results

    def _parse_variants(self, response):
        """ Parses product variants. """
        reqs = response.meta['reqs']
        product = response.meta['product']
        data = json.loads(response.body)
        variants = []
        if data.get('success'):
            for sku in data.get('skus', []):
                vr = {}
                vr['url'] = urlparse.urljoin(response.url, sku.get('url'))
                vr['skuId'] = sku.get('sku')
                price = is_empty(
                    re.findall('\$([\d\.]+)',
                               sku.get('attributesDescription', '')))
                if price:
                    vr['price'] = price
                name = sku.get('description', '')
                if name:
                    vr['properties'] = {'title': name}
                vr['image_url'] = sku.get('thumbnailImageUrl').split('?')[0]
                variants.append(vr)
        product['variants'] = variants
        if product.get('variants') and self._extra_requests:
            # Re-request each variant page to fill per-variant stock status.
            variants_urls = [p.get('url') for p in product['variants']]
            for var_url in variants_urls:
                req = Request(url=var_url,
                              callback=self._parse_in_stock_for_variants)
                req.meta['product'] = product
                reqs.append(req)
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    # parse variants one by one and set out of stock status for each variant
    def _parse_in_stock_for_variants(self, response):
        reqs = response.meta['reqs']
        product = response.meta['product']
        oos = self._parse_is_out_of_stock(response)
        for variant in product['variants']:
            if variant['url'] == response.url:
                variant['in_stock'] = not oos
                break
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def parse_buyer_reviews(self, response):
        """Build BuyerReviews from the Bazaarvoice embedded-html response,
        using the review count stashed in meta['_br_count'].
        """
        meta = response.meta.copy()
        reqs = meta['reqs']
        self.br.br_count = meta['_br_count']
        buyer_reviews_per_page = self.br.parse_buyer_reviews_per_page(response)
        product = response.meta['product']
        product['buyer_reviews'] = BuyerReviews(**buyer_reviews_per_page)
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def send_next_request(self, reqs, response):
        """ Helps to handle several requests """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)

    def parse_brand(self, response):
        brand = is_empty(
            response.xpath('//td[@itemprop="brand"]/@content').extract())
        if not brand:
            brand = is_empty(
                response.xpath('//td[@itemprop="brand"]/text()').extract())
        if brand:
            brand = brand.strip()
        return brand

    def parse_title(self, response):
        title = response.xpath(
            '//h1[contains(@itemprop, "name")]/text()').extract()
        return title

    def parse_data(self, response):
        """Extract and decode the MasterTmsUdo JSON blob, or None on failure."""
        data = re.findall(r'var MasterTmsUdo \'(.+)\'; ',
                          response.body_as_unicode())
        if data:
            # Unescape backslash-escaped characters before JSON decoding.
            data = re.sub(r'\\(.)', r'\g<1>', data[0])
            try:
                js_data = json.loads(data)
            except:
                return
            return js_data

    def parse_image(self, response):
        img = response.xpath(
            '//img[contains(@id, "mainSkuProductImage")]/@src').extract()
        return img

    def parse_description(self, response):
        description = response.xpath(
            '//div[contains(@class, "sku_desc")]').extract()
        if description:
            return self.clear_text(description[0])
        else:
            return ''

    def parse_sku(self, response):
        sku = response.xpath(
            '//td[contains(@id, "basicInfoManufacturerSku")]/text()').extract(
        )
        # sku = response.xpath('//div[contains(@id, "skuValue")]/text()').extract()
        if sku:
            return self.clear_text(sku[0])

    def parse_price(self, response):
        price = response.xpath(
            '//meta[contains(@itemprop, "price")]/@content').extract()
        currency = response.xpath(
            '//meta[contains(@itemprop, "priceCurrency")]/@content').extract()
        if price and currency:
            price = Price(price=price[0], priceCurrency=currency[0])
        else:
            # Fallback price when the page exposes neither value.
            price = Price(price=0.00, priceCurrency="USD")
        return price

    def parse_paginate_link(self, response, nao):
        """Find the category-pagination link whose &No= matches `nao`."""
        check_page = '&No=%s' % nao
        for link in response.xpath(
                '//a[contains(@class, "paging")]/@href').extract():
            if check_page in link:
                u = urlparse.urlparse(link)
                return urlparse.urljoin('http://www.officedepot.com', u.path)

    def parse_category_link(self, response):
        # NOTE(review): collects links but returns None — looks unfinished.
        categories_links = []
        for link in response.xpath(
                '//div[contains(@class, "category_wrapper")]/a[contains(@class, "link")]/@href'
        ).extract():
            categories_links.append(link)

    def _scrape_total_matches(self, response):
        totals = response.xpath(
            '//div[contains(@id, "resultCnt")]/text()').extract()
        if totals:
            totals = totals[0].replace(',', '').replace('.', '').strip()
            if totals.isdigit():
                if not self.TOTAL_MATCHES:
                    self.TOTAL_MATCHES = int(totals)
                return int(totals)

    def _scrape_product_links(self, response):
        items = response.xpath(
            '//div[contains(@class, "descriptionFull")]/'
            'a[contains(@class, "med_txt")]/@href').extract() or response.css(
                '.desc_text a::attr("href")').extract()
        # Scraper was redirected to product page instead of search results page
        if not items and "officedepot.com/a/products" in response.url:
            prod = SiteProductItem(search_redirected_to_product=True)
            # TODO we may not need any data for product aside from "search_redirected_to_product" flag.
            # Rework if that's the case - CON-28287
            req = Request(response.url,
                          callback=self.parse_product,
                          dont_filter=True)
            req.meta["remaining"] = 0
            req.meta['product'] = prod
            yield req, prod
        else:
            for link in items:
                yield link, SiteProductItem()

    def _get_nao(self, url):
        """Return the current &nao= offset from a URL, or None."""
        nao = re.search(r'nao=(\d+)', url)
        if not nao:
            return
        return int(nao.group(1))

    def _replace_nao(self, url, new_nao):
        """Return `url` with its &nao= offset replaced (or appended)."""
        current_nao = self._get_nao(url)
        if current_nao:
            return re.sub(r'nao=\d+', 'nao=' + str(new_nao), url)
        else:
            return url + '&nao=' + str(new_nao)

    def _scrape_next_results_page_link(self, response):
        if self.TOTAL_MATCHES is None:
            self.log('No "next result page" link!')
            # # TODO: check result by categories
            # return self.parse_category_link(response)
            return
        #if self.CURRENT_NAO > self.TOTAL_MATCHES+self.PAGINATE_BY:
        #    return  # all the products have been collected
        if self.CURRENT_NAO > self.quantity + self.PAGINATE_BY:
            return  # num_products > quantity
        self.CURRENT_NAO += self.PAGINATE_BY
        if '/a/browse/' in response.url:
            # paginate in category or subcategory
            new_paginate_url = self.parse_paginate_link(
                response, self.CURRENT_NAO)
            if new_paginate_url:
                return Request(new_paginate_url,
                               callback=self.parse,
                               meta=response.meta,
                               cookies=self.selenium_cookies)
        return Request(self.PAGINATE_URL.format(
            search_term=response.meta['search_term'],
            nao=str(self.CURRENT_NAO)),
            callback=self.parse,
            meta=response.meta,
            cookies=self.selenium_cookies)
class HomedepotProductsSpider(BaseValidator, BaseProductsSpider):
    """Scrapy spider for homedepot.com product/search pages.

    Reviews come from a Bazaarvoice endpoint, related products from the
    Certona recommendation API, and variant details from a JSON SKU
    endpoint.

    Fixes over the previous revision: both "Unknown currency" log calls
    used ``'Unknown currency at' % response.url`` — a %-format on a string
    with no conversion specifier, which raises TypeError at runtime instead
    of logging. They now use a proper ``%s`` placeholder.
    """
    name = 'homedepot_products'
    allowed_domains = ["homedepot.com", "origin.api-beta.homedepot.com"]
    start_urls = []
    settings = HomedepotValidatorSettings

    SEARCH_URL = "http://www.homedepot.com/s/{search_term}?NCNI-5"
    DETAILS_URL = "http://www.homedepot.com/p/getSkuDetails?itemId=%s"
    REVIEWS_URL = "http://homedepot.ugc.bazaarvoice.com/1999m/%s/" \
                  "reviews.djs?format=embeddedhtml"
    RECOMMENDED_URL = "http://origin.api-beta.homedepot.com/ProductServices/v2/products/" \
                      "recommendation?type=json&key=tRXWvUBGuAwEzFHScjLw9ktZ0Bw7a335"

    # Class-level de-dup of already-yielded product links.
    # NOTE(review): shared across instances — fine for one spider per process.
    product_filter = []

    def __init__(self, *args, **kwargs):
        # All this is to set the site_name since we have several
        # allowed_domains.
        self.br = BuyerReviewsBazaarApi()
        super(HomedepotProductsSpider, self).__init__(
            site_name=self.allowed_domains[0], *args, **kwargs)

    def _parse_single_product(self, response):
        return self.parse_product(response)

    @staticmethod
    def _parse_no_longer_available(response):
        """True when the page shows the 'product not currently available' error."""
        message = response.xpath(
            '//div[@class="error" and '
            'contains(., "The product you are trying to view is not currently available.")]'
        )
        return bool(message)

    def parse_product(self, response):
        """Populate the product item and dispatch follow-up requests
        (Certona related products, Bazaarvoice reviews, or SKU variants).
        """
        product = response.meta['product']
        product['_subitem'] = True

        if self._parse_no_longer_available(response):
            product['no_longer_available'] = True
            return product
        else:
            product['no_longer_available'] = False

        cond_set(
            product, 'title',
            response.xpath(
                "//h1[contains(@class, 'product-title')]/text()").extract())

        brand = response.xpath("//h2[@itemprop='brand']/text()").extract()
        brand = ["".join(brand).strip()]
        cond_set(product, 'brand', brand)

        cond_set(
            product, 'image_url',
            response.xpath("//div[@class='product_mainimg']/img/@src |"
                           "//img[@id='mainImage']/@src").extract())

        cond_set(
            product, 'price',
            response.xpath("//div[@class='pricingReg']"
                           "/span[@itemprop='price']/text()").extract())

        # Reseller id is the first numeric path segment of the URL.
        reseller_id_regex = "\/(\d+)"
        reseller_id = re.findall(reseller_id_regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, 'reseller_id', reseller_id)

        if product.get('price', None):
            if '$' not in product['price']:
                # FIX: was "'Unknown currency at' % response.url" (TypeError).
                self.log('Unknown currency at %s' % response.url)
            else:
                product['price'] = Price(price=product['price'].replace(
                    ',', '').replace('$', '').strip(),
                    priceCurrency='USD')

        if not product.get('price'):
            price = response.xpath(
                "//div[@class='pricingReg']"
                "/span[@itemprop='price']/text() |"
                "//div[contains(@class, 'pricingReg')]/span[@itemprop='price']"
            ).re(FLOATING_POINT_RGEX)
            if price:
                product["price"] = Price(priceCurrency="USD", price=price[0])

        try:
            product['model'] = response.css(
                '.product_details.modelNo ::text').extract()[0].replace(
                    'Model', '').replace('#', '').strip()
        except IndexError:
            pass

        internet_no = response.css('#product_internet_number ::text').extract()
        if internet_no:
            internet_no = internet_no[0]

        upc = is_empty(re.findall("ItemUPC=\'(\d+)\'", response.body))
        if upc:
            product["upc"] = upc
        # Prefer the <upc> element when present.
        upc = response.xpath("//upc/text()").re('\d+')
        if upc:
            product["upc"] = upc[0]

        desc = response.xpath("//div[@id='product_description']"
                              "/div[contains(@class,'main_description')]"
                              "/descendant::*[text()]/text()"
                              "//div[contains(@class, 'main_description')] |"
                              "//div[@id='product_description']").extract()
        desc = " ".join(l.strip() for l in desc if len(l.strip()) > 0)
        product['description'] = desc

        product['locale'] = "en-US"

        metadata = response.xpath(
            "//script[contains(text(),'PRODUCT_METADATA_JSON')]"
            "/text()").re('var PRODUCT_METADATA_JSON = (.*);')
        skus = []
        if metadata:
            metadata = metadata[0]
            jsmeta = hjson.loads(metadata)
            try:
                skus = [jsmeta["attributeDefinition"]["defaultSku"]]
                response.meta['skus'] = skus
                metaname = jsmeta['attributeDefinition']['attributeListing'][
                    0]['label']
                response.meta['attributelabel'] = metaname
            except (KeyError, IndexError):
                self.log("Incomplete data from Javascript.", DEBUG)

        certona_payload = self._gen_payload(response)

        if certona_payload:
            new_meta = response.meta.copy()
            new_meta['product'] = product
            new_meta['handle_httpstatus_list'] = [404, 415]
            new_meta['internet_no'] = internet_no
            headers = {
                'Proxy-Connection': 'keep-alive',
                'Content-Type': 'application/json'
            }
            return Request(
                self.RECOMMENDED_URL,
                callback=self._parse_related_products,
                headers=headers,
                body=json.dumps(certona_payload),
                method="POST",
                meta=new_meta,
                priority=1000,
            )

        if internet_no:
            return Request(
                url=self.REVIEWS_URL % internet_no,
                callback=self.parse_buyer_reviews,
                meta={"product": product},
                dont_filter=True,
            )

        return self._gen_variants_requests(response, product, skus,
                                           internet_no)

    def _gen_variants_requests(self, response, product, skus, internet_no):
        """Return one SKU-details Request per sku, or the product if none."""
        reqs = []
        for sku in skus:
            new_product = product.copy()
            new_meta = response.meta.copy()
            new_meta['product'] = new_product
            new_meta['handle_httpstatus_list'] = [404]
            new_meta['internet_no'] = internet_no
            url = self.DETAILS_URL % sku
            reqs.append(
                Request(url,
                        self._parse_skudetails,
                        meta=new_meta,
                        priority=1000))
        if not reqs:
            return product
        return reqs

    def _gen_payload(self, response):
        """Generates request body. Also maxProducts value can be changed
        for +\- number of values"""
        appid = 'homedepot01'
        critemid = response.xpath(
            "//input[@id='certona_critemId']/@value").extract()
        if not critemid:
            critemid = is_empty(
                re.findall("\"itemId\"\:\"(\d+)\"", response.body))
        if not critemid:
            return
        payload = {
            "appId": appid,
            "products": critemid,
            "maxProducts": "16",
            "certonaSchema": "PIPHorizontal1_rr",
            "sessionId": "41020192309266",
            "trackingId": "252187705102752",
            "storeId": "123",
        }
        return payload

    def _parse_related_products(self, response):
        """Attach Certona recommendations, then continue with reviews/variants."""
        product = response.meta['product']
        internet_no = response.meta.get('internet_no', None)
        if response.status in response.meta['handle_httpstatus_list']:
            # No further pages were found. Check the request payload.
            return product
        data = json.loads(response.body_as_unicode())

        related_prods = []
        for prod in data['schemas'][0]['products']:
            name = prod['productName']
            href = prod['canonicalURL']
            related_prods.append(
                RelatedProduct(name, urlparse.urljoin(product['url'], href)))
        if related_prods:
            if 'THE HOME DEPOT RECOMMENDS' in data['schemas'][0]['title']:
                product['related_products'] = {'recommended': related_prods}
            else:
                product['related_products'] = {
                    'buyers_also_bought': related_prods
                }

        skus = response.meta.get('skus', None)
        if not skus:
            if internet_no:
                return Request(
                    url=self.REVIEWS_URL % internet_no,
                    callback=self.parse_buyer_reviews,
                    meta={"product": product},
                    dont_filter=True,
                )
            return product
        return self._gen_variants_requests(response, product, skus,
                                           internet_no)

    def _parse_skudetails(self, response):
        """Fill price/description/url/image/model from the SKU JSON endpoint."""
        product = response.meta['product']
        try:
            jsdata = json.loads(response.body_as_unicode())
            storeskus = jsdata['storeSkus']
            price = storeskus['storeSku']['pricing']['originalPrice']
            product['price'] = price
            if product.get('price', None):
                if '$' not in product['price']:
                    # FIX: was "'Unknown currency at' % response.url" (TypeError).
                    self.log('Unknown currency at %s' % response.url)
                else:
                    product['price'] = Price(price=product['price'].replace(
                        ',', '').replace('$', '').strip(),
                        priceCurrency='USD')

            desc = jsdata['info']['description']
            product['description'] = desc

            url = jsdata['canonicalURL']
            url = urlparse.urljoin(product['url'], url)
            product['url'] = url

            image = jsdata['inlinePlayerJSON']['IMAGE'][1]['mediaUrl']
            product['image_url'] = image

            attrname = response.meta.get('attributelabel', 'Color/Finish')
            colornames = jsdata['attributeGroups']['group'][0]['entries'][
                'attribute']
            colornames = [
                el['value'] for el in colornames if el['name'] == attrname
            ]
            if colornames:
                product['model'] = str(colornames[0])
        except (ValueError, KeyError, IndexError):
            self.log("Failed to parse SKU details.", DEBUG)

        internet_no = response.meta.get('internet_no', None)
        if internet_no:
            return Request(
                url=self.REVIEWS_URL % internet_no,
                callback=self.parse_buyer_reviews,
                meta={"product": product},
                dont_filter=True,
            )
        return product

    def parse_buyer_reviews(self, response):
        """Attach buyer reviews (with per-star breakdown) to the product."""
        product = response.meta.get("product")
        brs = self.br.parse_buyer_reviews_per_page(response)
        self.br.br_count = brs.get('num_of_reviews', None)
        brs['rating_by_star'] = self.br.get_rating_by_star(response)
        product['buyer_reviews'] = brs
        return product

    def _scrape_total_matches(self, response):
        """Try four page layouts in turn for the total result count."""
        totals = response.xpath("//a[@id='all_products']/label"
                                "/text()").re(r'All Products \(([\d,]+)\)')
        if totals:
            totals = totals[0]
            totals = totals.replace(",", "")
            if is_num(totals):
                return int(totals)

        no_matches = response.xpath(
            "//h1[@class='page-title']/text()").extract()
        if no_matches:
            if 'we could not find any' in no_matches[0] or \
                    'we found 0 matches for' in no_matches[0]:
                return 0

        total_matches = response.xpath(
            '//*[contains(@id, "all_products")]//text()').extract()
        if total_matches:
            total_matches = ''.join(total_matches)
            total_matches = ''.join(c for c in total_matches if c.isdigit())
            if total_matches and total_matches.isdigit():
                return int(total_matches)

        total_matches = response.xpath('//div[@id="allProdCount"]/text()').re(
            FLOATING_POINT_RGEX)
        if total_matches:
            total_matches = total_matches[0]
            total_matches = total_matches.replace(',', '')
            if total_matches.isdigit():
                return int(total_matches)
        return

    def _scrape_product_links(self, response):
        links = response.xpath(
            "//div[contains(@class,'product') "
            "and contains(@class,'plp-grid')]"
            "//descendant::a[contains(@class, 'item_description')]/@href | "
            "//div[contains(@class, 'description')]/a[@data-pod-type='pr']/@href"
        ).extract()
        if not links:
            self.log("Found no product links.", DEBUG)
        for link in links:
            # Skip links already yielded during this crawl.
            if link in self.product_filter:
                continue
            self.product_filter.append(link)
            yield link, SiteProductItem()

    def _scrape_next_results_page_link(self, response):
        next_page = response.xpath(
            "//div[@class='pagination-wrapper']/ul/li/span"
            "/a[@title='Next']/@href |"
            "//div[contains(@class, 'pagination')]/ul/li/span"
            "/a[@class='icon-next']/@href |"
            "//li[contains(@class, 'hd-pagination__item')]"
            "/a[contains(@class, 'pagination__link') and @title='Next']/@href"
        ).extract()
        if next_page:
            return urlparse.urljoin(response.url, next_page[0])
class PepboysProductsSpider(ProductsSpider):
    """Scrapy spider for pepboys.com product/search pages.

    Fixes over the previous revision: ``_parse_buyer_reviews`` built a dict
    from undefined names (``average_rating``, ``rating_by_star``) and
    passed a *list* from ``extract()`` to ``re.findall`` — a guaranteed
    NameError/TypeError whenever the review banner was present — and the
    dict was immediately overwritten anyway. The dead, broken code is
    removed; review parsing is delegated to BuyerReviewsBazaarApi as
    before. ``_parse_price`` no longer uses a bare ``except:``.
    """
    name = 'pepboys_products'
    allowed_domains = ['pepboys.com']

    SEARCH_URL = "http://www.pepboys.com/s?query={search_term}"
    BUYER_REVIEWS_URL = ("https://pepboys.ugc.bazaarvoice.com/8514-en_us"
                         "/{product_id}/reviews.djs?format=embeddedhtml")

    def __init__(self, *args, **kwargs):
        super(PepboysProductsSpider, self).__init__(*args, **kwargs)
        self.br = BuyerReviewsBazaarApi(called_class=self)

    def _total_matches_from_html(self, response):
        """Total result count from the '... of N Results' banner (0 if absent)."""
        total = response.css('.resultCount::text').re('of (\d+) Result')
        return int(total[0].replace(',', '')) if total else 0

    def _scrape_results_per_page(self, response):
        # The site always paginates by 39 items.
        return 39

    def _scrape_next_results_page_link(self, response):
        link = response.xpath('//a[@class="next"]/@href').extract()
        return link[0] if link else None

    def _scrape_product_links(self, response):
        item_urls = response.xpath(
            '//*[@class="product"]/a[1]/@href').extract()
        for item_url in item_urls:
            yield item_url, SiteProductItem()

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _parse_title(self, response):
        title = response.xpath(
            '//h4[contains(@class,"margin-top-none")]//text()').extract()
        title = [r.strip() for r in title if len(r.strip()) > 0]
        title = "".join(title)
        return title.strip() if title else None

    def _parse_categories(self, response):
        categories = response.xpath(
            '//*[@class="breadcrumb"]//li/a/text()').extract()
        return categories if categories else None

    def _parse_category(self, response):
        categories = self._parse_categories(response)
        return categories[-1] if categories else None

    def _parse_price(self, response):
        """Return a Price from the subtotal block, or None when absent."""
        try:
            price = response.xpath(
                '//div[contains(@class,"subtotal")]//span[@class="price"]//text()'
            ).extract()[0].strip()
            price = re.findall(r'[\d\.]+', price)
        except Exception:  # narrow from bare except; extract()[0] may IndexError
            return None
        if not price:
            return None
        return Price(price=price[0], priceCurrency='USD')

    def _parse_image_url(self, response):
        image_url = response.xpath(
            '//img[contains(@class,"tdTireDetailImg")]/@src').extract()
        return image_url[0] if image_url else None

    def _parse_brand(self, response):
        # Brand is hard-coded; the site sells under its own banner here.
        brand = 'Pepboys'
        return brand.strip() if brand else None

    def _parse_sku(self, response):
        sku = response.xpath(
            '//div[contains(@class,"j-results-item-container")]/@data-sku'
        ).extract()
        return sku[0] if sku else None

    def _parse_variants(self, response):
        # This site exposes no variants.
        return None

    def _parse_is_out_of_stock(self, response):
        status = response.xpath(
            '//*[@id="availability"]/span[text()="In Stock"]')
        return not bool(status)

    def _parse_shipping_included(self, response):
        shipping_text = ''.join(
            response.xpath('//span[@class="free-shipping"]//text()').extract())
        return shipping_text == ' & FREE Shipping'

    def _parse_description(self, response):
        description = response.xpath(
            '//div[contains(@class,"tdContentDesc")]').extract()
        return ''.join(description).strip() if description else None

    def _parse_buyer_reviews(self, response):
        """Attach parsed buyer reviews to the product and yield it.

        Delegates entirely to BuyerReviewsBazaarApi; the previous dead code
        that referenced undefined names has been removed (see class
        docstring).
        """
        product = response.meta['product']
        product['buyer_reviews'] = self.br.parse_buyer_reviews_per_page(
            response)
        yield product

    def send_next_request(self, reqs, response):
        """ Helps to handle several requests """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)

    def parse_product(self, response):
        """Populate the product item and queue the buyer-reviews request."""
        reqs = []
        product = response.meta['product']
        response.meta['product_response'] = response

        # Set locale
        product['locale'] = 'en_US'

        # Parse title
        title = self._parse_title(response)
        cond_set_value(product, 'title', title, conv=string.strip)

        # Parse category
        category = self._parse_category(response)
        cond_set_value(product, 'category', category)

        # Parse categories
        categories = self._parse_categories(response)
        cond_set_value(product, 'categories', categories)

        # Parse description
        description = self._parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse price
        price = self._parse_price(response)
        cond_set_value(product, 'price', price)

        # Parse image url
        image_url = self._parse_image_url(response)
        cond_set_value(product, 'image_url', image_url)

        # Parse variants
        variants = self._parse_variants(response)
        cond_set_value(product, 'variants', variants)

        # Parse stock status
        out_of_stock = self._parse_is_out_of_stock(response)
        cond_set_value(product, 'is_out_of_stock', out_of_stock)

        # Sku
        sku = self._parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Brand
        brand = self._parse_brand(response)
        cond_set_value(product, 'brand', brand)

        # Shipping included
        shipping_included = self._parse_shipping_included(response)
        cond_set_value(product, 'shipping_included', shipping_included)

        if sku:
            # Parse buyer reviews
            reqs.append(
                Request(url=self.BUYER_REVIEWS_URL.format(product_id=sku),
                        dont_filter=True,
                        callback=self.br.parse_buyer_reviews))

        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def _get_products(self, response):
        """Yield product items/requests for one results page, filling in
        ranking, totals and per-page bookkeeping in meta.
        """
        remaining = response.meta['remaining']
        search_term = response.meta['search_term']
        prods_per_page = response.meta.get('products_per_page')
        total_matches = response.meta.get('total_matches')
        scraped_results_per_page = response.meta.get(
            'scraped_results_per_page')

        prods = self._scrape_product_links(response)

        if prods_per_page is None:
            # Materialize prods to get its size.
            prods = list(prods)
            prods_per_page = len(prods)
            response.meta['products_per_page'] = prods_per_page

        if scraped_results_per_page is None:
            scraped_results_per_page = self._scrape_results_per_page(response)
            if scraped_results_per_page:
                self.log(
                    "Found %s products at the first page" %
                    scraped_results_per_page, INFO)
            else:
                scraped_results_per_page = prods_per_page
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to scrape number of products per page",
                            ERROR)
            response.meta[
                'scraped_results_per_page'] = scraped_results_per_page

        if total_matches is None:
            total_matches = self._scrape_total_matches(response)
            if total_matches is not None:
                response.meta['total_matches'] = total_matches
                self.log("Found %d total matches." % total_matches, INFO)
            else:
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to parse total matches for %s" %
                            response.url, ERROR)

        if total_matches and not prods_per_page:
            # Parsing the page failed. Give up.
            self.log("Failed to get products for %s" % response.url, ERROR)
            return

        for i, (prod_url, prod_item) in enumerate(islice(prods, 0, remaining)):
            # Initialize the product as much as possible.
            prod_item['site'] = self.site_name
            prod_item['search_term'] = search_term
            prod_item['total_matches'] = total_matches
            prod_item['results_per_page'] = prods_per_page
            prod_item['scraped_results_per_page'] = scraped_results_per_page
            # The ranking is the position in this page plus the number of
            # products from other pages.
            prod_item['ranking'] = (i + 1) + (self.quantity - remaining)
            if self.user_agent_key not in ["desktop", "default"]:
                prod_item['is_mobile_agent'] = True

            if prod_url is None:
                # The product is complete, no need for another request.
                yield prod_item
            elif isinstance(prod_url, Request):
                cond_set_value(prod_item, 'url', prod_url.url)  # Tentative.
                yield prod_url
            else:
                # Another request is necessary to complete the product.
                url = urlparse.urljoin(response.url, prod_url)
                cond_set_value(prod_item, 'url', url)  # Tentative.
                yield Request(
                    url,
                    callback=self.parse_product,
                    meta={'product': prod_item},
                    # Remove Referer field on searchs to make the
                    # website display the breadcrumbs
                    headers={'referer': ''},
                )
class PetcoProductsSpider(ProductsSpider):
    """Search/product spider for petco.com (WebSphere Commerce site).

    Walks the search listing, follows each product tile, and enriches every
    SiteProductItem via chained requests: Bazaar Voice review stats and an
    AJAX POST for the price.
    """

    name = 'petco_products'
    allowed_domains = ['petco.com']

    # First search page (beginIndex=0, fixed 48 results per page).
    SEARCH_URL = ("http://www.petco.com/shop/SearchDisplay?categoryId=&storeId"
                  "=10151&catalogId=10051&langId=-1&sType=SimpleSearch&"
                  "resultCatEntryType=2&showResultsPage=true&searchSource=Q&"
                  "pageView=&beginIndex=0&pageSize=48&fromPageValue=search"
                  "&searchTerm={search_term}")

    # Subsequent pages go through ProductListingView with a begin_index offset.
    SEARCH_URL_2 = ("http://www.petco.com/shop/ProductListingView?searchType="
                    "12&filterTerm=&langId=-1&advancedSearch=&sType=Simple"
                    "Search&resultCatEntryType=2&catalogId=10051&searchTerm="
                    "{search_term}&resultsPerPage=48&emsName=&facet=&category"
                    "Id=&storeId=10151&beginIndex={begin_index}")

    # Bazaar Voice product-stats endpoint.
    REVIEW_URL = ("http://api.bazaarvoice.com/data/products.json?"
                  "passkey=dpaqzblnfzrludzy2s7v27ehz&apiversion=5.5"
                  "&filter=id:{product_id}&stats=reviews")

    # AJAX endpoint returning the catalog-entry price blob.
    PRICE_URL = "http://www.petco.com/shop/GetCatalogEntryDetailsByIDView"

    def __init__(self, *args, **kwargs):
        super(PetcoProductsSpider, self).__init__(*args, **kwargs)
        self.br = BuyerReviewsBazaarApi(called_class=self)
        # Number of product links on the most recent listing page;
        # 0 means "no further pages" for pagination purposes.
        self.product_last_page = 0

    def parse_buyer_reviews(self, response):
        """Attach Bazaar Voice review stats, then continue the request chain."""
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])
        product['buyer_reviews'] = self.br.parse_buyer_reviews_products_json(
            response)
        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product

    def _total_matches_from_html(self, response):
        """Total result count from the search header (0 when absent)."""
        total = response.xpath(
            '//*[contains(@id,"searchTotalCount")]/text()').re('\d+')
        return int(total[0].replace(',', '')) if total else 0

    def _scrape_results_per_page(self, response):
        # Page size is fixed by the pageSize/resultsPerPage URL parameters.
        return 48

    def _scrape_next_results_page_link(self, response):
        """Build the next listing URL, or None at the end of pagination."""
        # End of pagination
        if not self.product_last_page:
            return None
        # FIX: original called .group(1) on a possibly-None match; guard it.
        begin_match = re.search('beginIndex=(\d+)', response.url)
        if not begin_match:
            return None
        begin_index = int(begin_match.group(1))
        num_product_page = self._scrape_results_per_page(response)
        st = response.meta['search_term']
        url = self.url_formatter.format(
            self.SEARCH_URL_2,
            search_term=urllib.quote_plus(st.encode('utf-8')),
            begin_index=str(begin_index + num_product_page))
        return url

    def _scrape_product_links(self, response):
        """Yield (url, empty item) for each product tile on the page."""
        item_urls = response.xpath(
            '//*[@class="product-display-grid"]'
            '//*[@class="product-name"]/a/@href').extract()
        self.product_last_page = len(item_urls)
        for item_url in item_urls:
            yield item_url, SiteProductItem()

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _parse_title(self, response):
        title = response.xpath('//h1/text()').extract()
        return title[0].strip() if title else None

    def _parse_categories(self, response):
        categories = response.css('.breadcrumb a::text').extract()
        return categories if categories else None

    def _parse_category(self, response):
        categories = self._parse_categories(response)
        return categories[-1] if categories else None

    def _parse_image_url(self, response):
        image_url = response.xpath(
            '//*[@property="og:image"]/@content').extract()
        return image_url[0] if image_url else None

    def _parse_brand(self, response):
        brand = response.xpath('//*[@class="product-brand"]/a/text()').re(
            'by.(.*)')
        return brand[0].strip() if brand else None

    def _parse_sku(self, response):
        """SKU from the hidden input, falling back to the visible label."""
        sku = response.xpath("//input[@id='primarySku']/@value").extract()
        # FIX: original indexed sku[0] unconditionally and raised IndexError
        # when the hidden input was missing from the page.
        if not sku or len(sku[0]) < 1:
            sku = response.css('.product-sku::text').re(u'SKU:.(\d+)')
        return sku[0] if sku else None

    def _parse_variants(self, response):
        """Variant dicts from the inline entitledItem_* JSON (None if absent)."""
        variants = []
        try:
            variants_info = json.loads(
                response.xpath(
                    '//*[contains(@id,"entitledItem_")]/text()').extract()[0])
        except (IndexError, ValueError, TypeError):
            # Missing script block or unparsable JSON — no variants.
            variants_info = {}
        for attr_value in variants_info:
            attributes = {}
            variant_attribute = attr_value["Attributes"]
            attributes['price'] = attr_value["RepeatDeliveryPrice"]["price"]
            attributes['image_url'] = attr_value["ItemImage"]
            if variant_attribute:
                # Keys look like "<name>_<value>", e.g. "size_Small".
                attr_text = attr_value["Attributes"].keys()[0].split('_')
                attributes[attr_text[0]] = attr_text[1]
            variants.append(attributes)
        return variants if variants else None

    def _parse_is_out_of_stock(self, response):
        # In stock iff the availability microdata says "in_stock".
        status = response.xpath(
            '//*[@itemprop="availability" and @content="in_stock"]')
        return not bool(status)

    def _parse_shipping_included(self, response):
        pass

    def _parse_description(self, response):
        description = response.xpath('//*[@id="description"]').extract()
        return ''.join(description).strip() if description else None

    def send_next_request(self, reqs, response):
        """ Helps to handle several requests """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)

    def parse_product(self, response):
        """Populate the product item, chaining review and price requests."""
        reqs = []
        product = response.meta['product']

        # Set locale
        product['locale'] = 'en_US'

        # Parse title
        title = self._parse_title(response)
        cond_set_value(product, 'title', title, conv=string.strip)

        # Parse category
        category = self._parse_category(response)
        cond_set_value(product, 'category', category)

        # Parse categories
        categories = self._parse_categories(response)
        cond_set_value(product, 'categories', categories)

        # Parse description
        description = self._parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse image url
        image_url = self._parse_image_url(response)
        cond_set_value(product, 'image_url', image_url)

        # Parse variants
        variants = self._parse_variants(response)
        cond_set_value(product, 'variants', variants)

        # Parse stock status
        out_of_stock = self._parse_is_out_of_stock(response)
        cond_set_value(product, 'is_out_of_stock', out_of_stock)

        # Sku
        sku = self._parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Reseller_id (same value as the sku on this site)
        cond_set_value(product, 'reseller_id', sku)

        # Brand
        brand = self._parse_brand(response)
        cond_set_value(product, 'brand', brand)

        product_id = response.xpath(
            '//*[@id="productPartNo"]/@value').extract()
        if product_id:
            reqs.append(
                Request(url=self.REVIEW_URL.format(product_id=product_id[0],
                                                   index=0),
                        dont_filter=True,
                        callback=self.parse_buyer_reviews,
                        meta=response.meta))

        price_id = response.xpath('//*[contains(@id,"entitledItem_")]/@id').re(
            'entitledItem_(\d+)')
        cat_id = response.xpath('//script/text()').re(
            'productDisplayJS.displayAttributeInfo\("(\d+)","(\d+)"')
        if not cat_id:
            cat_id = response.xpath(
                '//*[@name="firstAvailableSkuCatentryId_avl"]/@value').extract()
        if price_id and cat_id:
            # POST the catalog-entry/product ids to fetch the price blob.
            text = ("storeId=10151&langId=-1&catalogId=10051&"
                    "catalogEntryId={cat}&productId={prod_id}".format(
                        cat=cat_id[0], prod_id=price_id[0]))
            reqs.append(
                Request(self.PRICE_URL,
                        body=text,
                        headers={
                            'Content-Type':
                                'application/x-www-form-urlencoded',
                            'X-Requested-With': 'XMLHttpRequest'
                        },
                        method='POST',
                        meta=response.meta,
                        callback=self._parse_price,
                        dont_filter=True))
        else:
            prices = map(
                float,
                response.xpath(
                    '//*[@class="product-price"]//span/text()').re(
                        '\$([\d\.]+)'))
            # FIX: min() of an empty list raised ValueError when no price
            # span was present; leave price unset instead.
            if prices:
                product['price'] = Price(price=min(prices),
                                         priceCurrency="USD")

        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def _parse_price(self, response):
        """Price from the GetCatalogEntryDetailsByIDView AJAX response."""
        reqs = response.meta.get('reqs', [])
        product = response.meta['product']
        raw_information = re.findall('\{.*\}', response.body,
                                     re.MULTILINE | re.DOTALL)[0]
        # SECURITY NOTE: eval() of an external response body is dangerous;
        # kept for behaviour compatibility, but this should be replaced with
        # a proper JSON/JS parser.
        product_data = eval(raw_information)
        price = product_data["catalogEntry"]["offerPrice"]
        product['price'] = Price(price=price, priceCurrency="USD")
        if reqs:
            return self.send_next_request(reqs, response)
        return product
class NikeProductSpider(BaseProductsSpider):
    """Spider for nike.com.

    The search results page is rendered by JavaScript, so the SERP is driven
    through a real Firefox instance (selenium + virtual display) configured
    to go through the Crawlera proxy; the product pages collected that way
    are then fetched with plain scrapy requests.
    """

    name = 'nike_products'
    allowed_domains = ["nike.com"]
    SEARCH_URL = "http://nike.com/#{search_term}"
    # Bazaar Voice embedded-review endpoint, keyed by the product style number.
    REVIEW_URL = "http://nike.ugc.bazaarvoice.com/9191-en_us/{product_model}" \
                 "/reviews.djs?format=embeddedhtml"
    #handle_httpstatus_list = [404, 403, 429]
    use_proxies = False  # we'll be using Crawlera instead

    def __init__(self, sort_mode=None, *args, **kwargs):
        # FIFO queues give breadth-first crawl ordering.
        from scrapy.conf import settings
        settings.overrides['DEPTH_PRIORITY'] = 1
        settings.overrides[
            'SCHEDULER_DISK_QUEUE'] = 'scrapy.squeue.PickleFifoDiskQueue'
        settings.overrides[
            'SCHEDULER_MEMORY_QUEUE'] = 'scrapy.squeue.FifoMemoryQueue'
        settings.overrides['CRAWLERA_ENABLED'] = True
        self.quantity = kwargs.get('quantity', 1000)  # default is 1000
        # Proxy also used by the selenium-driven Firefox (see _init_firefox).
        self.proxy = 'content.crawlera.com:8010'
        self.proxy_type = 'http'
        #self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
        self.user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'
        self.br = BuyerReviewsBazaarApi(called_class=self)
        super(NikeProductSpider, self).__init__(
            site_name=self.allowed_domains[0], *args, **kwargs)

    @staticmethod
    def _get_antiban_headers():
        # Browser-like headers to reduce the chance of being blocked.
        return {
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0',
            'Connection': 'keep-alive',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate'
        }

    def start_requests(self):
        """One request per search term, plus the single product url if given."""
        for st in self.searchterms:
            yield Request(self.url_formatter.format(
                self.SEARCH_URL,
                search_term=st.encode('utf-8'),
            ),
                          meta={
                              'search_term': st,
                              'remaining': self.quantity
                          },
                          headers=self._get_antiban_headers())
        if self.product_url:
            prod = SiteProductItem()
            prod['is_single_result'] = True
            meta = {}
            meta['is_product_page'] = True
            meta['product'] = prod
            yield Request(self.product_url,
                          self._parse_single_product,
                          meta=meta,
                          headers=self._get_antiban_headers())

    def _init_firefox(self):
        """Build a Firefox webdriver routed through the configured proxy."""
        from selenium import webdriver
        from selenium.webdriver.remote.remote_connection import RemoteConnection
        RemoteConnection.set_timeout(30)
        profile = webdriver.FirefoxProfile()
        profile.set_preference("general.useragent.override", self.user_agent)
        profile.set_preference('intl.accept_languages', 'en-US')
        profile.set_preference("network.proxy.type", 1)  # manual proxy configuration
        # Disable image loading to speed up page rendering.
        profile.set_preference('permissions.default.image', 2)
        if self.proxy:
            profile.set_preference("network.http.phishy-userpass-length", 255)
            if 'socks' in self.proxy_type:
                profile.set_preference("network.proxy.socks",
                                       self.proxy.split(':')[0])
                profile.set_preference("network.proxy.socks_port",
                                       int(self.proxy.split(':')[1]))
            else:
                profile.set_preference("network.proxy.http",
                                       self.proxy.split(':')[0])
                profile.set_preference("network.proxy.http_port",
                                       int(self.proxy.split(':')[1]))
        profile.update_preferences()
        driver = webdriver.Firefox(profile)
        driver.set_window_size(1280, 1024)
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        return driver

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _get_product_links_from_serp(self, driver):
        """Collect absolute product-page urls from the rendered SERP."""
        results = []
        links = driver.find_elements_by_xpath(
            '//*[contains(@class, "grid-item-image")]'
            '//a[contains(@href, "/pd/") or contains(@href, "/product/")]')
        for l in links:
            href = l.get_attribute('href')
            if href:
                if not href.startswith('http'):
                    href = urlparse.urljoin(
                        'http://' + self.allowed_domains[0], href)
                results.append(href)
        return results

    def _is_product_page(self, response):
        return 'is_product_page' in response.meta

    @staticmethod
    def _get_proxy_ip(driver):
        # Sanity check: the IP address the proxy exposes externally.
        driver.get('http://icanhazip.com')
        ip = re.search('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
                       driver.page_source)
        if ip:
            ip = ip.group(1)
        return ip

    @staticmethod
    def _auth_firefox_proxy(driver):
        """Answer the proxy's auth dialog with the Crawlera API key."""
        driver.set_page_load_timeout(10)
        try:
            driver.get('http://icanhazip.com')
        except:
            # Page load timed out because of the auth dialog — type the key
            # into the alert and accept it.
            from selenium.webdriver.common.alert import Alert
            time.sleep(3)
            alert = Alert(driver)
            time.sleep(3)
            #alert.authenticate(CRAWLERA_APIKEY, '')
            alert.send_keys(CRAWLERA_APIKEY + '\n')
            alert.accept()
            #alert.send_keys('\t')
            #alert.send_keys('\n')
            #import pdb; pdb.set_trace()
        driver.set_page_load_timeout(30)

    @staticmethod
    def last_five_digits_the_same(lst):
        """True if the last five collected counts are all equal (None if <6)."""
        if len(lst) < 6:
            return
        return lst[-1] == lst[-2] == lst[-3] == lst[-4] == lst[-5]

    def _reliable_get(self, driver, url, max_attempts=40,
                      check_element='title'):
        """ Acts like driver.get() but with failsafety """
        driver.set_page_load_timeout(30)
        driver.set_script_timeout(30)
        for i in range(max_attempts):
            try:
                driver.get(url)
                # Consider the load successful once check_element exists.
                if driver.find_elements_by_xpath('//%s' % check_element):
                    return driver
            except:
                self.log('_reliable_get error #%i while getting url %s'
                         % (i, url))
        self.log('_reliable_get failed to get url %s' % url, ERROR)

    def parse(self, response):
        """SERP entry point: drive Firefox, scroll, collect product links."""
        if not self._is_product_page(response):
            display = Display(visible=0, size=(1024, 768))
            display.start()
            product_links = []
            # scrape "quantity" products
            driver = self._init_firefox()
            self._auth_firefox_proxy(driver)
            if self.proxy:
                ip_via_proxy = NikeProductSpider._get_proxy_ip(driver)
                print 'IP via proxy:', ip_via_proxy
                self.log('IP via proxy: %s' % ip_via_proxy)
            try:
                self._reliable_get(driver, 'http://store.nike.com/us/en_us')
            except Exception as e:
                print(str(e))
                self.log(str(e), WARNING)
            driver.find_element_by_name('searchList').send_keys(
                self.searchterms[0] + '\n')
            time.sleep(6)  # let AJAX finish
            new_meta = response.meta.copy()
            # get all products we need (scroll down)
            collected_products_len = []
            num_exceptions = 0
            while 1:
                try:
                    driver.execute_script("scrollTo(0,50000)")
                    time.sleep(10)
                    product_links = self._get_product_links_from_serp(driver)
                    collected_products_len.append(len(product_links))
                    print 'Collected %i product links' % len(product_links)
                    self.log('Collected %i product links'
                             % len(product_links))
                    if len(product_links) > self.quantity:
                        break
                    if self.last_five_digits_the_same(
                            collected_products_len):
                        # last five iterations collected equal num of products
                        break
                except Exception as e:
                    print str(e)
                    self.log('Exception while scrolling page: %s' % str(e),
                             WARNING)
                    num_exceptions += 1
                    if num_exceptions > 10:
                        self.log('Maximum number of exceptions reached',
                                 ERROR)
                        driver.quit()
                        display.stop()
                        return
            # Cookies sometimes fail to serialize right after scrolling;
            # retry a few times.
            for i in xrange(10):
                time.sleep(3)
                try:
                    selenium_cookies = driver.get_cookies()
                    break
                except Exception as e:
                    print('Exception while loading cookies %s attempt %i'
                          % (str(e), i))
                    self.log('Exception while loading cookies %s attempt %i'
                             % (str(e), i))
            try:
                driver.quit()
                display.stop()
            except:
                pass
            #driver.save_screenshot('/tmp/1.png')
            new_meta['is_product_page'] = True
            new_meta['proxy'] = self.proxy
            for i, product_link in enumerate(product_links):
                new_meta['_ranking'] = i + 1
                yield Request(product_link,
                              meta=new_meta,
                              callback=self.parse_product,
                              headers=self._get_antiban_headers())
                              #cookies=selenium_cookies)

    def parse_product(self, response):
        """Fill the product item from the product page's embedded JSON."""
        meta = response.meta.copy()
        product = meta.get('product', SiteProductItem())
        product['_subitem'] = True
        _ranking = response.meta.get('_ranking', None)
        product['ranking'] = _ranking
        product['url'] = response.url
        product['search_term'] = response.meta.get('search_term', None)
        # product data in json
        js_data = self.parse_data(response)
        # product id
        product_id = self.parse_product_id(response, js_data)
        product_color = self.parse_product_color(response, js_data)
        product_price = 0
        # Parse product_id
        title = self.parse_title(response, js_data)
        cond_set_value(product, 'title', title)
        # A page without a title is not a usable product page — bail out.
        if not product.get('title', None):
            return
        # Parse locate
        locale = 'en_US'
        cond_set_value(product, 'locale', locale)
        # Parse model
        product_model = self.parse_product_model(response)
        cond_set_value(product, 'model', product_model)
        # Parse image
        image = self.parse_image(response, js_data)
        cond_set_value(product, 'image_url', image)
        # Parse reseller_id
        reseller_id = self.parse_reseller_id(response)
        cond_set_value(product, "reseller_id", reseller_id)
        # Parse brand
        # brand = self.parse_brand(response)
        # cond_set_value(product, 'brand', brand)
        # Parse upc
        # upc = self.parse_upc(response)
        # cond_set_value(product, 'upc', upc)
        # Parse sku
        sku = self.parse_sku(response)
        cond_set_value(product, 'sku', sku)
        # Parse description
        description = self.parse_description(response)
        cond_set(product, 'description', description)
        # Parse price
        price = self.parse_price(response, js_data)
        cond_set_value(product, 'price', price)
        # Parse variants
        nv = NikeVariants()
        nv.setupSC(response)
        try:
            product['variants'] = nv._variants()
        except:
            # "/product/" urls that are non-standard and not supported (yet)?
            pass
        meta['product'] = product
        # parse buyer reviews
        yield Request(url=self.REVIEW_URL.format(product_model=product_model),
                      dont_filter=True,
                      callback=self.parse_buyer_reviews,
                      meta=meta)
        yield product

    def parse_reseller_id(self, response):
        # Reseller id is the numeric pid embedded in the url path.
        regex = "\/pid-(\d+)"
        reseller_id = re.findall(regex, response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        return reseller_id

    def parse_count_reviews(self, response):
        count_review = response.xpath(
            '//meta[contains(@itemprop, "reviewCount")]/@content').extract()
        if count_review:
            return int(count_review[0])
        else:
            return 0

    def parse_data(self, response):
        """Decode the inline 'product-data' JSON script block (None on failure)."""
        script_data = response.xpath(
            '//script[contains(@id, "product-data")]/text()').extract()
        try:
            js_data = json.loads(script_data[0])
            return js_data
        except:
            return

    def parse_image(self, response, js_data):
        if js_data:
            try:
                image = js_data['imagesHeroLarge'][0]
                return image
            except:
                return

    def parse_description(self, response):
        # js_data['content']
        desc = response.xpath(
            '//div[contains(@class, "pi-pdpmainbody")]').extract()
        return desc

    def parse_sku(self, response):
        skuid = response.xpath(
            '//span[contains(@class, "exp-style-color")]/text()').extract()
        if skuid:
            return skuid[0].replace('Style: ', '')

    def parse_price(self, response, js_data):
        """Price from the JSON blob, falling back to the og:price meta tag."""
        if js_data:
            try:
                currency = js_data['crossSellConfiguration']['currency']
            except KeyError:
                currency = "USD"
            try:
                price = js_data['rawPrice']
                self.product_price = price
            except KeyError:
                price = 0.00
            if price and currency:
                price = Price(price=price, priceCurrency=currency)
        else:
            price_og = re.search(
                '<meta property="og:price:amount" content="([\d\.]+)" />',
                response.body_as_unicode())
            if price_og:
                return Price(price=float(price_og.group(1)),
                             priceCurrency="USD")
            price = Price(price=0.00, priceCurrency="USD")
        return price

    def _scrape_total_matches(self, response):
        totals = response.css('.productCount ::text').extract()
        if totals:
            totals = totals[0].replace(',', '').replace('.', '').strip()
            if totals.isdigit():
                # Cache the first total seen for later pagination decisions.
                if not self.TOTAL_MATCHES:
                    self.TOTAL_MATCHES = int(totals)
                return int(totals)

    def _scrape_product_links(self, response):
        for link in response.xpath(
                '//li[contains(@class, "product-tile")]'
                '//a[contains(@rel, "product")]/@href').extract():
            yield link, SiteProductItem()

    def parse_product_id(self, response, js_data):
        if js_data:
            try:
                product_id = js_data['productId']
                return product_id
            except:
                return

    def parse_product_model(self, response):
        # Style number used as the Bazaar Voice review key.
        model = response.xpath(
            '//div[contains(@class, "hero-product-style-color-info")]'
            '/@data-stylenumber').extract()
        return model[0] if model else None

    def parse_product_color(self, response, js_data):
        if js_data:
            try:
                product_color = js_data['colorDescription']
                return product_color
            except:
                return

    def parse_title(self, response, js_data):
        if js_data:
            try:
                title = js_data['productTitle']
                return title
            except:
                return

    def parse_buyer_reviews(self, response):
        buyer_reviews_per_page = self.br.parse_buyer_reviews_per_page(response)
        product = response.meta['product']
        product['buyer_reviews'] = BuyerReviews(**buyer_reviews_per_page)
        yield product
def __init__(self, *args, **kwargs):
    # NOTE(review): this is a module-level duplicate of
    # PetcoProductsSpider.__init__ — presumably a stray copy left behind
    # during editing; confirm it is unused and remove it.
    super(PetcoProductsSpider, self).__init__(*args, **kwargs)
    # Bazaar Voice review-API helper bound to this spider instance.
    self.br = BuyerReviewsBazaarApi(called_class=self)
    # Number of product links on the most recent listing page (0 = none yet).
    self.product_last_page = 0
class TopshopProductsSpider(ProductsSpider):
    """Search/product spider for us.topshop.com.

    Buyer reviews come from Bazaar Voice's embedded-HTML .djs endpoint,
    which is parsed with lxml after unescaping.
    """

    name = 'topshop_products'
    allowed_domains = ['topshop.com']
    SEARCH_URL = "http://us.topshop.com/webapp/wcs/stores/servlet/CatalogNavigationSearchResultCmd?" \
                 "langId=-1&storeId=13052&catalogId=33060&Dy=1&Nty=1&beginIndex=1&pageNum=1&Ntt={search_term}"
    _REVIEWS_URL = 'http://topshop.ugc.bazaarvoice.com/6025-en_us/{sku}/reviews.djs?format=embeddedhtml'

    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)
        super(TopshopProductsSpider, self).__init__(
            site_name=self.allowed_domains[0], *args, **kwargs)

    def _total_matches_from_html(self, response):
        total = response.xpath('(//*[@class="pager"]//*[@class="amount"]'
                               '/text())[1]').re('of (\d+)')
        return int(total[0]) if total else 0

    def _scrape_results_per_page(self, response):
        results_per_page = response.xpath(
            '//*[@class="limiter"]//option[@selected]/text()').re('\d+')
        return int(results_per_page[0]) if results_per_page else 0

    def _scrape_next_results_page_link(self, response):
        link = response.xpath('//a[@title="Next"]/@href').extract()
        return link[0] if link else None

    def _scrape_product_links(self, response):
        item_urls = response.xpath(
            '//*[@class="product-name"]/a/@href').extract()
        for item_url in item_urls:
            yield item_url, SiteProductItem()

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _parse_title(self, response):
        title = response.xpath('//*[@itemprop="name"]/text()').extract()
        return title[0] if title else None

    def _parse_category(self, response):
        categories = response.xpath(
            '//*[@id="nav_breadcrumb"]//li//a//span//text()').extract()
        return categories[-1] if categories else None

    def _parse_price(self, response):
        """Price from the product-details block (None when not found)."""
        price = response.xpath(
            '//div[contains(@class,"product_details")]'
            '//div[contains(@class,"product_prices")]//span//text()').extract()
        # NOTE(review): reconstructed nesting — the original source was
        # ambiguous; this keeps the common path (>=2 price strings) intact
        # and returns None otherwise instead of crashing on a list.
        if len(price) > 1:
            price = price[1]
            currency = 'USD' if "$" in price else ''
            price = re.findall(r'[\d\.]+', price)
            if len(price) == 0:
                return None
            return Price(price=price[0], priceCurrency=currency)
        return None

    def _parse_image_url(self, response):
        image_url = response.xpath(
            '//ul[contains(@class,"product_hero__wrapper")]'
            '//a[contains(@class,"hero_image_link")]//img/@src').extract()
        return image_url[0] if image_url else None

    def _parse_variants(self, response):
        return None

    def _parse_is_out_of_stock(self, response):
        status = response.xpath('//*[@itemprop="availability" '
                                'and not(@href="http://schema.org/InStock")]')
        return bool(status)

    def _parse_description(self, response):
        description = response.xpath(
            '//div[@id="productInfo"]//p//text()').extract()
        return ''.join(description).strip() if description else None

    def clear_text(self, str_result):
        return str_result.replace("\t", "").replace("\n", "").replace(
            "\r", "").strip()

    # FIX: the class originally defined _parse_buyer_reviews twice; the
    # first (delegating to BuyerReviewsBazaarApi) was dead code shadowed by
    # this one and has been removed.
    def _parse_buyer_reviews(self, response):
        """Aggregate review stats from the embedded Bazaar Voice markup."""
        product = response.meta['product']
        reqs = response.meta.get('reqs', [])
        # FIX: original called .group(1) on a possibly-None match and read
        # the private response._body attribute.
        match = re.search('BVRRRatingSummarySourceID":"(.+?)\},',
                          response.body)
        if not match:
            product["buyer_reviews"] = ZERO_REVIEWS_VALUE
            if reqs:
                return self.send_next_request(reqs, response)
            return product
        content = match.group(1).replace('\\"', '"').replace("\\/", "/")
        review_html = html.fromstring(content)
        # Overall average rating, e.g. title "4.5 out of 5".
        overall = review_html.xpath(
            '//div[contains(@class,"BVRRQuickTakeSection")]'
            '//div[contains(@class,"BVRRRatingOverall")]'
            '//img[contains(@class,"BVImgOrSprite")]/@title')
        if overall:
            average_rating = float(overall[0].strip().split(" ")[0])
        else:
            average_rating = 0.0
        # One rating value per displayed review.
        ratings = review_html.xpath(
            '//div[contains(@class,"BVRRReviewDisplayStyle5")]'
            '//div[contains(@class,"BVRRReviewDisplayStyle5Header")]'
            '//span[@itemprop="ratingValue"]//text()')
        rating_by_star = dict(
            (star, ratings.count(str(star))) for star in range(1, 6))
        num_of_reviews = sum(rating_by_star.values())
        if average_rating and num_of_reviews:
            product["buyer_reviews"] = BuyerReviews(
                num_of_reviews=int(num_of_reviews),
                average_rating=float(average_rating),
                rating_by_star=rating_by_star,
            )
        else:
            product["buyer_reviews"] = ZERO_REVIEWS_VALUE
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def _parse_last_buyer_date(self, response):
        last_review_date = response.xpath(
            '//*[contains(@class,"box-reviews")]'
            '//*[@class="date"]/text()').re('Posted on (.*)\)')
        return last_review_date[0] if last_review_date else None

    def _parse_sku(self, response):
        sku = response.xpath(
            '//div[@id="productInfo"]'
            '//li[contains(@class,"product_code")]//span//text()').extract()
        return sku[0] if sku else None

    def _parse_brand(self, response, title):
        brand = response.xpath('.//*[contains(text(), "Shop all")]/text()').re(
            r'Shop\sall\s+(\S+)\s?')
        brand = brand[0].strip() if brand else None
        if not brand:
            try:
                brand = guess_brand_from_first_words(title)
            except:
                brand = None
        return brand

    def parse_product(self, response):
        """Populate the product item and chain the reviews request."""
        reqs = []
        meta = response.meta.copy()
        product = meta['product']

        # Set locale
        product['locale'] = 'en_US'

        # Parse title
        title = self._parse_title(response)
        cond_set_value(product, 'title', title, conv=string.strip)

        # Parse category
        category = self._parse_category(response)
        cond_set_value(product, 'category', category)

        # Parse description
        description = self._parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse price
        price = self._parse_price(response)
        cond_set_value(product, 'price', price)

        # Parse sku
        sku = self._parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Parse image url
        image_url = self._parse_image_url(response)
        cond_set_value(product, 'image_url', image_url)

        # Parse variants
        variants = self._parse_variants(response)
        cond_set_value(product, 'variants', variants)

        # Parse stock status
        out_of_stock = self._parse_is_out_of_stock(response)
        cond_set_value(product, 'is_out_of_stock', out_of_stock)

        # Parse brand
        brand = self._parse_brand(response, product.get('title'))
        cond_set_value(product, 'brand', brand)

        # Parse last buyer review date
        last_buyer_date = self._parse_last_buyer_date(response)
        cond_set_value(product, 'last_buyer_review_date', last_buyer_date)

        # Parse reviews
        reqs.append(
            Request(url=self._REVIEWS_URL.format(sku=product['sku']),
                    dont_filter=True,
                    callback=self._parse_buyer_reviews,
                    meta=meta))
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def send_next_request(self, reqs, response):
        """ Helps to handle several requests """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)
class RiteAidProductsSpider(ProductsSpider):
    """Search/product spider for shop.riteaid.com (Magento storefront).

    Buyer reviews come from the Bazaar Voice JSON API.
    """

    name = 'riteaid_products'
    allowed_domains = ['riteaid.com']
    SEARCH_URL = "https://shop.riteaid.com/catalogsearch/result/"\
                 "?limit=72&q={search_term}"
    _REVIEWS_URL = 'http://api.bazaarvoice.com/data/reviews.json?apiversion=5.5&passkey=tezax0lg4cxakub5hhurfey5o&' \
                   'Filter=ProductId:{sku}&Include=Products&Stats=Reviews'

    def __init__(self, *args, **kwargs):
        self.br = BuyerReviewsBazaarApi(called_class=self)
        super(RiteAidProductsSpider, self).__init__(
            site_name=self.allowed_domains[0], *args, **kwargs)

    def _total_matches_from_html(self, response):
        total = response.xpath(
            '(//*[@class="pager"]//*[@class="amount"]'
            '/text())[1]').re('of (\d+)')
        return int(total[0]) if total else 0

    def _scrape_results_per_page(self, response):
        results_per_page = response.xpath(
            '//*[@class="limiter"]//option[@selected]/text()').re('\d+')
        return int(results_per_page[0]) if results_per_page else 0

    def _scrape_next_results_page_link(self, response):
        link = response.xpath('//a[@title="Next"]/@href').extract()
        return link[0] if link else None

    def _scrape_product_links(self, response):
        item_urls = response.xpath(
            '//*[@class="product-name"]/a/@href').extract()
        for item_url in item_urls:
            yield item_url, SiteProductItem()

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def _parse_title(self, response):
        title = response.xpath('//*[@itemprop="name"]/text()').extract()
        return title[0] if title else None

    def _parse_category(self, response):
        # Breadcrumb entries, skipping the leading "Home" link.
        categories = response.xpath(
            '(//a[@property="v:title"]/text())[position()>1]').extract()
        return categories[-1] if categories else None

    def _parse_price(self, response):
        price = response.xpath('//*[@itemprop="price"]/text()').re('[\d\.]+')
        currency = response.xpath(
            '//*[@itemprop="priceCurrency"]/@content').re('\w{2,3}') or ['USD']
        if not price:
            return None
        return Price(price=price[0], priceCurrency=currency[0])

    def _parse_image_url(self, response):
        image_url = response.xpath('//*[@itemprop="image"]/@src').extract()
        return image_url[0] if image_url else None

    def _parse_variants(self, response):
        return None

    def _parse_is_out_of_stock(self, response):
        status = response.xpath(
            '//*[@itemprop="availability" '
            'and not(@href="http://schema.org/InStock")]')
        return bool(status)

    def _parse_description(self, response):
        description = response.xpath(
            '(//*[@id="collateral-tabs"]//*[@class="tab-container"])[1]'
            '//*[self::p or self::ul or self::table] | '
            '(//*[@id="collateral-tabs"]//*[@class="tab-container"])[1]'
            '//*[@class="std"]/text()').extract()
        return ''.join(description).strip() if description else None

    # FIX: the class originally defined _parse_buyer_reviews twice; the
    # first (delegating to BuyerReviewsBazaarApi) was dead code shadowed by
    # this one and has been removed.
    def _parse_buyer_reviews(self, response):
        """Build BuyerReviews from the Bazaar Voice JSON response."""
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])
        buyer_reviews = {}
        sku = product.get('sku')
        if not product.get('buyer_reviews'):
            try:
                contents = json.loads(response.body_as_unicode())
            except ValueError:
                # Non-JSON body — treat as no reviews rather than crash.
                contents = {}
            incl = contents.get('Includes')
            # FIX: original chained .get('Products').get(sku), which raised
            # AttributeError when the 'Products' key was missing.
            brs = incl.get('Products', {}).get(sku) if incl else None
            if brs:
                # Per-star histogram, with missing stars filled in as 0.
                by_star = {}
                for d in brs['ReviewStatistics']['RatingDistribution']:
                    by_star[str(d['RatingValue'])] = d['Count']
                for sc in range(1, 6):
                    if str(sc) not in by_star:
                        by_star[str(sc)] = 0
                buyer_reviews['rating_by_star'] = by_star
                review_count = brs['ReviewStatistics']['TotalReviewCount']
                if review_count == 0:
                    product['buyer_reviews'] = ZERO_REVIEWS_VALUE
                    return product
                buyer_reviews['num_of_reviews'] = review_count
                average_review = \
                    brs['ReviewStatistics']['AverageOverallRating']
                average_review = float(format(average_review, '.2f'))
                buyer_reviews['average_rating'] = average_review
                product['buyer_reviews'] = BuyerReviews(**buyer_reviews)
            else:
                product['buyer_reviews'] = ZERO_REVIEWS_VALUE
        if not product.get('buyer_reviews'):
            product['buyer_reviews'] = ZERO_REVIEWS_VALUE
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def _parse_last_buyer_date(self, response):
        last_review_date = response.xpath(
            '//*[contains(@class,"box-reviews")]'
            '//*[@class="date"]/text()').re('Posted on (.*)\)')
        return last_review_date[0] if last_review_date else None

    def _parse_sku(self, response):
        sku = response.xpath('.//*[@itemprop="sku"]/@content').extract()
        return sku[0] if sku else None

    def _parse_brand(self, response, title):
        brand = response.xpath(
            './/*[contains(text(), "Shop all")]/text()').re(
                r'Shop\sall\s+(\S+)\s?')
        brand = brand[0].strip() if brand else None
        # Guard: guessing from the title requires a title to exist.
        if not brand and title:
            brand = guess_brand_from_first_words(title)
        return brand

    def parse_product(self, response):
        """Populate the product item and chain the reviews request."""
        reqs = []
        meta = response.meta.copy()
        product = meta['product']

        # Set locale
        product['locale'] = 'en_US'

        # Parse title
        title = self._parse_title(response)
        cond_set_value(product, 'title', title, conv=string.strip)

        # Parse category
        category = self._parse_category(response)
        cond_set_value(product, 'category', category)

        # Parse description
        description = self._parse_description(response)
        cond_set_value(product, 'description', description)

        # Parse price
        price = self._parse_price(response)
        cond_set_value(product, 'price', price)

        # Parse sku
        sku = self._parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Parse reseller_id
        # FIX: original referenced the undefined name `reseller_id`
        # (NameError at runtime); the reseller id is the sku, matching the
        # sibling spiders in this module.
        cond_set_value(product, 'reseller_id', sku)

        # Parse image url
        image_url = self._parse_image_url(response)
        cond_set_value(product, 'image_url', image_url)

        # Parse variants
        variants = self._parse_variants(response)
        cond_set_value(product, 'variants', variants)

        # Parse stock status
        out_of_stock = self._parse_is_out_of_stock(response)
        cond_set_value(product, 'is_out_of_stock', out_of_stock)

        # Parse brand
        brand = self._parse_brand(response, product.get('title'))
        cond_set_value(product, 'brand', brand)

        # Parse last buyer review date
        last_buyer_date = self._parse_last_buyer_date(response)
        cond_set_value(product, 'last_buyer_review_date', last_buyer_date)

        # Parse reviews
        reqs.append(
            Request(
                url=self._REVIEWS_URL.format(sku=product['sku']),
                dont_filter=True,
                callback=self._parse_buyer_reviews,
                meta=meta
            ))
        if reqs:
            return self.send_next_request(reqs, response)
        return product

    def send_next_request(self, reqs, response):
        """ Helps to handle several requests """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)
class OrientaltradingProductsSpider(BaseProductsSpider):
    """Spider for orientaltrading.com search results and product pages.

    Pagination is offset-based (``No=``/``Nrpp=`` query parameters, 64
    products per page); buyer reviews come from a separate Bazaarvoice
    endpoint fetched as a chained follow-up request after the product
    page itself is parsed.
    """

    name = 'orientaltrading_products'
    allowed_domains = ['orientaltrading.com', "www.orientaltrading.com"]
    start_urls = []

    SEARCH_URL = "http://www.orientaltrading.com/web/search/searchMain?Ntt={search_term}"

    PAGINATE_URL = "http://www.orientaltrading.com/web/search/searchMain?Nrpp=64&No={nao}&Ntt={search_term}"

    # NOTE(review): these are class attributes mutated at runtime, so
    # pagination state is shared across all instances of this spider.
    CURRENT_NAO = 0
    PAGINATE_BY = 64  # 64 products
    TOTAL_MATCHES = None  # for pagination

    REVIEW_URL = "http://orientaltrading.ugc.bazaarvoice.com/0713-en_us/{product_id}" \
                 "/reviews.djs?format=embeddedhtml&page={index}&"

    VARIANT_PRODUCT = 'http://www.orientaltrading.com/web/browse/processProductsCatalog'

    #use_proxies = True

    def __init__(self, *args, **kwargs):
        # Review-API helper must exist before the base class starts crawling.
        self.br = BuyerReviewsBazaarApi(called_class=self)
        super(OrientaltradingProductsSpider, self).__init__(site_name=self.allowed_domains[0], *args, **kwargs)

    def _parse_single_product(self, response):
        # Entry point used when the spider is given a direct product URL.
        return self.parse_product(response)

    def parse_product(self, response):
        """Populate a SiteProductItem from a product page, then chain the
        Bazaarvoice reviews request through send_next_request()."""
        meta = response.meta.copy()
        product = meta.get('product', SiteProductItem())
        reqs = []
        meta['reqs'] = reqs

        # Parse locate
        locale = 'en_US'
        cond_set_value(product, 'locale', locale)

        # Parse title
        title = self.parse_title(response)
        cond_set(product, 'title', title)

        # Parse image
        image = self.parse_image(response)
        cond_set(product, 'image_url', image)

        # Parse sku
        sku = self.parse_sku(response)
        cond_set_value(product, 'sku', sku)

        # Parse reseller_id
        cond_set_value(product, "reseller_id", sku)

        # Parse price
        price = self.parse_price(response)
        cond_set_value(product, 'price', price)

        # Parse description
        description = self.parse_description(response)
        cond_set(product, 'description', description)

        product['related_products'] = self.parse_related_product(response)

        otv = OrientaltradingVariants()
        otv.setupSC(response)
        _variants = otv._variants()
        if _variants:
            product['variants'] = _variants

        # reqs = self.parse_variants(response, reqs)

        # Parse reviews
        # NOTE(review): parse_sku() can return None when the hidden
        # '#productsku' input is missing, which would make .replace()
        # raise AttributeError here -- confirm against live pages.
        reqs.append(
            Request(url=self.REVIEW_URL.format(
                product_id=product['sku'].replace('/', '_'),
                index=0),
                dont_filter=True,
                callback=self.parse_buyer_reviews,
                meta=meta))

        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product

    def clear_text(self, str_result):
        # Collapse tabs/newlines/CRs and turn non-breaking spaces into
        # plain spaces before stripping.
        return str_result.replace("\t", "").replace("\n", "").replace(
            "\r", "").replace(u'\xa0', ' ').strip()

    def send_next_request(self, reqs, response):
        """ Helps to handle several requests """
        req = reqs.pop(0)
        new_meta = response.meta.copy()
        if reqs:
            # Remaining requests travel in meta so callbacks can continue.
            new_meta["reqs"] = reqs
        return req.replace(meta=new_meta)

    def parse_related_product(self, response):
        """Scrape the "you may also like" carousel into
        {'you may also like': [RelatedProduct, ...]} (empty dict if none)."""
        related_prods = []
        urls = response.xpath(
            '//div[contains(@class, "ymal-content-wrapper")]/p/a/@href'
        ).extract()
        titles = response.xpath(
            '//div[contains(@class, "ymal-content-wrapper")]/p/a/text()'
        ).extract()

        # Title
        for title, url in zip(titles, urls):
            if url and title:
                related_prods.append(RelatedProduct(title=title, url=url))

        related_products = {}
        if related_prods:
            related_products['you may also like'] = related_prods

        return related_products

    def parse_title(self, response):
        # Returns the raw extract() list; cond_set() picks the first value.
        title = response.xpath(
            '//meta[contains(@property, "og:title")]/@content').extract()
        return title

    def parse_image(self, response):
        # Raw extract() list, consumed by cond_set() in parse_product().
        img = response.xpath(
            '//meta[contains(@property, "og:image")]/@content').extract()
        return img

    def parse_description(self, response):
        description = response.xpath(
            '//div[contains(@class, "pd-text-bloc")] | //p[contains(@class, "pd-text-bloc")]'
        ).extract()
        if description:
            return description
        else:
            return ''

    def parse_sku(self, response):
        # Implicitly returns None when the hidden input is absent.
        sku = response.xpath(
            '//input[contains(@id, "productsku")]/@value').extract()
        if sku:
            return sku[0]

    def parse_productid(self, response):
        # Implicitly returns None when the hidden input is absent.
        model = response.xpath(
            '//input[contains(@id, "productId")]/@value').extract()
        if model:
            return model[0]

    def parse_price(self, response):
        """Return a Price in USD; 0.00 when no price element is found."""
        price = response.xpath(
            '//p[contains(@id, "pd-price")]/text()').extract()
        if price:
            # Strip the "NOW" sale prefix and the currency symbol.
            price = self.clear_text(price[0].replace('NOW', '').replace('$', ''))
            return Price(price=price, priceCurrency="USD")
        else:
            return Price(price=0.00, priceCurrency="USD")

    # NOTE(review): the two string literals below are disabled code kept
    # for reference; they are never executed.
    """
    def parse_variants(self, response, reqs):
        select_variants = response.xpath('//fieldset[contains(@class, "select-options")]/select')
        if select_variants:
            OTC_CSRFTOKEN = response.xpath('//input[contains(@name, "OTC_CSRFTOKEN")]/@value').extract()
            prefix = response.xpath('//input[contains(@id, "prefix")]/@value').extract()
            productId = response.xpath('//input[contains(@id, "productId")]/@value').extract()
            parentSku = response.xpath('//input[contains(@id, "parentSku")]/@value').extract()
            demandPrefix = response.xpath('//input[contains(@id, "demandPrefix")]/@value').extract()
            pznComponentIndex = response.xpath('//input[contains(@id, "pznComponentIndex")]/@value').extract()
            pznHiddenData = response.xpath('//input[contains(@id, "pznHiddenData")]/@value').extract()
            pznImageName = response.xpath('//input[contains(@id, "pznImageName")]/@value').extract()
            destinationDisplayJSP = response.xpath('//input[contains(@name, "destinationDisplayJSP")]/@value').extract()
            requestURI = response.xpath('//input[contains(@name, "requestURI")]/@value').extract()
            numberOfAttributes = response.xpath('//input[contains(@id, "numberOfAttributes")]/@value').extract()
            categoryId = response.xpath('//input[contains(@id, "categoryId")]/@value').extract()
            mode = response.xpath('//input[contains(@id, "mode")]/@value').extract()
            quantity = response.xpath('//input[contains(@name, "quantity")]/@value').extract()

            params = {'OTC_CSRFTOKEN': OTC_CSRFTOKEN[0],
                      'categoryId': categoryId[0],
                      'demandPrefix': demandPrefix[0],
                      'destinationDisplayJSP': destinationDisplayJSP[0],
                      'mode': mode[0],
                      'numberOfAttributes': numberOfAttributes[0],
                      'parentSku': parentSku[0],
                      'prefix': prefix[0],
                      'productId': productId[0],
                      'pznComponentIndex': pznComponentIndex[0],
                      'pznHiddenData': pznHiddenData[0],
                      'pznImageName': pznImageName[0],
                      'quantity': quantity[0],
                      'requestURI': requestURI[0],
                      'sku': '',
                      }

            for v in select_variants:
                name = v.xpath('@name').extract()
                options = v.xpath('option/@value').extract()
                for opt in options:
                    if opt:
                        # TODO: get variant sku for params['sku']
                        # url = 'http://www.orientaltrading.com/rest/ajax/'
                        # post_data = {'formData': "{\"sku\":\"%s\",\"uniqueIdentifier\":\"\",\"nameArray\":[\"%s\"],"
                        #                          "\"valueArray\":[\"%s\"],\"command\":\"AttributeSkuLookup\"}" % (sku, name[0], opt[0]),
                        #              'requestURI': "/"
                        #              }
                        # reqs.append(FormRequest(url=url, formdata=post_data, callback=self.get_sku_attribute))
                        params[name[0]] = opt
                        reqs.append(FormRequest(url=self.VARIANT_PRODUCT, formdata=params, callback=self.parse_variants_info))
        return reqs
    """

    """
    def parse_variants_info(self, response):
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])

        sku = self.parse_sku(response)
        price = self.parse_price(response)

        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product
    """

    def get_sku_attribute(self, response):
        """Callback for the AttributeSkuLookup AJAX response; then resumes
        the pending request chain."""
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])

        jsondata = json.loads(response.body_as_unicode())
        # {"uniqueIdentifier":"","parentSku":"13578611","attributeSku":"13582742"}
        new_sku = jsondata['attributeSku']

        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product

    def parse_buyer_reviews(self, response):
        """Parse the Bazaarvoice reviews payload into the product item."""
        meta = response.meta.copy()
        product = response.meta['product']
        reqs = meta.get('reqs', [])

        product['buyer_reviews'] = self.br.parse_buyer_reviews_per_page(
            response)

        if reqs:
            return self.send_next_request(reqs, response)
        else:
            return product

    def _scrape_total_matches(self, response):
        """Read the total result count embedded in the page's JS."""
        data = re.findall(r'site_search_results: "(.+)"',
                          response.body_as_unicode())
        if data:
            totals = data[0]
            # NOTE(review): when the captured value is not a digit string
            # this falls through and returns None (not 0) -- confirm that
            # is the intended signal for callers.
            if totals.isdigit():
                if not self.TOTAL_MATCHES:
                    self.TOTAL_MATCHES = int(totals)
                return int(totals)
        else:
            return 0

    def _scrape_product_links(self, response):
        # Yield (url, fresh item) pairs for every product on the listing.
        for link in response.xpath(
                '//div[contains(@id, "tableSearchResultsPhoto")]/a/@href'
        ).extract():
            yield link, SiteProductItem()

    # def _get_nao(self, url):
    #     nao = re.search(r'pn=(\d+)', url)
    #     if not nao:
    #         return
    #     return int(nao.group(1))
    #
    # def _replace_nao(self, url, new_nao):
    #     current_nao = self._get_nao(url)
    #     if current_nao:
    #         return re.sub(r'nao=\d+', 'pn=' + str(new_nao), url)
    #     else:
    #         return url + '&pn=' + str(new_nao)

    def _scrape_next_results_page_link(self, response):
        """Build the next offset-paginated search request, or stop when
        past the total match count."""
        if self.TOTAL_MATCHES is None:
            self.log('No "next result page" link!')
            return
        if self.CURRENT_NAO > self.TOTAL_MATCHES + self.PAGINATE_BY:
            return  # it's over
        self.CURRENT_NAO += self.PAGINATE_BY
        return Request(self.PAGINATE_URL.format(
            search_term=response.meta['search_term'],
            nao=str(self.CURRENT_NAO)),
            callback=self.parse,
            meta=response.meta)
def __init__(self, *args, **kwargs):
    """Initialize the spider, then attach the Bazaarvoice review helper."""
    super(ChewyProductsSpider, self).__init__(*args, **kwargs)
    # Helper that parses Bazaarvoice buyer-review payloads for this spider.
    reviews_api = BuyerReviewsBazaarApi(called_class=self)
    self.br = reviews_api
def __init__(self, *args, **kwargs):
    """Attach the Bazaarvoice review helper, then run base-class init."""
    # The helper is created before the base initializer runs, mirroring the
    # original construction order.
    reviews_api = BuyerReviewsBazaarApi(called_class=self)
    self.br = reviews_api
    super(DebenhamsProductSpider, self).__init__(*args, **kwargs)
def __init__(self, *args, **kwargs):
    """Set up pagination state and the review helper, then run base init."""
    # Offset into paginated search results; starts at the first page.
    self.start_index = 0
    # Helper that parses Bazaarvoice buyer-review payloads for this spider.
    self.br = BuyerReviewsBazaarApi(called_class=self)
    super(MicrosoftStoreProductSpider, self).__init__(*args, **kwargs)