class WoolWorthsSpider(BaseSpider):
    """Crawl woolworths.co.uk brand-store pages for products whose brand
    appears in the local Brandstomonitor.xlsx spreadsheet."""
    name = 'toymonitor-woolworths.co.uk'
    allowed_domains = ['woolworths.co.uk']
    start_urls = ['http://www.woolworths.co.uk/brand-store.page?end=5132']
    errors = []
    brand_selector = BrandSelector(errors)
    #field_modifiers = {'brand': brand_selector.get_brand}

    def parse(self, response):
        """Match brand links on the page against the monitored-brands
        spreadsheet and request each matching brand's listing page."""
        sel = HtmlXPathSelector(response)
        workbook = xlrd.open_workbook(HERE + '/Brandstomonitor.xlsx')
        sheet = workbook.sheet_by_index(0)
        # Normalised (alphanumeric-only, upper-case) brand names from the
        # spreadsheet; row 0 is the header and is skipped.
        monitored = [re.sub(r'\W+', '', sheet.row_values(idx)[0].upper().strip())
                     for idx in xrange(1, sheet.nrows)]
        for link in sel.select('//div[@class="columns"]/ul/li/a'):
            # Link text looks like "Brand Name (123)" — drop the count.
            label = link.select('text()').extract()[0].split('(')[0].strip()
            href = link.select('@href').extract()[0]
            if re.sub(r'\W+', '', label.upper()) in monitored:
                yield Request(urljoin_rfc(get_base_url(response), href),
                              callback=self.parse_brand,
                              meta={'brand': label})

    def parse_brand(self, response):
        """Request every product page on a brand listing and follow the
        pagination link, propagating the brand via response.meta."""
        sel = HtmlXPathSelector(response)
        for product_url in sel.select('//a[@class="productTitle"]/@href').extract():
            yield Request(product_url, callback=self.parse_product,
                          meta=response.meta)
        next_page = sel.select('//a[@class="paginationNext"]/@href').extract()
        if next_page:
            yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                          callback=self.parse_brand, meta=response.meta)

    def parse_product(self, response):
        """Build a Product item (plus ToyMonitorMeta with the EAN, when the
        page exposes one) from a Woolworths product page."""
        sel = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', ''.join(
            sel.select('//h1[@class="productHeading"]//text()').extract()))
        loader.add_value('url', response.url)
        loader.add_value('brand', response.meta.get('brand', ''))
        # Category is only present inside an inline JS blob.
        category_match = re.findall(u',\\ncategory: "(.*)",', response.body)
        loader.add_value('category', category_match[0] if category_match else '')
        loader.add_xpath('sku', '//span[@id="catalogueNumber"]/text()')
        loader.add_xpath('identifier', '//span[@id="catalogueNumber"]/text()')
        images = sel.select('//div[@id="amp-originalImage"]/img/@src').extract()
        if images:
            loader.add_value('image_url', images[0])
        loader.add_value('price', ''.join(
            sel.select('//div[@class="priceNow"]//text()').extract()))
        availability = ''.join(
            sel.select('//meta[@property="product:availability"]/@content')
            .extract()).upper()
        if 'IN STOCK' not in availability:
            loader.add_value('stock', '0')
        item = loader.load_item()
        metadata = ToyMonitorMeta()
        ean = ''.join(
            sel.select('//span[@id="productEAN"]/text()').extract()).strip()
        if ean:
            metadata['ean'] = ean
        item['metadata'] = metadata
        yield item
class JohnLewisSpider(BaseSpider):
    """Crawl johnlewis.com toy listings (by brand and by type) and attach
    Bazaarvoice reviews to each product item."""
    name = 'toymonitor-johnlewis.com'
    allowed_domains = ['johnlewis.com', 'johnlewis.ugc.bazaarvoice.com']
    start_urls = ['http://www.johnlewis.com/browse/toys/toys/toys-by-brand/_/N-fev',
                  'http://www.johnlewis.com/toys/toys-by-type/c60000243?rdr=1']
    errors = []
    brand_selector = BrandSelector(errors)
    #field_modifiers = {'brand': brand_selector.get_brand}

    def start_requests(self):
        # Switch the site to the GB storefront first so subsequent pages show
        # GBP prices/stock for the UK.
        country_url = "http://www.johnlewis.com/store/international/ajax/changeCountryAjaxRequest.jsp"
        formdata = {'country': 'GB',
                    'sourceUrl': 'http://www.johnlewis.com/',
                    'switchToggle': 'Change Country Overlay'}
        yield FormRequest(country_url, formdata=formdata,
                          callback=self.parse_country)

    def parse_country(self, response):
        """After the country switch, kick off the real start URLs."""
        for url in self.start_urls:
            yield Request(url)

    def parse(self, response):
        """Request each brand facet link; on the first pass also fan out into
        toy-type subcategories (guarded by meta['subcategory'] to avoid
        recursing forever)."""
        base_url = get_base_url(response)
        site_brands = response.xpath('//section[@id="facet-brand"]/div/ul/li/a')
        for brand in site_brands:
            # Facet text looks like "Brand (123)" — strip the product count.
            brand_name = brand.select('text()').extract()[0].split("(")[0].strip()
            brand_url = brand.select('@href').extract()[0]
            yield Request(urljoin_rfc(base_url, brand_url),
                          callback=self.parse_brand)
        if response.meta.get('subcategory'):
            return
        subcats = response.xpath('//strong[contains(., "Featured Toy Types")]/following-sibling::ul//@href').extract()
        subcats += response.xpath('//section[@id="facet-toysbytype"]/div/ul/li/a/@href').extract()
        subcats += response.xpath('//header[contains(h2, "Toys by Type")]/following-sibling::div//@href').extract()
        # Games & puzzles are not reachable from the facets above.
        subcats.append('http://www.johnlewis.com/browse/toys/toys/toys-by-type/games-puzzles/view-all-games-puzzles/_/N-6hxe')
        for url in subcats:
            yield Request(response.urljoin(url), meta={'subcategory': True})

    def parse_brand(self, response):
        """Accumulate product URLs across paginated listing pages via
        meta['products']; on the last page, request every unique product."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        products = hxs.select("//div[@class='products']/div/article//a[@class='product-link'][1]/@href").extract()
        products += response.meta.get('products', [])
        # The paginator renders a '#' href on the last page — filter it out.
        next_page = filter(lambda link: link != '#',
                           hxs.select('//li[@class="next"]//a/@href').extract())
        if next_page:
            self.log('PARTIAL => %s products found' % len(products))
            yield Request(url=urljoin_rfc(base_url, next_page[0]),
                          meta={'products': list(products)},
                          callback=self.parse_brand)
        else:
            self.log('TOTAL PRODUCTS FOUND: %s' % len(products))
            products = set(products)
            self.log('TOTAL UNIQUE PRODUCTS URLS: %s' % len(products))
            for url in products:
                yield Request(urljoin_rfc(base_url, url), self.parse_product)

    def parse_product(self, response):
        """Build one item per product option (or one for a single-variant
        page), then chain into the Bazaarvoice review pages."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        name = hxs.select('normalize-space(//*[@itemprop="name"]/text())').extract()[0]
        brand = hxs.select('normalize-space(//*[@itemprop="brand"]/span/text())').extract()
        try:
            image_url = urljoin_rfc(base_url,
                                    hxs.select('//div[@id="prod-media-player"]'
                                               '//img/@src').extract()[0].strip())
        except IndexError:
            image_url = ''
        options = hxs.select('//div[@id="prod-multi-product-types"]')
        items = []
        if options:
            products = options.select('.//div[@class="product-type"]')
            for product in products:
                opt_name = product.select('.//h3/text()').extract()[0].strip()
                try:
                    # BUGFIX: was an absolute '//div[...]' XPath, which always
                    # matched the first stock node on the PAGE, giving every
                    # option the same stock figure. './/' scopes it to this
                    # option's own subtree.
                    stock = product.select('.//div[contains(@class, "mod-stock-availability")]'
                                           '//p/strong/text()').re(r'\d+')[0]
                except IndexError:
                    stock = 0
                loader = ProductLoader(item=Product(), selector=product)
                sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model name")]/following-sibling::dd/text()').extract()
                if not sku:
                    sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model Number")]/following-sibling::dd/text()').extract()
                if sku:
                    loader.add_value('sku', sku[0].strip())
                loader.add_xpath('identifier', './/div[contains(@class, "mod-product-code")]/p/text()')
                loader.add_value('name', '%s %s' % (name, opt_name))
                loader.add_xpath('category', '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
                loader.add_value('image_url', image_url)
                loader.add_value('brand', brand)
                loader.add_value('url', response.url)
                loader.add_xpath('price', './/p[@class="price"]/strong/text()')
                loader.add_value('stock', stock)
                item = loader.load_item()
                metadata = ToyMonitorMeta()
                metadata['reviews'] = []
                item['metadata'] = metadata
                items.append(item)
        else:
            price = ''.join(hxs.select('//ul/li/strong[@class="price"]/text()').extract()).strip()
            if not price:
                # NOTE(review): these fallbacks end with .split(), not
                # .strip(), so `price` becomes a token list here; the loader's
                # price processor appears to tolerate that — confirm before
                # changing.
                price = ''.join(hxs.select('//span[@class="now-price"]/text()').extract()).split()
            if not price:
                price = ''.join(hxs.select('//div[@id="prod-price"]//strong/text()').extract()).split()
            try:
                stock = hxs.select('//div[contains(@class, "mod-stock-availability")]'
                                   '//p/strong/text()').re(r'\d+')[0]
            except IndexError:
                stock = 0
            loader = ProductLoader(item=Product(), response=response)
            sku = hxs.select(u'//div[@id="prod-product-code"]//h2[contains(text(),"Product code")]/following-sibling::p/text()').extract()
            if sku:
                loader.add_value('sku', sku[0].strip())
            loader.add_xpath('identifier', '//div[@id="prod-product-code"]/p/text()')
            loader.add_value('name', name)
            loader.add_xpath('category', '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            loader.add_value('url', response.url)
            loader.add_value('price', price)
            loader.add_value('stock', stock)
            item = loader.load_item()
            metadata = ToyMonitorMeta()
            metadata['reviews'] = []
            item['metadata'] = metadata
            # Items without an identifier are dropped rather than yielded.
            if item.get('identifier'):
                items.append(item)
        if items:
            product_id = response.xpath('//div/@data-product-id').extract()[0]
            reviews_url = 'http://johnlewis.ugc.bazaarvoice.com/7051redes-en_gb/%s/reviews.djs?format=embeddedhtml&page=1&scrollToTop=true'
            yield Request(reviews_url % product_id,
                          callback=self.parse_review_page,
                          meta={'items': items, 'url': response.url})

    def parse_review_page(self, response):
        """Append each review on the page to every pending item's metadata;
        follow review pagination, yielding the items after the last page."""
        items = response.meta.get('items', '')
        url = response.meta.get('url', '')
        # The reviews arrive as an HTML fragment embedded in a JS payload.
        hxs = HtmlXPathSelector(text=self._extract_html(response))
        reviews = hxs.xpath('//div[@class="BVRRReviewDisplayStyle5"]')
        for review in reviews:
            l = ReviewLoader(item=Review(), response=response,
                             date_format='%d/%m/%Y')
            rating = review.select(".//span[contains(@class,'BVRRRatingNumber')]/text()").extract()[0]
            date = review.select(".//span[contains(@class,'BVRRValue BVRRReviewDate')]/text()").extract()[0]
            title = review.select(".//span[contains(@class,'BVRRReviewTitle')]/text()").extract()
            review_text = ' '.join(review.select(".//span[contains(@class,'BVRRReviewText')]//text()").extract())
            if title:
                full_text = title[0].strip() + '\n' + review_text.strip()
            else:
                full_text = review_text.strip()
            l.add_value('rating', rating)
            l.add_value('url', url)
            l.add_value('date', datetime.strptime(date, '%d %B %Y').strftime('%d/%m/%Y'))
            l.add_value('full_text', full_text)
            for item in items:
                item['metadata']['reviews'].append(l.load_item())
        next_page = hxs.xpath('//span[@class="BVRRPageLink BVRRNextPage"]/a/@data-bvjsref').extract()
        if next_page:
            yield Request(next_page[0], callback=self.parse_review_page,
                          meta={'items': items, 'url': url})
        else:
            for item in items:
                yield item

    def _extract_html(self, response):
        """Extract the embedded review-HTML fragment from the Bazaarvoice JS
        response body (last matching line wins)."""
        review_html = ''
        for line in response.body.split('\n'):
            if 'var materials=' in line:
                review_html = line.split('"BVRRSecondaryRatingSummarySourceID":" ')[-1].split('\n}')[0].replace('\\', '')
        return review_html
class LittleWoodsSpider(BaseSpider):
    """Crawl littlewoods.com toy categories, expand per-option stock data
    from the embedded stockMatrix JS, and attach Bazaarvoice reviews."""
    name = 'toymonitor-littlewoods.com'
    allowed_domains = ['littlewoods.com', 'api.bazaarvoice.com']
    start_urls = ['http://www.littlewoods.com/toys/e/b/5132.end']
    errors = []
    brand_selector = BrandSelector(errors)
    #field_modifiers = {'brand': brand_selector.get_brand}

    def parse(self, response):
        """Fan out into every navigation category."""
        categories = response.xpath('//div[@id="navigation"]//a/@href').extract()
        for category in categories:
            yield Request(response.urljoin(category),
                          callback=self.parse_category)

    def parse_category(self, response):
        """Request each product page and follow pagination."""
        products = response.xpath('//a[@class="productTitle"]/@href').extract()
        for product in products:
            yield Request(product, callback=self.parse_product,
                          meta=response.meta)
        next_page = response.xpath('//a[@class="paginationNext"]/@href').extract()
        if next_page:
            yield Request(response.urljoin(next_page[0]),
                          callback=self.parse_category, meta=response.meta)

    def parse_product(self, response):
        """Build the base item, expand it into one item per stock-matrix
        option when the product has variants, then chain into reviews."""
        import ast  # local import: used only for the stock-matrix literals
        loader = ProductLoader(item=Product(), response=response)
        name = ''.join(response.xpath('//h1[@class="productHeading"]//text()').extract())
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('brand', response.meta.get('brand', ''))
        # Category only appears inside an inline JS blob.
        category = re.findall(u',\\ncategory: "(.*)",', response.body)
        category = category[0] if category else ''
        loader.add_value('category', category)
        loader.add_xpath('sku', '//span[@id="catalogueNumber"]/text()')
        loader.add_xpath('identifier', '//span[@id="catalogueNumber"]/text()')
        image_url = response.xpath('//div[@id="amp-originalImage"]/img/@src').extract()
        promotion = None
        if image_url:
            loader.add_value('image_url', image_url[0])
            # Promotions are only signalled via the image filename.
            if '3for2' in image_url[0]:
                promotion = '3 for 2'
        price = ''.join(response.xpath('//div[@class="priceNow"]//text()').extract())
        loader.add_value('price', price)
        out_of_stock = 'IN STOCK' not in ''.join(
            response.xpath('//meta[@property="product:availability"]/@content').extract()).upper()
        if out_of_stock:
            loader.add_value('stock', '0')
        item = loader.load_item()
        metadata = ToyMonitorMeta()
        ean = ''.join(response.xpath('//span[@id="productEAN"]/text()').extract()).strip()
        if ean:
            metadata['ean'] = ean
        metadata['reviews'] = []
        if promotion is not None:
            metadata['promotions'] = promotion
        item['metadata'] = metadata
        items = []
        # Number of option dimensions (e.g. colour + size) on the page.
        amount_options = len(response.xpath('//ul[@class="customerSelection"]'))
        options = []
        # Extract option arrays from the inline stockMatrix JS variable.
        options_text = re.findall(r'stockMatrix = \[(.*) \]; sdg.productOptions',
                                  ' '.join(response.body.split()))
        if options_text:
            options_text = re.findall(r'(.*)]; sdg.productOptions', options_text[0])
            for line in options_text[0].split(' , '):
                if '"sku' in line:
                    match = re.search(r'\[(.*)\]', line)
                    if match:
                        # SECURITY FIX: was eval() on page-derived text;
                        # ast.literal_eval parses the same Python literal list
                        # (null already mapped to None) without executing code.
                        literal = match.group(0).replace('null', 'None')
                        options.append(ast.literal_eval(literal))
        if len(options) > 1:
            for option in options:
                option_item = deepcopy(item)
                # Row layout: option labels first, then identifier; price sits
                # five fields from the end.
                name = ' '.join(option[:amount_options])
                identifier = option[amount_options]
                price = option[-5]
                option_item['name'] += ' ' + name
                option_item['identifier'] += '-' + identifier
                option_item['price'] = extract_price(price)
                out_of_stock = [value for value in option
                                if value and 'out of stock' in value.lower()]
                if out_of_stock:
                    option_item['stock'] = 0
                items.append(option_item)
        else:
            items.append(item)
        # NOTE(review): raises IndexError (page skipped via errback/log) when
        # productId is absent — confirm that is the intended behaviour.
        product_id = re.findall('productId: "(.*)"', response.body)[0]
        reviews_url = 'http://api.bazaarvoice.com/data/batch.json?passkey=2x4wql4zeys4t8mu5x3x4rb1a&apiversion=5.5&displaycode=10628-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A'+product_id+'&filter.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&sort.q0=isfeatured%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv_1111_18822'
        request = Request(reviews_url,
                          meta={'items': items, 'offset': 0,
                                'url': response.url},
                          callback=self.parse_reviews)
        yield request

    def parse_reviews(self, response):
        """Parse the JSONP review batch, append reviews to every pending
        item, page through in blocks of 100, then yield the items."""
        items = response.meta['items']
        url = response.meta['url']
        # Strip the JSONP wrapper (callback name and trailing ');').
        body = response.body.strip().partition('(')[-1].replace('});', '}').replace('})', '}')
        json_body = json.loads(body)
        reviews = json_body['BatchedResults']['q0']['Results']
        for review in reviews:
            review_loader = ReviewLoader(item=Review(), response=response,
                                         date_format="%d/%m/%Y")
            # NOTE(review): assumes `datetime` is the module here (JohnLewis
            # above calls datetime.strptime directly) — verify the imports.
            review_date = datetime.datetime.strptime(
                review['SubmissionTime'].split('.')[0], '%Y-%m-%dT%H:%M:%S')
            review_loader.add_value('date', review_date.strftime('%d/%m/%Y'))
            title = review['Title']
            text = review['ReviewText']
            if title:
                full_text = title + '\n' + text
            else:
                full_text = text
            pros = review['Pros']
            cons = review['Cons']
            if pros:
                full_text += '\nPros: ' + ', '.join(pros)
            if cons:
                full_text += '\nCons: ' + ', '.join(cons)
            review_loader.add_value('full_text', full_text)
            rating = review['Rating']
            review_loader.add_value('rating', rating)
            review_loader.add_value('url', url)
            for item in items:
                item['metadata']['reviews'].append(review_loader.load_item())
        # A full page of 100 reviews means there may be more.
        if len(reviews) == 100:
            offset = response.meta['offset'] + 100
            next_reviews = add_or_replace_parameter(response.url, "offset.q0",
                                                    str(offset))
            request = Request(next_reviews,
                              meta={'items': items, 'offset': offset,
                                    'url': url},
                              callback=self.parse_reviews)
            yield request
        else:
            for item in items:
                yield item
class RakutenCoUk(BaseSpider):
    """Crawl rakuten.co.uk toy categories; one item per product variant,
    with marketplace dealer name and (when present) on-site reviews."""
    name = 'toymonitor-rakuten.co.uk'
    allowed_domains = ['www.rakuten.co.uk']
    start_urls = ['http://www.rakuten.co.uk/category/931/?l-id=gb_product_allcat_17',]
    errors = []
    brand_selector = BrandSelector(errors)
    #field_modifiers = {'brand': brand_selector.get_brand}

    def parse(self, response):
        # Recurse into subcategories, then re-request this same URL
        # (dont_filter) so its own product grid is parsed too.
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        subcategory_urls = hxs.select('//li[@class="b-open"]//li/a/@href').extract()
        for url in subcategory_urls:
            yield Request(urljoin(base_url, url))
        yield Request(response.url, callback = self.parse_products, dont_filter=True)

    def parse_products(self, response):
        # Follow every pagination link, then request each product tile.
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        # parse pages
        pages = hxs.select('//div[contains(@class, "b-pagination")]/ul/li/a/@href').extract()
        for page in pages:
            yield Request(urljoin_rfc(base_url, page), meta=response.meta, callback=self.parse_products)
        # parse products
        items = hxs.select('//li[@class="b-item"]/div/div[@class="b-img"]/div/a/@href').extract()
        for item in items:
            yield Request(urljoin_rfc(base_url, item), callback=self.parse_product, meta=response.meta)

    def parse_product(self, response):
        # Build one Product item per variant from the inline
        # `var variant_details` JS; fall back to a single synthetic variant
        # built from the page-level price/identifier when no variants exist.
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        mpn = hxs.select('//span[@class="b-item"]').re("MPN: ([0-9]+)")
        ean = hxs.select('//span[@class="b-item"]').re("EAN: ([0-9]+)")
        sku = hxs.select('//input[@name="sku"]/@value').extract()
        name = hxs.select('//h1[@class="b-ttl-main"]/text()').extract()[0]
        dealer_name = "".join(hxs.select('//h2[@id="auto_shop_info_name"]//text()').extract()).strip()
        brand = hxs.select('.//span[@itemprop="brand"]/text()').extract()
        if brand:
            brand = brand[0].strip()
        else:
            # Fall back to a brand passed along by the requesting callback.
            brand = response.meta.get('brand')
        categories = hxs.select('//ul[@class="b-breadcrumb"]/li/a/text()').extract()
        image_url = hxs.select('//img[@itemprop="image"]/@data-frz-src').extract()
        ## options = hxs.select('//script[contains(text(), "var variant_details")]/text()').re('var variant_details = (.*);\n')
        options = hxs.select('//script[contains(text(), "var variant_details")]/text()').extract()
        if options:
            # NOTE(review): double quotes are swapped to single quotes before
            # json.loads, which normally rejects single-quoted JSON — this
            # presumably relies on the page's specific quoting; verify against
            # a live variant page before touching it.
            options = options[0].replace('"', "'")
            options = re.findall('var variant_details = (.*);\n', options)
            variants = json.loads(options[0])
        else:
            identifier = hxs.select('//input[@name="item_id"]/@value').extract()[0]
            price = hxs.select('//div[@class="b-product-main"]//meta[@itemprop="price"]/@content').extract()[0]
            # Synthetic single-variant record mirroring the JS structure.
            variants = [{'itemVariantId': identifier, 'sku': sku, 'variantValues': [], 'defaultPricing': {'price': price}}]
        items = []
        for variant in variants:
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('identifier', variant['itemVariantId'])
            loader.add_value('name', " ".join([name] + variant.get('variantValues', [])))
            loader.add_value('sku', variant['sku'])
            loader.add_value('url', response.url)
            loader.add_value('price', variant['defaultPricing']['price'])
            loader.add_value('dealer', dealer_name)
            loader.add_value('category', categories)
            if brand:
                loader.add_value('brand', brand)
            if image_url:
                loader.add_value('image_url', image_url[0])
            product = loader.load_item()
            metadata = ToyMonitorMeta()
            metadata['reviews'] = []
            product['metadata'] = metadata
            if mpn or ean:
                if mpn:
                    metadata['mpn'] = mpn[0]
                if ean:
                    metadata['ean'] = ean[0]
                # Re-assigning is redundant (metadata is mutated in place)
                # but harmless.
                product['metadata'] = metadata
            items.append(product)
        reviews_url = response.xpath('//a[contains(text(), "See All Reviews")]/@href').extract()
        if reviews_url:
            yield Request(reviews_url[0], callback=self.parse_reviews, meta={'items': items, 'url': response.url})
        else:
            for item in items:
                yield item

    def parse_reviews(self, response):
        # Append every review on the page to each pending item's metadata,
        # follow the paginator's right arrow, and yield the items after the
        # last review page.
        items = response.meta.get('items', '')
        url = response.meta.get('url', '')
        reviews = response.xpath('//div[contains(@class, "b-review")]')
        for review in reviews:
            l = ReviewLoader(item=Review(), response=response, date_format='%d/%m/%Y')
            # Rating = number of filled star spans.
            rating = len(review.xpath('.//span/span[contains(@class, "b-star-full")]'))
            date = review.xpath('.//div[@class="b-content"]/span[@class="b-text-sub"]/text()').re('\d+/\d+/\d+')[0]
            title = review.xpath('.//div[@class="b-head"]/div/text()').extract()
            review_text = ' '.join(review.xpath('.//div[@class="b-content" and not(child::*)]/text()').extract())
            if title:
                full_text = title[0].strip() + '\n' + review_text.strip()
            else:
                full_text = review_text.strip()
            l.add_value('rating', rating)
            l.add_value('url', url)
            l.add_value('date', date)
            l.add_value('full_text', full_text)
            for item in items:
                item['metadata']['reviews'].append(l.load_item())
        next = response.xpath('//a[@id="right_arrow"]/@href').extract()
        if next:
            yield Request(next[0], callback=self.parse_reviews, meta={'items': items, 'url': url})
        else:
            for item in items:
                yield item