repr(item['price'])) raise e else: price = Decimal('0') price = self.transform_price(price) loader.add_value('price', price) if item.get('asin') and item.get('seller_identifier'): loader.add_value( 'url', AmazonUrlCreator.build_url_from_asin_and_dealer_id( self.domain, item['asin'], item['seller_identifier'])) elif item.get('asin'): loader.add_value( 'url', AmazonUrlCreator.build_url_from_asin(self.domain, item['asin'])) elif self.use_amazon_identifier: loader.add_value( 'url', AmazonUrlCreator.build_url_from_asin(self.domain, item['identifier'])) elif item.get('url'): loader.add_value('url', item['url']) # take sku from model if configured to do so if item.get('model') and self.model_as_sku: model = item['model'] if len(model) > MAX_SKU_LEN: model = model[:252] + '...' loader.add_value('sku', model)
def get_url_from_asin(self, asin):
    """Build the product URL for ``asin`` on ``self.domain``.

    Thin convenience wrapper that delegates to
    ``AmazonUrlCreator.build_url_from_asin``.

    :param asin: product identifier passed through to the URL builder.
    :return: whatever ``AmazonUrlCreator.build_url_from_asin`` returns.
    """
    domain = self.domain
    return AmazonUrlCreator.build_url_from_asin(domain, asin)
def scrape_mbc_list_page(self, response):
    """Scrape one page of a product's offer list (the ``olp*`` markup).

    The ASIN is read from the ``olpDetailPageLink`` anchor: it is the path
    segment that follows ``product`` or, failing that, ``dp``.

    :param response: the downloaded page; ``response.url`` is used for
        building URLs and for error messages.
    :return: ``None`` when the detail-page link is missing or no ASIN can
        be located in it. Otherwise a dict with:

        * ``'products'`` - list of dicts, one per offer, with the keys
          ``name``, ``price`` (``None`` when the page only offers
          "add to basket to check price"), ``identifier``, ``asin``,
          ``seller_identifier``, ``url``, ``seller_url``, ``vendor`` and,
          when present on the page, ``brand``, ``shipping_cost``,
          ``image_url``; ``unavailable`` is set to ``False`` only when
          the delivery column contains the in-stock text.
        * ``'next_url'`` - absolute URL of the next page, or ``None``.
        * ``'current_page'`` - text of the selected pagination item, or
          ``None``.
    :raises AmazonScraperException: when an offer has no price and no
        "basket" notice to explain its absence.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    try:
        detail_href = hxs.select(
            '//a[@id="olpDetailPageLink"]/@href').extract()[0]
        url_parts = urljoin(base_url, detail_href).split('/')
        try:
            asin = url_parts[url_parts.index('product') + 1]
        except ValueError:
            asin = url_parts[url_parts.index('dp') + 1]
    except (IndexError, ValueError):
        # IndexError: no link on the page, or the marker is the last path
        # segment. ValueError: neither "product" nor "dp" is in the URL.
        return None

    # Page-level data is identical for every offer, so query it once
    # instead of once per offer.
    name = ' '.join(
        hxs.select(u'//div[@id="olpProductDetails"]/h1//text()').extract()
    ).strip()
    brand = hxs.select(u'//div[@id="olpProductByline"]/text()').extract()
    image_url = hxs.select(
        u'//div[@id="olpProductImage"]//img/@src').extract()

    # Vendor name sources, tried in order until one yields a value.
    vendor_xpaths = (
        u'.//div[contains(@class, "olpSellerColumn")]//img/@title',
        u'.//div[contains(@class, "olpSellerColumn")]//img/@alt',
        u'.//*[contains(@class, "olpSellerName")]//a/b/text()',
        u'.//*[contains(@class, "olpSellerName")]//span/a/text()',
    )

    offers = hxs.select(
        '//div[@id="olpOfferList"]//div[contains(@class, "olpOffer")]')

    products = []
    for i, result in enumerate(offers, 1):
        product = {}

        product['name'] = AmazonFilter.filter_name(name)
        if brand:
            product['brand'] = AmazonFilter.filter_brand(brand[0])

        price_el = result.select(
            './/span[contains(@class, "olpOfferPrice")]/text()')
        if price_el:
            price = price_el.extract()[0].strip()
            product['price'] = self._extract_price(response.url, price)
        else:
            # check if there is text "Add to basket to check price"
            price_texts = result.select(
                './/div[p[contains(@class, "olpShippingInfo")]]/text()'
            ).extract()
            if price_texts and 'basket' in price_texts[0].strip().lower():
                product['price'] = None
            else:
                # Also covers the case where the shipping-info text is
                # missing entirely, which used to surface as a bare
                # IndexError without any context.
                raise AmazonScraperException(
                    "Couldn't extract price from element %d from url %s"
                    % (i, response.url))

        seller_id = None
        seller_urls = result.select(
            u'.//*[contains(@class, "olpSellerName")]//a/@href').extract()
        if seller_urls:
            seller_url_ = seller_urls[0]
            if 'seller=' in seller_url_:
                seller_id = url_query_parameter(seller_url_, 'seller')
            else:
                seller_parts = seller_url_.split('/')
                try:
                    seller_id = seller_parts[
                        seller_parts.index('shops') + 1]
                except (IndexError, KeyError, ValueError):
                    # External website (link "Shop this website"?)
                    seller_id = url_query_parameter(
                        seller_url_, 'merchantID')

        domain = AmazonUrlCreator.get_domain_from_url(response.url)
        product['identifier'] = asin
        product['asin'] = asin
        if seller_id:
            product['seller_identifier'] = seller_id
            product['url'] = \
                AmazonUrlCreator.build_url_from_asin_and_dealer_id(
                    domain, asin, seller_id)
            product['seller_url'] = AmazonUrlCreator.build_vendor_url(
                domain, seller_id)
        else:
            product['seller_identifier'] = None
            product['url'] = AmazonUrlCreator.build_url_from_asin(
                domain, asin)
            product['seller_url'] = None

        shipping = result.select(
            './/span[@class="olpShippingPrice"]/text()').extract()
        if shipping:
            product['shipping_cost'] = shipping[0]

        if image_url:
            product['image_url'] = urljoin(base_url, image_url[0])

        vendor = []
        for vendor_xpath in vendor_xpaths:
            vendor = result.select(vendor_xpath).extract()
            if vendor:
                break

        if vendor:
            vendor_name = vendor[0]
            if vendor_name.lower().startswith('amazon'):
                product['vendor'] = 'Amazon'
            else:
                product['vendor'] = 'AM - ' + vendor_name
        elif not seller_id:
            # No seller link and no vendor name: the offer is treated as
            # Amazon's own.
            product['vendor'] = 'Amazon'
        else:
            product['vendor'] = None

        stock = result.select(
            './/div[contains(@class,"olpDeliveryColumn")]//text()').re(
                'En Stock|En stock')
        if stock:
            product['unavailable'] = False

        products.append(product)

    next_url = hxs.select(
        '//ul[@class="a-pagination"]/li[@class="a-last"]/a/@href'
    ).extract()
    next_url = urljoin(base_url, next_url[0]) if next_url else None

    current_page = hxs.select(
        '//ul[@class="a-pagination"]/li[@class="a-selected"]/a/text()'
    ).extract()
    current_page = current_page[0] if current_page else None

    return {
        'next_url': next_url,
        'current_page': current_page,
        'products': products,
    }