예제 #1
0
                         repr(item['price']))
                raise e
        else:
            price = Decimal('0')
        price = self.transform_price(price)
        loader.add_value('price', price)

        if item.get('asin') and item.get('seller_identifier'):
            loader.add_value(
                'url',
                AmazonUrlCreator.build_url_from_asin_and_dealer_id(
                    self.domain, item['asin'], item['seller_identifier']))
        elif item.get('asin'):
            loader.add_value(
                'url',
                AmazonUrlCreator.build_url_from_asin(self.domain,
                                                     item['asin']))
        elif self.use_amazon_identifier:
            loader.add_value(
                'url',
                AmazonUrlCreator.build_url_from_asin(self.domain,
                                                     item['identifier']))
        elif item.get('url'):
            loader.add_value('url', item['url'])

        # take sku from model if configured to do so
        if item.get('model') and self.model_as_sku:
            model = item['model']
            if len(model) > MAX_SKU_LEN:
                model = model[:252] + '...'

            loader.add_value('sku', model)
예제 #2
0
 def get_url_from_asin(self, asin):
     """Return the URL built from ``asin`` for this instance's domain.

     Delegates to ``AmazonUrlCreator.build_url_from_asin``, passing
     ``self.domain`` as the first argument and ``asin`` as the second.
     """
     build_url = AmazonUrlCreator.build_url_from_asin
     return build_url(self.domain, asin)
예제 #3
0
    def scrape_mbc_list_page(self, response):
        """Scrape the offers on one page of an offer listing (``olp*``) page.

        The ASIN is taken from the "olpDetailPageLink" anchor: it is the URL
        path segment that follows ``product`` or, failing that, ``dp``.

        :param response: the downloaded listing page.
        :returns: ``None`` when the detail page link is missing or no ASIN
            can be parsed from it; otherwise a dict with the keys
            ``'next_url'`` (absolute URL or ``None``), ``'current_page'``
            (text of the selected pagination item or ``None``) and
            ``'products'`` (a list with one dict per offer).
        :raises AmazonScraperException: when an offer has no price element
            and its shipping info text does not mention "basket".
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        try:
            url = hxs.select('//a[@id="olpDetailPageLink"]/@href').extract()[0]
            url = urljoin(base_url, url)
            url_parts = url.split('/')
            try:
                asin = url_parts[url_parts.index('product') + 1]
            except ValueError:
                asin = url_parts[url_parts.index('dp') + 1]
        except (IndexError, ValueError):
            # IndexError: no link found, or the marker is the last segment.
            # ValueError: neither 'product' nor 'dp' appears in the link.
            return None

        # These selections are page-level (not relative to an offer), so they
        # yield the same result for every offer: run them once.
        name = ' '.join(
            hxs.select(u'//div[@id="olpProductDetails"]/h1//text()').
            extract()).strip()
        brand = hxs.select(
            u'//div[@id="olpProductByline"]/text()').extract()
        image_url = hxs.select(
            u'//div[@id="olpProductImage"]//img/@src').extract()

        # Vendor name sources, tried in order until one yields a value.
        vendor_xpaths = (
            u'.//div[contains(@class, "olpSellerColumn")]//img/@title',
            u'.//div[contains(@class, "olpSellerColumn")]//img/@alt',
            u'.//*[contains(@class, "olpSellerName")]//a/b/text()',
            u'.//*[contains(@class, "olpSellerName")]//span/a/text()',
        )

        products = []
        for i, result in enumerate(
                hxs.select(
                    '//div[@id="olpOfferList"]//div[contains(@class, "olpOffer")]'
                ), 1):
            product = {}

            product['name'] = AmazonFilter.filter_name(name)
            if brand:
                product['brand'] = AmazonFilter.filter_brand(brand[0])

            price_el = result.select(
                './/span[contains(@class, "olpOfferPrice")]/text()')
            if price_el:
                price = price_el.extract()[0].strip()
                product['price'] = self._extract_price(response.url, price)
            else:
                # check if there is text "Add to basket to check price"
                price_text = result.select(
                    './/div[p[contains(@class, "olpShippingInfo")]]/text()'
                ).extract()
                # An empty extraction is treated like any other unparseable
                # price rather than surfacing as a bare IndexError.
                if price_text and 'basket' in price_text[0].strip().lower():
                    product['price'] = None
                else:
                    raise AmazonScraperException(
                        "Couldn't extract price from element %d from url %s" %
                        (i, response.url))

            seller_id = None
            seller_urls = result.select(
                u'.//*[contains(@class, "olpSellerName")]//a/@href').extract()
            if seller_urls:
                seller_url_ = seller_urls[0]
                if 'seller=' in seller_url_:
                    seller_id = url_query_parameter(seller_url_, 'seller')
                else:
                    seller_parts = seller_url_.split('/')
                    try:
                        seller_id = seller_parts[seller_parts.index('shops') +
                                                 1]
                    except (IndexError, KeyError, ValueError):
                        # External website (link "Shop this website"?)
                        seller_id = url_query_parameter(
                            seller_url_, 'merchantID')

            product['identifier'] = asin
            product['asin'] = asin
            domain = AmazonUrlCreator.get_domain_from_url(response.url)
            if seller_id:
                product['seller_identifier'] = seller_id
                product[
                    'url'] = AmazonUrlCreator.build_url_from_asin_and_dealer_id(
                        domain, asin, seller_id)
                product['seller_url'] = AmazonUrlCreator.build_vendor_url(
                    domain, seller_id)
            else:
                product['seller_identifier'] = None
                product['url'] = AmazonUrlCreator.build_url_from_asin(
                    domain, asin)
                product['seller_url'] = None

            shipping = result.select(
                './/span[@class="olpShippingPrice"]/text()').extract()
            if shipping:
                product['shipping_cost'] = shipping[0]

            if image_url:
                product['image_url'] = urljoin(base_url, image_url[0])

            vendor = []
            for vendor_xpath in vendor_xpaths:
                vendor = result.select(vendor_xpath).extract()
                if vendor:
                    break
            if vendor:
                vendor = vendor[0]
                # Names starting with "amazon" collapse to 'Amazon'; any
                # other seller name gets the 'AM - ' prefix.
                if vendor.lower().startswith('amazon'):
                    vendor = 'Amazon'
                else:
                    vendor = 'AM - ' + vendor
                product['vendor'] = vendor
            elif not seller_id:
                # Neither a vendor name nor a seller id was found.
                product['vendor'] = 'Amazon'
            else:
                product['vendor'] = None

            # 'unavailable' is only set (to False) when the delivery column
            # matches the stock text; otherwise the key is left absent.
            stock = result.select(
                './/div[contains(@class,"olpDeliveryColumn")]//text()').re(
                    'En Stock|En stock')
            if stock:
                product['unavailable'] = False

            products.append(product)

        next_url = hxs.select(
            '//ul[@class="a-pagination"]/li[@class="a-last"]/a/@href').extract(
            )
        next_url = urljoin(base_url, next_url[0]) if next_url else None

        current_page = hxs.select(
            '//ul[@class="a-pagination"]/li[@class="a-selected"]/a/text()'
        ).extract()
        current_page = current_page[0] if current_page else None

        return {
            'next_url': next_url,
            'current_page': current_page,
            'products': products
        }