Exemplo n.º 1
0
    def parse_product(self, response):
        """Parse a product page and yield one ``Product`` per variant.

        Three layouts are tried, in this order:

        1. a ``child-list`` table with one row per variant;
        2. a ``product-options`` drop-down whose variant data is read from
           JavaScript maps embedded in the page body;
        3. a single product without variants.

        A page without a product name is logged and skipped.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        url = urljoin_rfc(base_url, response.url)
        image_url = hxs.select('//img[@id="product-image-main"]/@src').extract()
        product_name = hxs.select('//*[@id="product-header"]//h1/text()').extract()
        if not product_name:
            log.msg('Skips product without name: ' + response.url)
            return
        product_name = product_name[0].strip()

        # The last breadcrumb is used as the category; tolerate a page without
        # breadcrumbs instead of raising IndexError.
        crumbs = hxs.select('//div[@class="crumbs"]/span/a/span/text()').extract()
        category = crumbs[-1] if crumbs else ''
        brand = hxs.select('//*[@id="product-header"]/a/img/@alt').extract()
        brand = brand[0] if brand else ''

        rows = hxs.select('//table[@class="child-list with-hover"][1]/tbody/tr')
        if rows:
            for row in rows:
                name = ''
                sku = ''
                # Reset per row: otherwise a row without a price cell would
                # silently reuse the previous row's price (or raise NameError
                # on the first row).
                price = None
                collecting_name = True
                in_stock = True
                for column in row.select('./td'):
                    ctype = ''.join(column.select('./@class').extract())
                    text = column.select('./text()').extract()
                    if ctype == 'code':
                        # Every cell before the code cell contributed to the
                        # variant name; the code cell itself is the SKU.
                        collecting_name = False
                        name = product_name + name
                        sku = text[0] if text else ''
                    if collecting_name and text:
                        name += ' - ' + text[0]
                    if ctype == 'price':
                        values = column.select('.//input/@value').extract()
                        if values:
                            price = extract_price(values[-1])
                    if ctype == 'status out-of-stock':
                        in_stock = False

                if not sku or price is None:
                    log.msg('Skips option row without code or price: ' + response.url)
                    continue

                loader = ProductLoader(item=Product(), selector=row)
                loader.add_value('identifier', sku)
                loader.add_value('url', url)
                colour = hxs.select('//li[.//td[text()="'+sku+'"]]/div[contains(@class, "colour")]/p/text()').extract()
                if colour:
                    name = name + ' ' + colour[0]
                loader.add_value('name', name)
                if image_url:
                    loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
                loader.add_value('price', price)
                loader.add_value('sku', sku)
                loader.add_value('brand', brand)
                loader.add_value('category', category)
                if not in_stock:
                    loader.add_value('stock', 0)
                # NOTE(review): this branch charges shipping at exactly 49.99
                # (<=) while the two branches below do not (<) - confirm which
                # threshold is intended.
                if price <= 49.99:
                    loader.add_value('shipping_cost', 3.95)
                else:
                    loader.add_value('shipping_cost', 0)
                yield loader.load_item()
            return

        options = hxs.select('//div[@class="product-options"]//option[not(@title="Not Selected")]')
        if options:

            def embedded_map(key):
                """Return the JSON object assigned to ``'<key>'`` in the page body."""
                return json.loads(
                    re.findall(key + r"': (\{.+?}),\n", response.body)[0])

            try:
                options_mappings = embedded_map('childMap')
                options_prices = embedded_map('prices')
                options_skus = embedded_map('skus')
                options_stocks = embedded_map('stockStatuses')
            except (IndexError, ValueError):
                # Map missing from the page (IndexError) or not valid JSON
                # (ValueError): nothing can be priced, so skip the page.
                log.msg('Skips product with unreadable options data: ' + response.url)
                return

            for option in options:
                loader = ProductLoader(item=Product(), selector=hxs)

                option_name = product_name + ' ' + option.select("./@title").extract()[0]
                option_id = option.select("./@value").extract()[0]
                option_mapping = str(options_mappings[option_id])

                option_price = extract_price(str(options_prices[option_mapping][0]['purchase']))
                option_sku = options_skus[option_mapping]
                option_stock = 0 if 'Out' in options_stocks[option_mapping] else 1

                loader.add_value('identifier', option_sku)
                loader.add_value('sku', option_sku)
                loader.add_value('url', url)
                loader.add_value('name', option_name)
                loader.add_value('price', option_price)
                loader.add_value('brand', brand)
                loader.add_value('category', category)
                loader.add_value('stock', option_stock)

                if image_url:
                    loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
                if option_price < 49.99:
                    loader.add_value('shipping_cost', 3.95)
                else:
                    loader.add_value('shipping_cost', 0)

                yield loader.load_item()
            return

        # Single product without variants.
        loader = ProductLoader(item=Product(), selector=hxs)
        sku = hxs.select('//div[@class="title"]//p/text()').extract()[0]
        sku = sku.replace('Product Code: P', '')
        loader.add_value('identifier', sku)
        loader.add_value('sku', sku)
        loader.add_value('url', url)
        loader.add_value('name', product_name)
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        price = hxs.select('//*[@id="product-price"]//input/@value').extract()[0]
        price = extract_price(price)
        loader.add_value('price', price)
        loader.add_value('brand', brand)
        loader.add_value('category', category)
        in_stock = hxs.select('//*[@id="product-stock"]/text()').extract()[0]
        if in_stock != 'In stock':
            loader.add_value('stock', 0)
        if price < 49.99:
            loader.add_value('shipping_cost', 3.95)
        else:
            loader.add_value('shipping_cost', 0)
        yield loader.load_item()
Exemplo n.º 2
0
    def parse_products(self, response):
        """Parse a product listing page.

        Yields a ``Request`` for every pagination link (handled by this same
        callback, with ``response.meta`` passed along) and a ``Product`` for
        every listed item that has an identifier and a price above zero.
        Items without an identifier are logged and dropped.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        category = response.meta.get('category', '')

        for page_url in hxs.select('//div[@class="pages"]//a/@href').extract():
            # Resolve against the page base so that a relative pagination link
            # is not handed to Request() as-is; absolute links are unchanged.
            yield Request(urljoin_rfc(base_url, page_url),
                          callback=self.parse_products,
                          meta=response.meta)

        for entry in hxs.select('//li[contains(@class, "item")]'):
            loader = ProductLoader(item=Product(), selector=entry)

            models = entry.select(
                './/p[contains(text(), "model: ")]/text()').re(r'model: (.*)')
            model = models[0].strip() if models else ''

            name = entry.select(
                './/h2[@class="product-name"]/a/text()').extract()
            name = name[0].strip() if name else ''
            loader.add_value('name', ' '.join((name, model)))

            url = entry.select(
                './/h2[@class="product-name"]/a/@href').extract()[0].strip()

            # Preferred identifier source is the price element id; the
            # compare/wishlist link is the fallback.
            identifier = entry.select(
                './/span[contains(@id, "product-price-")]/@id').re(
                    r'product-price-(\d+)')
            if not identifier:
                identifier = entry.select(
                    './/ul[@class="add-to-links"]/li/a[@class="link-compare" or @class="link-wishlist"]/@href'
                ).re('product/(.*?)/')
            if identifier:
                loader.add_value('identifier', identifier[0])
            loader.add_value('url', url.split('?')[0])

            brands = entry.select(
                './/p[contains(text(), "manufacturer: ")]/text()').re(
                    r'manufacturer: (.*)')
            if brands:
                brand = brands[0].strip()
            else:
                brand = entry.select('td[3]//text()').extract()
            loader.add_value('brand', brand)

            if model:
                loader.add_value('sku', model)

            image_url = entry.select(
                './/a[@class="product-image"]/img/@src').extract()
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))

            # Two markups are tried for the price; '0.0' makes the item fail
            # the "price > 0" check below, so it is dropped.
            prices = entry.select(
                './/span[contains(@id, "product-price-")]/span[@class="price"]/text()'
            ).extract()
            if not prices:
                prices = entry.select(
                    './/span[contains(@id, "product-price-") and contains(@class, "price")]/text()'
                ).extract()
            price = prices[0].strip() if prices else '0.0'
            loader.add_value('price', price)

            loader.add_value('category', category)

            collected_ids = loader.get_collected_values('identifier')
            if collected_ids and collected_ids[0]:
                item = loader.load_item()
                if item['price'] > 0:
                    yield item
            else:
                self.log('IDENTIFIER NOT FOUND!!! {}'.format(
                    loader.get_output_value('url')))
    def parse_product(self, response):
        """Parse a product page and yield the product or its configurable options.

        When the page embeds a ``spConfig`` JSON object, one item is yielded
        per option product: the identifier gets a ``-<option id>`` suffix, the
        option labels are appended to the name, and the price is the config's
        base price plus the option surcharges. Otherwise the base item is
        yielded once. Pages without a name or a product id are logged and
        skipped.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name = response.css('.product-name').xpath('h1/text()').extract_first()
        if name is None:
            # extract_first() returns None when nothing matches; name.lower()
            # below would otherwise raise AttributeError.
            self.log('NAME NOT FOUND!!! {}'.format(response.url))
            return
        identifier = hxs.select('//input[@name="product"]/@value').extract()
        if not identifier:
            self.log('IDENTIFIER NOT FOUND!!! {}'.format(response.url))
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        sname = name.lower()
        for brand in self.brands:
            if sname.startswith(brand):
                loader.add_value('brand', brand.title())
                break
        categories = response.css('.breadcrumbs').xpath(
            './/a/span/text()').extract()[1:]
        loader.add_value('category', categories)
        sku = hxs.select(
            '//*[@id="product_addtocart_form"]//div[@class="expert-notes "]//span[contains(text(), "SKU: ")]/text()'
        ).extract()
        sku = sku[0].replace("SKU: ", '') if sku else ''
        loader.add_value('sku', sku)
        loader.add_value('identifier', identifier[0] + '-new')
        image_url = hxs.select('//img[@id="image-main"]/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

        price = response.xpath('//script/text()').re('price":"(.+?)"')
        price = extract_price(price[0]) if price else 0
        loader.add_value('price', price)

        in_stock = hxs.select(
            '//div[@class="availability in-stock"]//div[@class="value" and contains(text(), "In stock")]'
        )
        if not in_stock:
            # Back-order items are treated as available too.
            in_stock = hxs.select(
                '//p[@class="availability back-order"]//span[@class="value" and contains(text(), "Back Order")]'
            )
        if not in_stock:
            loader.add_value('stock', 0)

        if loader.get_output_value('price') < 100:
            loader.add_value('shipping_cost', 6.50)

        item = loader.load_item()

        options_config = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)
        if not options_config:
            yield item
            return

        product_data = json.loads(options_config.group(1))
        option_names = {}
        option_prices = {}
        # .values()/.items() behave the same on Python 2 and 3, unlike
        # itervalues()/iteritems().
        for attr in product_data['attributes'].values():
            for option in attr['options']:
                for product_id in option['products']:
                    # Builds " - label1 - label2": the leading separator comes
                    # from joining with the initial empty string.
                    option_names[product_id] = ' - '.join(
                        (option_names.get(product_id, ''), option['label']))
                    option_prices[product_id] = option_prices.get(
                        product_id, 0) + extract_price(option['price'])

        base_price = extract_price(product_data['basePrice'])
        for option_identifier, option_name in option_names.items():
            option_item = deepcopy(item)
            option_item['identifier'] += '-' + option_identifier
            option_item['name'] += option_name
            option_item['price'] = base_price + option_prices[option_identifier]
            yield option_item
Exemplo n.º 4
0
    def parse_product(self, response):
        """Parse a product page and yield the product or one item per variant.

        Variants are read from ``variants[...]`` / ``names[...]`` JavaScript
        assignments in the page body. Each variant item gets the option names
        appended to its name in parentheses, the variant SKU, an identifier
        suffixed with ``:<id>.<id>...`` and the variant price. Nothing is
        yielded when the order form carries no product id.
        """
        hxs = HtmlXPathSelector(response)

        identifier = hxs.select(
            u'//form[@name="orderform"]/input[@name="productid"]/@value'
        ).extract()
        if not identifier:
            return

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name',
                                 u'//h1[@class="dialog_title"]/text()')
        product_loader.add_xpath('sku',
                                 u'//span[@class="sku-code"]/text()')
        product_loader.add_value('identifier', identifier)
        product_loader.add_xpath('price',
                                 u'//span[@id="product_price"]/text()')
        product_loader.add_value('category', response.meta.get('category'))

        # Compiled once, outside the per-line loop. Raw strings keep the
        # patterns identical to the originals without invalid string escapes.
        variant_header_re = re.compile(
            r"variants\[.*\] = \[\[([\d\.,]+),\d+,new Image.*'([^']+)'")
        variant_option_re = re.compile(r'variants\[.* = (.+);')
        option_name_re = re.compile(r'names.*\[([^\]]+)\] = "(.+)";')

        options = []  # [price, sku, [option ids]] per variant
        names = {}    # option id -> display name
        for line in response.body.split('\n'):
            m = variant_header_re.search(line)
            if m:
                options.append([m.group(1), m.group(2), []])
                continue
            m = variant_option_re.search(line)
            if m:
                # An option line can only belong to a variant already seen;
                # ignore a stray one instead of raising IndexError.
                if options:
                    options[-1][2].append(m.group(1))
                continue
            m = option_name_re.search(line)
            if m:
                names[m.group(1)] = m.group(2)

        product_loader.add_xpath(
            'brand',
            u'normalize-space(//div[contains(@class, "order-info")]/div/a/@title)'
        )
        img = hxs.select('//img[@itemprop="image"]/@src').extract()
        if img:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img[0]))

        product = product_loader.load_item()
        if not options:
            yield product
            return

        for price, sku, ids in options:
            prod = Product(product)
            prod['name'] = prod['name'] + ' (' + ' '.join(
                [names[option_id] for option_id in ids]) + ')'
            prod['sku'] = sku
            prod['identifier'] = prod['identifier'] + ':' + '.'.join(ids)
            prod['price'] = Decimal(price)
            yield prod
Exemplo n.º 5
0
    def parse_product(response):
        """Parse a product page and yield one item per option combination.

        Option groups are read from ``perms['...']`` array blocks in the page
        body. The base product is loaded once; then, for every
        ``(opt_price, opt_name)`` produced by ``multiply(opt_groups)``, a copy
        is yielded with the option name appended to the name and identifier
        and the option price added to the base price.
        """
        hxs = HtmlXPathSelector(response)

        opt_groups = []
        inside = False
        lst = ''
        for line in response.body.split('\n'):
            if line.startswith('perms[\''):
                inside = True
                lst = ''
            elif line.startswith('];'):
                if lst:
                    # SECURITY NOTE(review): eval() runs text taken from the
                    # downloaded page. Consider ast.literal_eval if the arrays
                    # only ever contain literals - left unchanged here.
                    opts = eval('[' + lst + ']')
                    # XXX http://www.thesleepshop.co.uk/acatalog/4ft6_Double_Kyoto_Memphis_Futon.html#a11717
                    # second option has "Deluxe Mattress" twice with different additional price
                    # however price calculation ignores second addition price (uses first value)
                    filtered_opts = []
                    for price, name in opts:
                        if not any(seen[1] == name for seen in filtered_opts):
                            filtered_opts.append([price, name])
                    opt_groups.append(filtered_opts)
                inside = False
            elif inside:
                lst += line

        identifier = hxs.select(
            '//form//input[contains(@name, "Q_")]/@name').re(r'Q_(.*)$')[0]

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h3[@class="product"]/text()')
        product_loader.add_xpath('name', u'//span[@class="product"]/text()')
        product_loader.add_value('sku', identifier)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('category', response.meta.get('category'))
        # Price sources are tried in order until one produces a value.
        product_loader.add_xpath(
            'price',
            u'//div[@align="left"]/div/div[contains(text(),"Now")]/text()')
        if not product_loader.get_output_value('price'):
            product_loader.add_xpath(
                'price', u'//div[@id="price_inside"]//span//text()')
        if not product_loader.get_output_value('price'):
            product_loader.add_xpath(
                'price', u'//div[@id="price_inside"]//span/@ppraw')
        if not product_loader.get_output_value('price'):
            product_loader.add_value('price', '')

        img = hxs.select(
            u'//div[@class="slides_control"]/a/img/@src').extract()
        if not img:
            img = hxs.select(
                u'//div[@class="image_product"]//img/@src').extract()
        # A page without any image used to raise IndexError on img[0].
        if img:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img[0]))

        brand_logo = hxs.select(
            u'//h3[@class="product"]/../img/@src').extract()
        if not brand_logo:
            brand_logo = hxs.select(
                u'//h3[@class="product"]/img/@src').extract()

        # Brand logo image -> brand name.
        # NOTE(review): 'fainsborough' and 'healt beds' look like typos -
        # confirm before changing, the values are emitted as data.
        brands = {
            '6thsense.jpg': '6th sense',
            'bentley.gif': 'bentley',
            'birlea.gif': 'birlea',
            'blank.gif': '',
            'brand': '',
            'Breasley.gif': 'breasley',
            'buoyant.jpg': 'buoyant',
            'cro.gif': 'cro',
            'cumfilux.gif': 'cumfilux',
            'dt.gif': 'dt',
            'dunlopillo.gif': 'dunlopillo',
            'durabeds.gif': 'durabeds',
            'easycomfort.gif': 'easy comfort',
            'friendship_mill.gif': 'friendship mill',
            'Furmanac.gif': 'furmanac',
            'gainsborough.gif': 'fainsborough',
            'gleneagle.gif': 'gleneagle',
            'harlequin.gif': 'harlequin',
            'harmony.gif': 'harmony',
            'healthbeds.gif': 'healt beds',
            'highgate.gif': 'highgate',
            'hypnos.gif': 'hypnos',
            'jay-be.gif': 'jay be',
            'julianbowenlogo.jpg': 'julian bowen',
            'kaymed.gif': 'kaymed',
            'komfi.gif': 'komfi',
            'kyoto.gif': 'kyoto',
            'limelight.gif': 'limelight',
            'metalbeds.gif': 'metalbeds',
            'millbrook.gif': 'millbrook',
            'myers.gif': 'myers',
            'nd.gif': 'newdesign',
            'nestledown.gif': 'nestledown',
            'obc.gif': 'original bedstead',
            'Protectabed.gif': 'protectabed',
            'rauch.gif': 'rauch',
            'relaxsan.gif': 'relaxsan',
            'relyon.gif': 'relyon',
            'rest_assured.gif': 'rest assured',
            'richman.gif': 'richman',
            'sealy.gif': 'sealy',
            'shakespeare.gif': 'shakespeare',
            'silentnight.gif': 'silentnight',
            'sleepeezee.gif': 'sleepeezee',
            'sleepshaper.gif': 'sleepshaper',
            'sleepyvalley.gif': 'sleepyvalley',
            'slumberland.gif': 'slumberland',
            'staples.gif': 'staples',
            'steens.gif': 'steens',
            'swanglen.gif': 'swanglen',
            'sweetdreams.gif': 'sweetdreams',
            'tss.gif': 'the sleep shop',
            'verona.jpg': 'verona',
            'welcome.gif': 'welcome furniture',
        }
        # A page without a brand logo used to raise IndexError on
        # brand_logo[0]; the brand is now simply left unset.
        if brand_logo:
            product_loader.add_value(
                'brand', brands.get(brand_logo[0],
                                    remove_extension(brand_logo[0])))
        product = product_loader.load_item()
        for opt_price, opt_name in multiply(opt_groups):
            prod = Product(product)
            prod['name'] = (prod['name'] + ' ' + opt_name).strip()
            try:
                prod['price'] = (Decimal(prod['price']) +
                                 Decimal(opt_price)).quantize(Decimal('1.00'))
            except TypeError:
                prod['price'] = Decimal(0)
            prod['identifier'] = prod['identifier'] + ':' + opt_name
            yield prod
Exemplo n.º 6
0
class RDGToolsSpider(BaseSpider):
    """Spider for rdgtools.co.uk.

    ``parse`` walks category and sub-category pages, follows links to product
    pages, and scrapes products that are listed inline on a category page.
    ``parse_product`` scrapes a single product page.
    """
    name = 'rdgtools.co.uk'
    allowed_domains = ['rdgtools.co.uk', 'www.rdgtools.co.uk']
    start_urls = (u'http://www.rdgtools.co.uk/',)

    # Pound sign followed by the amount; shared by both callbacks.
    _PRICE_RE = u'\xa3([\\d\\.,]+)'

    def _start_requests(self):
        """Yield a request for one fixed product page.

        NOTE(review): not called anywhere in this class; presumably a
        debugging entry point - confirm before removing.
        """
        yield Request('http://www.rdgtools.co.uk/acatalog/Proxxon-Drilling---Grinding-Bits.html',
                      callback=self.parse_product)

    def parse(self, response):
        """Parse a category page.

        Yields requests for categories, sub-categories and product pages, and
        a ``Product`` for every inline product whose id was not already seen
        among the product-page links on this page.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Ids of products that have their own page, so the inline listing
        # below does not emit them a second time.
        parsed_ids = set()

        for cat in hxs.select('//*[@id="sidebar"]//li[@class="sections-list"]/a/@href').extract():
            yield Request(url=urljoin_rfc(base_url, cat))

        for subcat in hxs.select('//*[@id="ContentPage"]//span[@class="boxheading"]/a/@href').extract():
            yield Request(url=urljoin_rfc(base_url, subcat))

        product_links = (
            hxs.select('//div[@id="content"]//form//a[@class="read-more"]/@href').extract() +
            hxs.select('//div[@id="content"]//form//h1/../../a/@href').extract())
        for url in product_links:
            if not url:
                continue
            try:
                # The id is the last dash-separated token before the extension.
                pid = url.split('.')[-2].split('-')[-1]
                request = Request(urljoin_rfc(base_url, url), callback=self.parse_product)
            except (IndexError, ValueError):
                # Best effort: skip links that carry no id or are not valid URLs.
                continue
            parsed_ids.add(pid)
            # Yield outside the try block so that closing the generator
            # (GeneratorExit) is never swallowed by an exception handler.
            yield request

        category = hxs.select('//*[@id="ContentPage"]/p[@class="text_breadcrumbs"]/a/text()').extract()[1:]

        for product in hxs.select('//*[@id="ContentPage"]//div[@class="col-xs-12"]'):
            pid = product.select('.//input[contains(@name, "Q_")]/@name').re(r'Q_(.+)')
            if not pid:
                continue
            pid = pid[0]
            if pid in parsed_ids:
                continue
            parsed_ids.add(pid)

            name = ''.join(product.select('.//h1//text()').extract())
            url = response.url
            image_url = product.select('.//img[@class="catalog-image"]/@src').extract()
            image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
            price = product.select('.//span[@class="catalog-price" or @class="product-price"]/text()').re(self._PRICE_RE)
            if price:
                price = price[0]
            else:
                # Same handling as parse_product: warn and fall back to 0
                # instead of raising IndexError and losing the rest of the page.
                log.msg(">>> WARNING!!! NO PRICE >>> %s >>> %s" % (name, url))
                price = 0

            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('sku', pid)
            loader.add_value('identifier', pid)
            loader.add_value('category', category)
            loader.add_value('image_url', image_url)
            loader.add_value('price', price)
            yield loader.load_item()

    def parse_product(self, response):
        """Parse a product page and yield a single ``Product``.

        Pages without a ``Q_...`` input (used as SKU and identifier) are
        logged and skipped; a missing price is logged and stored as 0.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        category = hxs.select('//*[@id="idBreadcrumbsTop"]/p[@class="text_breadcrumbs"]/a/text()').extract()[1:]

        url = response.url
        sku = hxs.select('//input[contains(@name, "Q_")]/@name').re(r'Q_(.+)')
        if not sku:
            self.log('NO SKU %s' % url)
            return
        sku = sku[0]

        names = hxs.select(u'//div[@id="product-page-body"]//h1/text()').extract()
        name = ' '.join(names)

        image_url = hxs.select(u'//div[@id="product-page-body"]//img[@class="img-responsive catalog-image"]/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else u''

        price = hxs.select('//div[@id="product-page-body"]//span[@class="catalog-price"]/text()').re(self._PRICE_RE)
        if not price:
            price = hxs.select('//div[@id="product-page-body"]//span[@class="product-price"]/text()').re(self._PRICE_RE)
        if price:
            price = price[0]
        else:
            log.msg(">>> WARNING!!! NO PRICE >>> %s >>> %s" % (name, url))
            price = 0

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', url)
        loader.add_value('name', name)
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('price', price)
        yield loader.load_item()
Exemplo n.º 7
0
    def parse_product(response):
        """Parse a product page and yield one item per size/colour variant.

        * With more than one size ``<option>`` (the first is skipped), one item
          is yielded per size x colour combination. A ``( +$N)`` surcharge in
          the size label is added to the price and removed from the name.
        * Otherwise one item per colour, or a single item when there are no
          colours either.
        """
        hxs = HtmlXPathSelector(response)

        product_name = ''.join(
            hxs.select('//div[@class="web_pro_detail_title"]/h1/text()').
            extract()).strip()
        identifier = hxs.select(
            '//*[@id="product_addtocart_form"]//input[@name="product"]/@value'
        ).extract()[0]
        sku = hxs.select(
            '//span[@class="sku_block"]/text()').extract()[0].strip()
        img = hxs.select('//div[@class="pro_img"]//img/@src').extract()
        category = response.meta.get('category')
        price = hxs.select('//span[@id="product-price-{}"]/text()'.format(
            identifier)).extract()
        if not price:
            price = hxs.select(
                '//*[@id="product_addtocart_form"]//span[@class="price"]'
            ).extract()
        price = extract_price(price[0])

        def build_item(item_identifier, item_name, item_price):
            """Load a Product sharing this page's sku, url, image and category."""
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('identifier', item_identifier)
            loader.add_value('sku', sku)
            loader.add_value('url', response.url)
            loader.add_value('name', item_name)
            loader.add_value('price', item_price)
            if img:
                loader.add_value(
                    'image_url', urljoin_rfc(get_base_url(response), img[0]))
            loader.add_value('category', category)
            return loader.load_item()

        def colour_pairs():
            """Return ``[id, name]`` for every colour swatch on the page."""
            return [[colour.select('./@valueid').extract()[0],
                     colour.select('./@val').extract()[0]]
                    for colour in
                    hxs.select('//div[@id="colour_options_hidden"]//img')]

        sizes = hxs.select(
            '//select[@class=" product-custom-option sizecc"]/option')
        if len(sizes) > 1:
            surcharge_re = re.compile(r"\( \+\$([\d.]+)\)")
            size_variations = [[size.select('./@value').extract()[0],
                                size.select('./text()').extract()[0]]
                               for size in sizes[1:]]
            # NOTE(review): when the page has sizes but no colour swatches the
            # product below is empty and nothing is yielded - confirm intended.
            for size, colour in itertools.product(size_variations,
                                                  colour_pairs()):
                size_id, size_name = size
                colour_id, colour_name = colour
                surcharge = surcharge_re.search(size_name)
                if surcharge:
                    add_price = extract_price(surcharge.group(1))
                    # The original replaced the literal text '( +${})', an
                    # unfilled format placeholder, so the surcharge was never
                    # actually removed from the name.
                    size_name = surcharge_re.sub('', size_name).strip()
                else:
                    add_price = extract_price('0')
                yield build_item(
                    identifier + '_' + size_id + '_' + colour_id,
                    product_name + ' ' + size_name + ' ' + colour_name,
                    price + add_price)
            return

        colours = colour_pairs()
        if colours:
            for colour_id, colour_name in colours:
                # No separator here, unlike the size/colour identifiers above;
                # kept as-is so existing identifiers stay stable.
                yield build_item(identifier + colour_id,
                                 product_name + ' ' + colour_name,
                                 price)
        else:
            yield build_item(identifier, product_name, price)
Exemplo n.º 8
0
    def parse_product(self, response):
        """Yield a Product for every option combination on a product page.

        Page-level fields (image, name, category, brand, price, SKU) are read
        once and shared by every yielded item. Each combination of the
        ``ctaselector`` option groups gets its own name and identifier, built
        by appending the option names / ids to the base values. A page with
        no option groups yields a single item for the base product.

        If the inline ``product_id`` script variable is missing, the page URL
        is requested again and nothing else is yielded.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        image_src = selector.select(
            '//span[@class="mainimage"]//img/@src').extract()
        image_url = urljoin_rfc(page_base, image_src[0]) if image_src else ''

        id_matches = selector.select('//script/text()').re(
            'var product_id *= *(.+);')
        if not id_matches:
            # NOTE(review): the re-request has no explicit callback and no
            # retry limit -- confirm which callback handles it and that it
            # cannot loop forever.
            yield Request(response.url, dont_filter=True)
            return
        base_identifier = id_matches[0]

        base_name = selector.select(
            '//h1[@itemprop="name"]/text()').extract()[0].strip()
        # The first breadcrumb link is dropped.
        category = selector.select(
            '//div[@id="breadcrumbs"]//a/text()').extract()[1:]
        brand_matches = re.findall("'brand': '(.*)',", response.body)
        brand = brand_matches[0].strip() if brand_matches else ''
        raw_price = selector.select('//script/text()').re(
            "'price' *: *'(.+?)'")[0]
        product_price = extract_price(raw_price)
        sku = selector.select('//span[@class="mpn"]//text()').re(
            'Product code: *(.+)')

        # One list of {'identifier', 'name'} choices per non-empty selector.
        option_groups = []
        for group in selector.select('//div[@class="ctaselector"]'):
            option_ids = group.select('.//li/a/@id').extract()
            option_titles = group.select('.//li/a/span/text()').extract()
            choices = [{'identifier': option_id, 'name': title}
                       for option_id, title in zip(option_ids, option_titles)]
            if choices:
                option_groups.append(choices)

        # itertools.product() over zero groups yields exactly one empty
        # tuple, so a page without options still produces the base product.
        for combination in itertools.product(*option_groups):
            name = base_name
            identifier = base_identifier
            for choice in combination:
                name += ' ' + choice['name']
                identifier += '_' + choice['identifier']

            loader = ProductLoader(item=Product(), selector=selector)
            loader.add_value('identifier', identifier)
            loader.add_value('name', name)
            if image_url:
                loader.add_value('image_url', image_url)
            loader.add_value('price', product_price)
            # Shipping is only charged on prices below 50.
            if loader.get_output_value('price') < 50:
                loader.add_value('shipping_cost', 3.95)
            loader.add_value('url', response.url)
            loader.add_value('brand', brand)
            loader.add_value('sku', sku)
            loader.add_value('category', category)
            yield loader.load_item()
Exemplo n.º 9
0
    def parse_product(self, response):
        """Yield Product items scraped from a product page.

        When the options drop-down has entries, one item is yielded for every
        option after the first one, identified as
        ``<product id>_<option value>`` and marked out of stock when the
        option carries a ``disabled`` attribute. Otherwise a single item is
        yielded for the page itself, marked out of stock when the
        ``errorMsg`` element contains the text ``OUT OF STOCK``.

        Items priced below 75 get a shipping cost of 3.99. The brand is taken
        from ``response.meta``.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        image_url = hxs.select('//*[@id="zoomImage"]/@src').extract()
        product_identifier = hxs.select(
            '//input[@name="shopListrec"]/@value').extract()[0].strip()
        product_name = hxs.select(
            '//h1[contains(@class, "prodTitle")]/text()').extract()[0]
        category = hxs.select(
            '//p[@class="crumbsDN"]//a/span/text()').extract()
        brand = response.meta.get('brand')
        price = Decimal(
            response.xpath(
                '//meta[@itemprop="price"]/@content').extract_first())
        sku = hxs.select('//span[@itemprop="productID"]/text()').extract()[0]
        page_stock_messages = hxs.select(
            '//*[@id="errorMsg"]/text()').extract()

        def build_loader(identifier, name):
            """Return a loader filled with every field except ``stock``."""
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('identifier', identifier)
            loader.add_value('name', name)
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
            loader.add_value('price', price)
            loader.add_value('url', response.url)
            loader.add_value('brand', brand)
            loader.add_value('category', category)
            loader.add_value('sku', sku)
            if price < 75:
                loader.add_value('shipping_cost', 3.99)
            return loader

        options = hxs.select('//*[@id="options"]//select/option')
        if len(options) > 1:
            self.log('More options!! {}'.format(response.url))

        if options:
            # NOTE(review): the first <option> is skipped -- presumably a
            # placeholder entry; confirm against the site markup.
            for option in options[1:]:
                option_id = option.select('./@value').extract()[0]
                # Only the text before the first '-' is used as the label.
                option_name = option.select('./text()').extract()[0].split(
                    '-')[0].strip()
                product_loader = build_loader(
                    product_identifier + '_' + option_id,
                    product_name + ' ' + option_name)
                if option.select('./@disabled').extract():
                    product_loader.add_value('stock', 0)
                yield product_loader.load_item()
        else:
            product_loader = build_loader(product_identifier, product_name)
            # extract() returns a list of text nodes, so each node is
            # compared; comparing the list itself to a string is always
            # False and the item was never marked out of stock.
            if any(message.strip() == 'OUT OF STOCK'
                   for message in page_stock_messages):
                product_loader.add_value('stock', 0)
            yield product_loader.load_item()
Exemplo n.º 10
0
 def parse(self, response):
     """Request every link in the ``modelLevelHub`` list.

     Each link is resolved against the page base URL and handed to
     ``parse_products``.
     """
     root = get_base_url(response)
     selector = HtmlXPathSelector(response)
     hrefs = selector.select('//*[@id="modelLevelHub"]/li/a/@href').extract()
     for href in hrefs:
         yield Request(urljoin_rfc(root, href), callback=self.parse_products)
Exemplo n.º 11
0
    def parse_product(self, response):
        """Scrape one product page and yield its item or a shipping request.

        First yields a request (handled by this same callback, with
        redirects disabled) for every value in the variant select box. Then
        builds the product, attaches a ``SonaeMeta`` metadata record with
        promotion and stock details, and either yields a request for the
        shipping AJAX endpoint (carrying the product in ``meta``) or, when
        the page has no shipment id, yields the product itself.

        Nothing beyond the variant requests is yielded when the page has no
        ``productId`` input.
        """
        base_url = get_base_url(response)

        hxs = HtmlXPathSelector(response)

        # Queue the sibling variant pages listed in the select box.
        for url in hxs.select(
                '//div[@class="stretch clearfix box"]/select/option/@value'
        ).extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product,
                          meta={'dont_redirect': True})
        loader = ProductLoader(item=Product(), selector=hxs)
        # NOTE(review): the name is indexed before the identifier guard
        # below, so a page without a name raises IndexError instead of being
        # skipped -- confirm this is acceptable.
        name = hxs.select(
            '//*[@id="centerC"]/h1/span[@itemprop="name"]/text()').extract()[0]
        loader.add_value('name', name)
        identifier = hxs.select(
            '//div[@class="pd-container-right"]//form[@class="addBasketItem"]//input[@name="productId"]/@value'
        ).extract()
        if not identifier:
            return
        loader.add_value('identifier', identifier[0])
        loader.add_value('url', response.url)
        price = hxs.select('//noscript/span/text()').extract()
        # Falls back to the string '0' when no price text is found.
        price = extract_price(price[0]) if price else '0'
        loader.add_value('price', price)
        stock = hxs.select('//*[@id="first3"]/p/span/text()').extract()
        stock = stock[0] if stock else ''
        # The first link of the info block is dropped.
        categories = hxs.select(
            '//*[@id="infoblock"]/div/a/text()').extract()[1:]
        for category in categories:
            loader.add_value('category', category)
        brand = hxs.select('//div[@class="pd-brand box"]/a/img/@alt').extract()
        brand = brand[0] if brand else ''
        loader.add_value('brand', brand)
        image_url = hxs.select('//*[@id="showPic"]/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
        loader.add_value('image_url', image_url)
        product = loader.load_item()

        # The struck-through regular price is only present on promotions.
        promotion_price = hxs.select(
            u'//p[contains(text(), "Preço Regular")]/strike/text()').re(
                r'[\d,.]+')
        metadata = SonaeMeta()
        metadata['exclusive_online'] = 'No'
        if promotion_price:
            # Drop '.' and turn ',' into '.' in the matched price text.
            metadata['promotion_price'] = promotion_price[0].replace(
                '.', '').replace(',', '.')
        metadata['stock'] = stock

        # Metadata previously stored for this identifier, if any.
        if self.meta_df is not None and not self.meta_df.empty and identifier[
                0] in self.meta_df.index:
            prev_meta = self.meta_df.loc[identifier[0]]
        else:
            prev_meta = {}
        promo = promotion_price
        promo_start = prev_meta.get('promo_start')
        promo_end = prev_meta.get('promo_end')
        today = datetime.datetime.now().strftime('%Y-%m-%d')

        metadata['extraction_timestamp'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M')
        if promo:
            # Promotion active: keep the stored start date while the
            # promotion is still open (no end recorded), otherwise start a
            # new promotion today. The end date is cleared.
            metadata[
                'promo_start'] = promo_start if promo_start and not promo_end else today
            metadata['promo_end'] = ''
        else:
            # No promotion now: if one was recorded, keep its start and
            # close it today unless an end date is already stored.
            if promo_start:
                metadata['promo_start'] = promo_start
                metadata['promo_end'] = today if not promo_end else promo_end

        product['metadata'] = metadata

        shipping_pid = hxs.select(
            '//span[@id="shipmentDetails"]/@data-productid').extract()
        if shipping_pid:
            shipping_url = 'https://www.redcoon.pt/req/ajax/mod/ShopShipment/pid/' + shipping_pid[
                0]
            headers = {
                'X-Requested-With': 'XMLHttpRequest',
            }
            # The product travels in meta; parse_shipping receives it there.
            yield Request(shipping_url,
                          headers=headers,
                          callback=self.parse_shipping,
                          meta={'product': product})
        else:
            yield product
Exemplo n.º 12
0
 def parse_item(self, response):
     """Request every product link embedded in the response body.

     The pattern targets markup whose quotes and slashes are
     backslash-escaped; each captured path is resolved against the
     cyberport.de root and handed to ``parse_product``.
     """
     link_pattern = re.compile(
         r'<a class=\\"hint\\" title=\\"Weitere Informationen zum Produkt\\" href=\\"\\/(.*?)\\"')
     for hit in link_pattern.finditer(response.body):
         product_path = hit.group(1)
         target = urljoin_rfc('http://www.cyberport.de/', product_path)
         yield Request(target, callback=self.parse_product)
Exemplo n.º 13
0
 def parse_product_list(self, response):
     """Build a partial Product per listing tile and request its detail page.

     Tiles without a name, without a numeric ``<id>.html`` link, or whose
     id is already in ``self.ids`` are skipped. Each remaining tile yields
     a request to ``parse_product`` carrying the partial product and a
     fresh cookiejar number in ``meta``.
     """
     selector = HtmlXPathSelector(response)
     root_url = get_base_url(response)
     tiles = selector.select('//div[contains(@class, "product producttile")]')
     for tile in tiles:
         loader = ProductLoader(item=Product(), selector=tile)

         titles = tile.select('.//div[@class="name"]/a/text()').extract()
         if not titles:
             continue
         title = titles[0].strip()

         image_src = tile.select('.//img[@id="firimg"]/@src').extract()[0]
         loader.add_value('image_url', urljoin_rfc(root_url, image_src))
         loader.add_value('name', title)
         link = tile.select('.//div[@class="name"]/a/@href').extract()[0]
         loader.add_value('url', link)

         # The numeric part of "<id>.html" is both identifier and SKU.
         id_match = re.search(r"(\d+)\.html", link)
         if not id_match:
             continue
         identifier = id_match.group(1)
         if identifier in self.ids:
             continue
         self.ids.append(identifier)
         loader.add_value('identifier', identifier)
         loader.add_value('sku', identifier)

         raw_price = tile.select(
             './/div[@class="salesprice"]/text()').extract()[0]
         loader.add_value('price', extract_price(raw_price))

         # Use the tile's capacity/type text when no category came with
         # the request, or when the request is flagged 'full'.
         category = response.meta.get('category', '')
         if not category or response.meta.get('full'):
             capacity_text = tile.select(
                 './/div[@class="capacityType"]/text()').extract()
             if capacity_text:
                 pieces = capacity_text[0].split(u'\u2022')
                 tile_category = pieces[1] if len(pieces) > 1 else pieces[0]
                 if tile_category.strip():
                     category = tile_category
         if category:
             # Diageo have requested that we group all categories with 'Whisky' or 'Whiskey' in the name into one category named 'Whisky'
             # https://www.assembla.com/spaces/competitormonitor/tickets/2254
             lowered = category.lower()
             if 'whisky' in lowered or 'whiskey' in lowered:
                 category = 'Whisky'
             loader.add_value('category', category.strip())

         # Expand the abbreviations once, then take the first known brand
         # contained in the name.
         searchable_name = title.replace('A&J', 'Alexander & James').replace(
             u'C\xeeroc', 'Ciroc')
         for brand in self.brands:
             if brand in searchable_name:
                 loader.add_value('brand', brand)
                 break

         item = loader.load_item()
         self.jar_counter += 1
         request_meta = {'product': item, 'cookiejar': self.jar_counter}
         yield Request(link,
                       callback=self.parse_product,
                       meta=request_meta,
                       cookies={})
Exemplo n.º 14
0
    def parse_product(self, response):
        """Yield priced items or per-variant price requests for a product.

        When the page embeds a ``spConfig`` JSON blob, one partial Product is
        built per variant and a ``FormRequest`` to the ``servingsinfo``
        endpoint is yielded for each (handled by ``parse_price``, product in
        ``meta``). No price is set on those partial products here.

        Without ``spConfig`` a single Product is yielded with the price from
        the page, plus a shipping cost of 3.95 when the price is below 75.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        image_url = hxs.select('//*[@id="image"]/@src').extract()
        try:
            product_identifier = hxs.select(
                '//input[@name="product"]/@value').extract()[0].strip()
        except IndexError:
            # No hidden "product" input: fall back to the id embedded in the
            # add-to-cart form action. Only the missing-element case is
            # handled, so unrelated errors are no longer swallowed.
            product_identifier = hxs.select(
                '//form[@id="product_addtocart_form"]/@action').re(
                    r'/product/(\d+)')[0]
        product_name = hxs.select('//*[@id="productname"]/text()').extract()[0]
        # The first breadcrumb link is dropped.
        category = hxs.select(
            '//div[@class="breadcrumbs"]//a/text()').extract()[1:]
        sku = product_identifier

        options_config = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)
        if options_config:
            product_data = json.loads(options_config.group(1))
            # variant id -> labels joined with ' ' (leading space included)
            products = {}
            # variant id -> list of {'attr_id', 'val'} selecting the variant
            attributes = {}
            for attr_id, attr in product_data['attributes'].iteritems():
                for option in attr['options']:
                    for product in option['products']:
                        products[product] = ' '.join(
                            (products.get(product, ''), option['label']))
                        attributes.setdefault(product, []).append({
                            'attr_id': attr_id,
                            'val': option['id'],
                        })

            for identifier, option_name in products.iteritems():
                product_loader = ProductLoader(item=Product(), selector=hxs)
                product_loader.add_value('identifier',
                                         product_identifier + '_' + identifier)
                product_loader.add_value('name', product_name + option_name)
                if image_url:
                    product_loader.add_value(
                        'image_url', urljoin_rfc(base_url, image_url[0]))
                product_loader.add_value('url', response.url)
                product_loader.add_value('category', category)
                product_loader.add_value('sku', sku)
                product_loader.add_value('brand', response.meta.get('brand'))

                product = product_loader.load_item()
                form_data = {'product': product_identifier, 'billing_qty': '1'}
                for attr in attributes[identifier]:
                    form_data['super_attribute[{}]'.format(
                        attr['attr_id'])] = str(attr['val'])
                # dont_filter: every variant posts to the same URL.
                yield FormRequest(
                    url='http://www.musclefood.com/billing/ajax/servingsinfo/',
                    formdata=form_data,
                    meta={'product': product},
                    callback=self.parse_price,
                    dont_filter=True)

        else:
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('identifier', product_identifier)
            product_loader.add_value('name', product_name)
            if image_url:
                product_loader.add_value('image_url',
                                         urljoin_rfc(base_url, image_url[0]))
            price = ''.join(
                hxs.select('//span[@class="price"]/text()').extract()).strip()
            price = extract_price(price)
            product_loader.add_value('price', price)
            product_loader.add_value('url', response.url)
            product_loader.add_value('category', category)
            product_loader.add_value('sku', sku)
            product_loader.add_value('brand', response.meta.get('brand'))
            if price < 75:
                product_loader.add_value('shipping_cost', 3.95)
            product = product_loader.load_item()
            yield product
Exemplo n.º 15
0
    def parse_product(response):
        """Yield Product items for a product page, one per size option.

        NOTE(review): the signature takes only ``response`` (no ``self``)
        although the block sits at method indentation -- presumably it is
        wrapped as a static method outside this view; confirm.

        When the page embeds a ``productData`` script variable, the size
        options (all but the first ``<option>``) are paired in order with
        the decoded ``items`` entries and one Product is yielded per pair,
        marked out of stock when the option is ``disabled``. Otherwise a
        single Product is yielded from the page summary, or nothing when the
        name or product id is missing.

        A ``qbDiscount`` query parameter on the URL is applied to every
        price as a percentage, and the result is rounded to two places.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        discount = url_query_parameter(response.url, 'qbDiscount')

        def apply_discount(amount):
            """Return ``amount`` reduced by the ``qbDiscount`` percentage."""
            if not discount:
                return amount
            # Build the rate from an int so it stays exact. Going through a
            # float (e.g. Decimal(5 / 100.0)) carries binary rounding error
            # into the price and can flip the final cent.
            rate = decimal.Decimal(int(discount)) / 100
            return round(amount - rate * amount, 2)

        match = re.search("var productData = (.*?)</script>", response.body,
                          re.DOTALL | re.IGNORECASE)
        if match:
            result = match.group(1)
            prod_name = hxs.select(
                '//*[@id="wrapper_page_content"]//h1/text()').extract()[0]
            sku = hxs.select(
                '//*[@id="product_tab_1"]//li[@class="product_code"]/span/text()'
            ).extract()[0]
            image_url = hxs.select(
                '//*[@id="product_view_full"]/@href').extract()
            category = hxs.select(
                '//*[@id="nav_breadcrumb"]//a/span/text()').extract()[1:]
            product_identifier = hxs.select(
                '//*[@id="productId"]/@value').extract()[0]
            options_prices = demjson.decode(result)['items']
            # NOTE(review): the first <option> is skipped -- presumably a
            # placeholder; the rest are assumed to line up with 'items'.
            options = hxs.select('//*[@id="product_size_full"]/option')[1:]
            for option, price_data in zip(options, options_prices):
                product_loader = ProductLoader(item=Product(), selector=hxs)
                name = option.select('./text()').extract()[0]
                identifier = option.select('./@value').extract()[0]
                product_loader.add_value('identifier',
                                         product_identifier + '_' + identifier)
                product_loader.add_value('name', prod_name + ' ' + name)
                product_loader.add_value('sku', sku)
                if image_url:
                    product_loader.add_value(
                        'image_url', urljoin_rfc(base_url, image_url[0]))
                price = extract_price(str(price_data['nowprice']))
                product_loader.add_value('price', apply_discount(price))
                product_loader.add_value(
                    'category',
                    transform_category(product_loader.get_output_value('name'),
                                       category))
                product_loader.add_value('url', response.url)
                out_of_stock = option.select('./@disabled').extract()
                if out_of_stock:
                    product_loader.add_value('stock', 0)
                yield product_loader.load_item()
        else:
            product_loader = ProductLoader(item=Product(), selector=hxs)
            name = hxs.select(
                '//*[@id="wrapper_page_content"]//h1/text()').extract()
            if not name:
                return
            identifier = hxs.select('//*[@id="productId"]/@value').extract()
            if not identifier:
                return
            product_loader.add_value('identifier', identifier[0])
            product_loader.add_value('name', name[0])
            sku = hxs.select(
                '//*[@id="product_tab_1"]//li[@class="product_code"]/span/text()'
            ).extract()[0]
            product_loader.add_value('sku', sku)
            image_url = hxs.select(
                '//*[@id="product_view_full"]/@href').extract()
            if image_url:
                product_loader.add_value('image_url',
                                         urljoin_rfc(base_url, image_url[0]))
            price = hxs.select(
                '//ul[@class="product_summary"]/li[@class="product_price"]/span/text()'
            ).extract()[0]
            price = extract_price(price)
            product_loader.add_value('price', apply_discount(price))
            category = hxs.select(
                '//*[@id="nav_breadcrumb"]//a/span/text()').extract()[1:]
            product_loader.add_value(
                'category',
                transform_category(product_loader.get_output_value('name'),
                                   category))
            product_loader.add_value('url', response.url)
            yield product_loader.load_item()
Exemplo n.º 16
0
 def parse(self, response):
     """Request every product-image link on the page.

     Each link is resolved against the page base URL and handed to
     ``parse_product``.
     """
     selector = HtmlXPathSelector(response)
     root = get_base_url(response)
     hrefs = selector.select(
         '//figure[@class="produit-image"]/a/@href').extract()
     for href in hrefs:
         yield Request(urljoin_rfc(root, href), callback=self.parse_product)
Exemplo n.º 17
0
    def parse_product(self, response):
        """Yield one Product per SKU option listed on a product page.

        Options come from the ``dropdownSkuCtrl_nojs`` drop-down and are
        matched by value against the ``arrSku`` objects found in the page
        scripts. Options without a ``skucode``, or without a matching script
        entry, are skipped. The option label is built from the script
        entry's colour and size; when both are empty the option's own text
        is used, unless the selector header mentions Color or Size, in which
        case the option is skipped.

        Each item carries a ``CRCMeta`` record whose ``rrp`` is the retail
        price when it is higher than the selling price, else ''.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        product_identifier = hxs.select('//@prodid').extract()
        if product_identifier:
            product_identifier = product_identifier[0].strip()
        else:
            # NOTE(review): the return value of self.retry() is discarded;
            # if it returns a Request it should probably be yielded.
            self.retry(response, "Cant find identifier on " + response.url)
            return
        default_image = hxs.select(
            '//div[@class="prod-images-container"]//img/@src').extract()
        product_name = hxs.select(
            '//div[@class="product-name"]//h1/text()').extract()[0].strip()
        category = hxs.select('//*[@id="bcrumb"]/a[2]/text()').extract()
        category = category[0].strip() if category else ''
        brand = hxs.select(
            '//div[contains(@class, "product-brand-icon")]/a/img/@alt'
        ).extract()
        brand = brand[0].strip() if brand else ''
        # extract_first() returns None when the header is absent; fall back
        # to '' so the membership tests below cannot raise TypeError.
        option_title = response.css('.prod-selector-section-hdr').xpath(
            'text()').extract_first() or ''
        size_or_color = 'Color' in option_title or 'Size' in option_title
        options = response.xpath('//div[@id="dropdownSkuCtrl_nojs"]//option')
        # skuID -> decoded script object; each object is decoded once.
        js_options = {}
        for raw_option in response.xpath(
                '//script[contains(., "arrSku")]/text()').re('({.+?});'):
            decoded = demjson.decode(raw_option)
            js_options[decoded['skuID']] = decoded

        for option in options:
            product_loader = ProductLoader(item=Product(), selector=option)
            sku = option.select('./@skucode').extract()
            if not sku:
                continue
            sku = sku[0]
            product_loader.add_value('sku', sku)
            identifier = option.select('./@value').extract()[0]
            if identifier not in js_options:
                continue
            product_loader.add_value('identifier',
                                     product_identifier + '_' + identifier)

            js_option = js_options[identifier]
            option_name = js_option['color'].title(
            ) + ', ' + js_option['size'].title()
            # Drop the separator when the colour or the size is empty.
            if option_name.startswith(', '):
                option_name = option_name[2:]
            if option_name.endswith(', '):
                option_name = option_name[:-2]
            if not option_name:
                if size_or_color:
                    continue
                option_name = option.select('./text()').extract()[0].strip()
            product_loader.add_value('name', product_name + ', ' + option_name)

            # Start from the page image for every option, so one option's
            # colour image no longer leaks into later options that have no
            # attcolor attribute.
            option_image = default_image
            attcolor = option.select('./@attcolor').extract()
            if attcolor:
                option_image = hxs.select(
                    '//div[@attcolor="{}"]/@lrgimg'.format(
                        attcolor[0])).extract()
            if option_image:
                product_loader.add_value(
                    'image_url', urljoin_rfc(base_url, option_image[0]))

            price = extract_price(option.select('./@adjdefprice').extract()[0])
            product_loader.add_value('price', price)
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)
            skustatus = option.select('./@skustatus').extract()[0]
            if skustatus == 'INVUVL':
                product_loader.add_value('stock', 0)
            product = product_loader.load_item()

            rrp = option.select('./@retailprice').extract()
            rrp = extract_price(rrp[0])
            rrp = str(rrp) if rrp > price else ''
            metadata = CRCMeta()
            metadata['rrp'] = rrp
            product['metadata'] = metadata

            yield product
Exemplo n.º 18
0
    def parse_product(self, response):
        """Yield one Product per product described in the ``item_Tbl`` rows.

        The table is read row by row: the first cell is a label and the
        second cell its value. A ``Name:`` row starts a new product; the
        rows that follow fill it in until the next ``Name:`` row (or the end
        of the table), at which point the product is yielded with a
        ``CRCMeta`` record holding the RRP.

        Category and brand are taken from ``response.meta``.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        image_url = hxs.select('//img[@style="cursor : hand;"]/@src').extract()
        category = response.meta.get('category_name')
        brand = response.meta.get('brand', '')
        # NOTE(review): rrp is only updated by 'Price:' rows and never reset,
        # so a product without a 'Price:' row inherits the previous RRP.
        rrp = ''

        product_rows = hxs.select('//*[@id="item_Tbl"]//tr')

        first_row = True
        product_name = None
        for row in product_rows:
            # Label of the row (all text of the first cell, stripped).
            name = ''.join(row.select('./td[1]//text()').extract()).strip()
            if name == 'Name:':
                # A new product begins: emit the one accumulated so far.
                if not first_row:
                    product_loader.add_value('name', product_name)
                    product_loader.add_value('url', response.url)
                    product_loader.add_value('category', category)
                    product_loader.add_value('brand', brand)
                    if image_url:
                        product_loader.add_value(
                            'image_url', urljoin_rfc(base_url, image_url[0]))
                    product = product_loader.load_item()
                    metadata = CRCMeta()
                    metadata['rrp'] = rrp
                    product['metadata'] = metadata
                    yield product
                first_row = False
                product_loader = ProductLoader(item=Product(), selector=hxs)
                product_name = ''.join(
                    row.select('./td[2]//text()').extract()).strip()
            # Any other labelled row is appended to the product name.
            # NOTE(review): if such a row appears before the first 'Name:'
            # row, product_name is still None and this += raises TypeError.
            if name and name not in ('Name:', 'Price:', 'Product Code:',
                                     'Availability:', 'Qty:'):
                product_name += ' - ' + ''.join(
                    row.select('./td[2]//text()').extract()).strip()
            if name == 'Price:':
                # The first text node is the selling price; the RRP is the
                # text following 'RRP ' in the same cell.
                price = row.select('./td[2]//text()').extract()[0]
                rrp = str(
                    extract_price(''.join(
                        row.select('./td[2]//text()').re(r'RRP (.*)'))))
                product_loader.add_value('price', extract_price(price))
            if name == 'Product Code:':
                # The product code serves as both identifier and SKU.
                sku = row.select('./td[2]//text()').extract()[0]
                product_loader.add_value('identifier', sku)
                product_loader.add_value('sku', sku)
            if name == 'Availability:':
                stock = row.select('./td[2]//text()').extract()[0]
                if 'In stock' not in stock:
                    product_loader.add_value('stock', 0)
        # Emit the last product; skipped when no non-empty name was read.
        if product_name:
            product_loader.add_value('name', product_name)
            product_loader.add_value('brand', brand)
            product_loader.add_value('url', response.url)
            product_loader.add_value('category', category)
            if image_url:
                product_loader.add_value('image_url',
                                         urljoin_rfc(base_url, image_url[0]))
            product = product_loader.load_item()
            metadata = CRCMeta()
            metadata['rrp'] = rrp
            product['metadata'] = metadata
            yield product
Exemplo n.º 19
0
    def parse(self, response):
        """Crawl a listing page: follow categories, product links and
        scrape products that are listed inline.

        Yields, in order: a request per sidebar category, a request per
        sub-category heading, a request (to ``parse_product``) per product
        link whose URL contains a product id, and finally a Product for each
        inline product whose id was not already seen among those links.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Ids already covered on this page, either by a link or inline.
        parsed_ids = set()

        for cat in hxs.select(
                '//*[@id="sidebar"]//li[@class="sections-list"]/a/@href'
        ).extract():
            yield Request(url=urljoin_rfc(base_url, cat))

        for subcat in hxs.select(
                '//*[@id="ContentPage"]//span[@class="boxheading"]/a/@href'
        ).extract():
            yield Request(url=urljoin_rfc(base_url, subcat))

        product_link_xpaths = (
            '//div[@id="content"]//form//a[@class="read-more"]/@href',
            '//div[@id="content"]//form//h1/../../a/@href',
        )
        for link_xpath in product_link_xpaths:
            for url in hxs.select(link_xpath).extract():
                if not url:
                    continue
                # Only building the id and the request is guarded. The yield
                # stays outside the try so GeneratorExit (raised at a yield
                # when the generator is closed) is never swallowed.
                try:
                    pid = url.split('.')[-2].split('-')[-1]
                    request = Request(urljoin_rfc(base_url, url),
                                      callback=self.parse_product)
                except (IndexError, ValueError):
                    # Best effort: skip links without an id or with an
                    # unusable URL, as the original did.
                    continue
                parsed_ids.add(pid)
                yield request

        # The first breadcrumb link is dropped.
        category = hxs.select(
            '//*[@id="ContentPage"]/p[@class="text_breadcrumbs"]/a/text()'
        ).extract()[1:]

        for product in hxs.select(
                '//*[@id="ContentPage"]//div[@class="col-xs-12"]'):
            pid = product.select(
                './/input[contains(@name, "Q_")]/@name').re(r'Q_(.+)')
            if not pid:
                continue
            pid = pid[0]
            if pid in parsed_ids:
                continue
            parsed_ids.add(pid)

            name = ''.join(product.select('.//h1//text()').extract())
            image_url = product.select(
                './/img[@class="catalog-image"]/@src').extract()
            image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
            # NOTE(review): a product without a matching price raises
            # IndexError here, as before -- confirm whether it should be
            # skipped instead.
            price = product.select('.//span[@class="catalog-price" or @class="product-price"]/text()').re(u'\xa3([\d\.,]+)')
            price = price[0]

            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('sku', pid)
            loader.add_value('identifier', pid)
            loader.add_value('category', category)
            loader.add_value('image_url', image_url)
            loader.add_value('price', price)
            yield loader.load_item()
Exemplo n.º 20
0
    def parse_product(self, response):
        """Yield the product described by the page, then one item per variant.

        The base item is filled from the page markup (identifier, name,
        category, sku, image, price, brand, stock) and carries a
        ``FragranceDirectMeta`` with the promotion text and a VAT-exclusive
        price (price divided by 1.2).  When the raw page body contains a line
        with ``variants:``, that JavaScript object is decoded and one extra
        item per variant is yielded, copied from the base item with its own
        name, sku, identifier and price.
        """
        hxs = HtmlXPathSelector(response)

        # Locate the first line of the raw body that carries the variants.
        options = None
        js_line = ''
        for l in response.body.split('\n'):
            if 'variants:' in l:
                js_line = l
                break

        if js_line:
            # The capture group is optional: it is None when the line has no
            # "};" after "variants:".  Treat that as "no variants" instead of
            # failing on None[:-2].
            variants_js = re.search(r'variants:(.*};)?', js_line).group(1)
            if variants_js is not None:
                # The last two characters of the capture are dropped before
                # the JavaScript object literal is decoded.
                options = demjson.decode(variants_js[:-2].strip())

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_identifier = hxs.select(
            '//input[@id="productId" or @name="productId"]/@value').extract(
            )[0]
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('url', response.url)
        name = hxs.select('//span[@itemprop="name"]/text()').extract()[0]
        product_loader.add_value('name', name)
        # The first and last breadcrumb links are left out of the category.
        category = hxs.select(
            '//*[@id="breadcrumb"]//a/text()').extract()[1:-1]
        product_loader.add_value('category', category)
        product_loader.add_xpath('sku',
                                 '//span[@class="pd_productVariant"]/text()')
        img = hxs.select('//meta[@property="og:image"]/@content').extract()
        if img:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img.pop()))
        price = hxs.select(
            '//p[@class="productOfferPrice"]/text()').extract()[0]
        price = extract_price(price)
        product_loader.add_value('price', price)
        brand = hxs.select('//*[@id="brandHeader"]/a/@href').extract()
        if brand:
            # The brand is taken from the link target: "/en/" is removed and
            # the final character is cut off.
            brand = brand[0].replace('/en/', '')[:-1]
            product_loader.add_value('brand', brand)
        stock = ''.join(
            hxs.select(
                '//div[@class="cvos-availbility-panel"]/p/text()').extract())
        if 'Item is currently out of stock online' in stock:
            product_loader.add_value('stock', 0)
        product = product_loader.load_item()
        metadata = FragranceDirectMeta()
        prom = ''.join(
            hxs.select('//div[@class="productSavings"]//text()').extract())
        metadata['promotion'] = prom + ' ' + ''.join(
            hxs.select('//div[@class="primaryItemDeal"]//p/text()').extract())
        if product['price']:
            metadata['price_exc_vat'] = Decimal(
                product['price']) / Decimal('1.2')
        product['metadata'] = metadata

        yield product

        if options:
            for k, val in options.items():
                option_name = k.replace('_', ' ')
                option_product = Product(product)
                # Product(product) copies field values by reference, so the
                # variant would share the base item's metadata object and
                # writing price_exc_vat below would overwrite it for the base
                # item and every other variant.  Give each variant its own
                # copy.  NOTE(review): assumes FragranceDirectMeta accepts a
                # mapping in its constructor, as scrapy Items do -- confirm.
                option_product['metadata'] = FragranceDirectMeta(metadata)
                option_product['name'] = product['name'] + ' ' + option_name
                option_product['sku'] = val['productCode']
                option_product['identifier'] = val['variantId']
                option_product['price'] = extract_price(val['nowPrice'])
                if option_product.get('price'):
                    option_product['metadata']['price_exc_vat'] = Decimal(
                        option_product['price']) / Decimal('1.2')

                yield option_product
Exemplo n.º 21
0
    def parse_product(self, response):
        """Build and yield one product item from a product page.

        The identifier is assembled from the first three quoted arguments of
        the add-to-cart ``onclick`` handler.  Pages where that handler or the
        product name cannot be found are logged and skipped.  MPN and UPC
        values, when present, are attached as ``EServiceGroupMeta`` metadata.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Price sources, in order of preference: inline script value,
        # ".itemPrice b" text, then the itemprop="price" span.
        price = response.xpath('//script/text()').re('ecomm_totalvalue:(.*),')
        if not price:
            price = response.css('.itemPrice b::text').extract(
            ) or response.xpath('//span[@itemprop="price"]/text()').extract()
        brand = response.css('.brandImage ::attr(alt)').extract()

        # A page without the add-to-cart link used to raise IndexError here;
        # it now takes the same log-and-skip path as an unparsable handler.
        onclick = response.xpath(
            '//td[@class="itemActions"]/a/@onclick').extract()
        match = None
        if onclick:
            match = re.search(
                r"slrhutAdd.*?ToCart\('(.*?)','(.*?)','(.*?)','.*?'\);",
                onclick[0], re.DOTALL | re.IGNORECASE | re.MULTILINE)
        if not match:
            self.log(
                'ERROR: Could not parse product identifier, URL: {}'.format(
                    response.url))
            return
        product_identifier = "{}_{}_{}".format(match.group(1),
                                               match.group(2),
                                               match.group(3))

        image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
        if not image_url:
            image_url = hxs.select(
                '//td[@class="itemImage"]/img/@src').extract()
        product_name = response.xpath('//p[@itemprop="name"]/text()').extract()
        if not product_name:
            product_name = hxs.select(
                '//p[@class="itemTitleHeader"]/a/text()').extract()
        if not product_name:
            # Neither name selector matched; skip instead of raising
            # IndexError on product_name[0].
            self.log('ERROR: Could not find product name, URL: {}'.format(
                response.url))
            return
        product_name = product_name[0].strip()
        category = response.xpath(
            '//p[@class="mainPageHeader"]//a/text()').extract()

        # The product counts as in stock unless the availability text says
        # otherwise.
        availability = response.xpath(
            '//div[@class="itemContentAvailable"]//span/text()').extract()
        stock = not (availability
                     and 'out of stock' in availability[0].lower())

        sku = response.xpath('//span[@itemprop="sku"]//text()').extract()
        if not sku:
            sku = hxs.select(
                '//div[@class="itemContentMPN" and contains(., "SKU")]//text()'
            ).extract()
        sku = sku[-1] if sku else ''

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('name', product_name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        product_loader.add_value('sku', sku)
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', category)
        product_loader.add_value('brand', brand)
        if not stock:
            product_loader.add_value('stock', 0)
        product = product_loader.load_item()

        # The last text node of the matching block is used as the value.
        mpn = response.xpath(
            '//div[@class="itemContentMPN" and contains(., "MPN")]//text()'
        ).extract()
        mpn = mpn[-1] if mpn else ''
        upc = response.xpath(
            '//div[@class="itemContentMPN" and contains(., "UPC")]//text()'
        ).extract()
        upc = upc[-1] if upc else ''
        if mpn or upc:
            metadata = EServiceGroupMeta()
            if mpn:
                metadata['mpn'] = mpn
            if upc:
                metadata['upc'] = upc
            product['metadata'] = metadata
        yield product
Exemplo n.º 22
0
    def parse_categories(self, response):
        """Follow category links and yield the products listed on the page.

        Every category-table and sidebar link is re-requested with
        ``/show/all`` appended and handled by this same callback.  Products
        are read from the products grid; entries that are "Full Range" links,
        or that lack a name, a sku or a price, are skipped.
        """
        base_url = get_base_url(response)

        # NOTE(review): the breadcrumb texts are extracted but never used
        # below -- confirm whether they were meant to feed 'category'.
        categories = response.xpath(
            "//div[@class='breadcrumb']/a/text()").extract()
        urls = response.xpath(
            '//td[@class="cat_list_title"]/a/@href').extract()
        urls += response.xpath('//div[@id="sidebar"]//a/@href').extract()
        for url in urls:
            yield Request(urljoin_rfc(base_url, url + '/show/all'),
                          callback=self.parse_categories,
                          meta=response.meta)

        products = response.xpath(
            '//ul[contains(@class, "products-grid")]/li[contains(@class, "item")]'
        )
        for product in products:
            full_range = product.xpath(
                './/a[contains(text(), "Full Range")]')
            if full_range:
                continue
            name = product.xpath(
                './/h2[@class="product-name"]/text()').extract()
            if not name:
                continue
            name = name[0]
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_value('name', name)
            # The sku is read from the description paragraphs that do not
            # mention "Stock"; it doubles as the identifier.
            sku = product.xpath(
                './/div[@class="product-description"]/p[not(contains(text(), "Stock"))]/text()'
            ).extract()
            if not sku:
                continue
            loader.add_value('sku', sku)
            loader.add_value('identifier', sku)
            url = product.xpath(
                './/a[@class="product-line"]/@href').extract()
            loader.add_value('url', url)
            image_url = product.xpath(
                './/div[@class="product-image"]/img/@src').extract()
            if image_url:
                loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
            brand = response.meta.get('brand', '')
            loader.add_value('brand', brand)
            out_of_stock = product.xpath(
                './/p[@class="availability out-of-stock"]').extract()
            if out_of_stock:
                loader.add_value('stock', 0)
            price = product.xpath(
                './/span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
            if not price:
                price = product.xpath(
                    './/p[@class="special-price"]/span[@class="price"]/text()'
                ).extract()
            if not price:
                # Previously only out-of-stock products were skipped here; an
                # in-stock product without a price raised IndexError on
                # price[0] and aborted the rest of the page.
                if not out_of_stock:
                    self.log('No price found for product %r on %s' %
                             (name, response.url))
                continue
            price = extract_price(price[0])
            loader.add_value('price', price)
            yield loader.load_item()
Exemplo n.º 23
0
    def parse_products_list(self, response):
        """Crawl a product listing: sub-categories, products and next pages.

        Sub-category links are followed with this same callback.  A page with
        no product links is re-requested (the attempt number travels in
        ``meta['retry']``) until the counter reaches 10.  Product links are
        sent to ``parse_product``.  For URLs without ``pageSize=``, the
        remaining pages are requested in steps of 12 products through the
        page's AJAX data URL.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        for url in hxs.select(
                '//div[@class="category filterCategory"]//li/a/@href').extract(
                ):
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_products_list)

        urls = hxs.select(
            '//div[@class="row productList"]//div[@class="product"]/div[@class="productContent"]/a/@href'
        ).extract()
        self.log('Product # found: {}'.format(len(urls)))
        if not urls:
            self.log('Retry: {}'.format(response.url))
            retry = int(response.meta.get('retry', 0)) + 1
            if retry < 10:
                # dont_filter lets the identical URL through the dupe filter.
                yield Request(response.url,
                              meta={'retry': retry},
                              dont_filter=True,
                              callback=self.parse_products_list)
            else:
                self.log("No products found on the page on %s" % response.url)
            return
        for url in urls:
            yield Request(urljoin_rfc(base_url, url),
                          dont_filter=True,
                          callback=self.parse_product,
                          cookies={},
                          meta={'dont_merge_cookies': True})

        # Next pages: only scheduled from URLs that carry no pageSize=.
        if 'pageSize=' in response.url:
            return
        count_match = hxs.select(
            '//div[@id="plpContent"]/div[@id="searchCounter"]/text()').re(
                r'"productCount".*?(\d+)')
        if not count_match:
            # Without the counter the number of pages is unknown; stop here
            # instead of raising IndexError.
            self.log('No product count found on %s' % response.url)
            return
        products_count = int(count_match[0])
        self.log('products_count: {}'.format(products_count))
        if products_count <= 12:
            return

        next_p = hxs.select(
            '//div[@class="facetJSON hide"]/@data-url').extract()
        if next_p:
            next_p = next_p[0]
            self.log('Next P: {}'.format(next_p))
            # Explicit floor division: number of further 12-product pages.
            # With plain "/" this only worked under Python 2 integer division.
            for i in xrange((products_count - 1) // 12):
                begin_index = (i + 1) * 12
                url = add_or_replace_parameter(next_p, 'beginIndex',
                                               str(begin_index))
                formdata = {
                    'beginIndex': str(begin_index),
                    'scrollTo': 'false',
                    'requesttype': 'ajax'
                }
                yield FormRequest(url,
                                  formdata=formdata,
                                  callback=self.parse_products_list,
                                  cookies={},
                                  meta={'dont_merge_cookies': True})
        else:
            self.log('Retry next page: {}'.format(response.url))
            retry = int(response.meta.get('retry', 0)) + 1
            if retry < 10:
                yield Request(response.url,
                              meta={'retry': retry},
                              dont_filter=True,
                              callback=self.parse_products_list)
            else:
                self.log("No next page found on the page on %s" %
                         response.url)
Exemplo n.º 24
0
 def parse(self, response):
     """Request every product linked from the listing items on the page.

     Each link found under an ``h3`` of a ``list-item `` block is resolved
     against the page's base URL and handed to ``parse_product``.
     """
     page_base = get_base_url(response)
     selector = HtmlXPathSelector(response)
     product_links = selector.select(
         '//div[@class="list-item "]//h3/a/@href').extract()
     for link in product_links:
         product_url = urljoin_rfc(page_base, link)
         yield Request(product_url, callback=self.parse_product)
Exemplo n.º 25
0
    def parse_product(self, response):
        """Yield one item per entry of the page's ``productItems`` script array.

        Name, category, brand, sku, price and RRP are read once from the page
        and shared by all entries; identifier, stock, image, colour and size
        come from each entry.  Pages showing the "page not found" notice
        yield nothing; pages without the ``productItems`` array are logged.
        """
        page = HtmlXPathSelector(response)

        if page.select('//p[contains(text(), "OOPS! Page not found.")]'):
            return

        title = page.select(
            '//h1[@class="product_title"]/text()').extract()[0].strip()

        # Second breadcrumb entry is the category, unless it reads
        # 'Outlet Store', in which case the third entry is used.
        crumb = page.select(
            '//ul[@class="pb-breadcrumb"]/li[2]/a/text()').extract()
        category = crumb[0].strip() if crumb else ''
        if category == 'Outlet Store':
            category = page.select(
                '//ul[@class="pb-breadcrumb"]/li[3]/a/text()').extract(
                )[0].strip()

        brand = response.meta.get('brand', '')

        # Keep only the directory part of the main image URL; each entry's
        # image name is joined onto it later.
        main_image_src = page.select('//img[@id="image1"]/@src').extract()[0]
        image_dir = main_image_src.rpartition('/')[0] + '/'

        sku = page.select('//span[@class="product_number"]/text()').extract(
        )[0].strip().replace('#', '')

        # Sale price wins over list price.
        price_texts = (
            page.select('//span[@class="sale_price_val"]/text()').extract()
            or page.select('//span[@class="list_price_val"]/text()').extract())
        price = extract_price(price_texts[0])

        # RRP text: MSRP first, list price as fallback; it is only reported
        # when it is higher than the selling price.
        rrp_text = ''.join(
            page.select(
                '//span[contains(@class, "msrp_price_va")]/text()').extract())
        if not rrp_text:
            rrp_text = ''.join(
                page.select('//span[contains(@class, "list_price_val")]/text()'
                            ).extract())
        rrp_value = extract_price(rrp_text)
        rrp = str(rrp_value) if rrp_value > price else ''

        config_match = re.search(
            r'var productItems = (\[\s+\{.*\}\s+\])', response.body, re.DOTALL)
        if not config_match:
            self.log('WARNING!!! url: {}'.format(response.url))
            return

        # Turn the JavaScript array into parseable JSON: strip all
        # whitespace (including spaces inside values, which is why the stock
        # check below looks for 'OutofStock'), drop the trailing comma and
        # double the backslashes.
        raw_json = config_match.group(1)
        for junk in ('\r', '\n', '\t', ' '):
            raw_json = raw_json.replace(junk, '')
        raw_json = raw_json.replace('},]', '}]').replace('\\', '\\\\')

        for variant in json.loads(raw_json):
            loader = ProductLoader(item=Product(), selector=page)
            loader.add_value('identifier', variant['itemId'])

            stock = variant['inventoryNumber']
            message = variant['inVentoryMessage']
            if 'Avail.' in message or 'OutofStock' in message:
                stock = 0
            loader.add_value('stock', stock)

            loader.add_value('image_url',
                             urljoin_rfc(image_dir, variant['mainImage']))
            loader.add_value('url', response.url)
            loader.add_value('brand', brand)
            loader.add_value('sku', sku)
            loader.add_value('price', price)
            loader.add_value('category', category)

            # Name is "<title>[, <colour>][, <size>]".
            full_name = title
            colour = variant['color']
            if colour:
                full_name += ', ' + colour
            sizes = variant['size']
            if len(sizes) > 0:
                full_name += ', {}'.format(sizes[0]['longform'])
            loader.add_value('name', full_name)

            item = loader.load_item()
            rrp_meta = CRCMeta()
            rrp_meta['rrp'] = rrp
            item['metadata'] = rrp_meta
            yield item
Exemplo n.º 26
0
    def parse_product(self, response):
        """Yield the page's product, or one item per configurable option.

        A base item is always built from the page.  When the page body
        contains a ``spConfig = new Product.Config(...)`` call, its JSON is
        used to yield one item per configured product id, with the option
        labels appended to the name and the option prices added to the base
        price; the base item itself is then not yielded.  Without that
        config the base item is yielded as is.
        """
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_identifier = hxs.select(
            './/input[@name="product"]/@value').extract()[0]
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('url', response.url)

        name = hxs.select(
            '//div[@class="product-name"]/h1/text()').extract()[0]
        product_loader.add_value('name', name)
        # The first and last breadcrumb links are left out of the category.
        category = hxs.select(
            '//ul[@class="breadcrumbs"]/li/a/text()').extract()[1:-1]
        product_loader.add_value('category', category)
        product_loader.add_value('sku', product_identifier)
        img = hxs.select('//img[@id="main-img"]/@src').extract()
        if img:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), img.pop()))

        # Special price wins over regular price; 0 when neither is present.
        price = hxs.select(
            '//form//p[@class="special-price"]/span[@class="price"]/text()'
        ).extract()
        if not price:
            price = hxs.select(
                '//form//span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
        price = extract_price(price[0]) if price else 0
        product_loader.add_value('price', price)

        item = product_loader.load_item()

        options_config = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)
        if options_config:
            product_data = json.loads(options_config.groups()[0])
            # Per configured product id: accumulated option labels and the
            # sum of the option price adjustments.
            products = {}
            prices = {}
            for attr in product_data['attributes'].itervalues():
                for option in attr['options']:
                    for product in option['products']:
                        products[product] = ' - '.join(
                            (products.get(product, ''), option['label']))
                        prices[product] = prices.get(
                            product, 0) + extract_price(option['price'])

            # 'image_url' is only set on the base item when the page had a
            # main image, so read it defensively: item['image_url'] raised
            # KeyError for image-less configurable products.
            base_image_url = item.get('image_url')

            for identifier, option_name in products.iteritems():
                product_loader = ProductLoader(item=Product(), selector=hxs)
                product_loader.add_value('identifier',
                                         item['identifier'] + '_' + identifier)
                product_loader.add_value('sku',
                                         item['identifier'] + '_' + identifier)
                product_loader.add_value('name',
                                         item['name'] + ' ' + option_name)
                if base_image_url:
                    product_loader.add_value('image_url', base_image_url)
                price = item['price'] + prices[identifier]
                product_loader.add_value('price', price)
                product_loader.add_value('url', response.url)
                product_loader.add_value('brand', '')
                product_loader.add_value('category', category)
                option_item = product_loader.load_item()
                yield option_item

        else:
            yield item
Exemplo n.º 27
0
    def parse(self, response):
        """Walk category pages and emit products or product-page requests.

        Category links are followed with this same callback.  For each row
        of the product table, identifier, stock and price are read from the
        listing; rows whose identifier is in ``self.cache_data`` are emitted
        directly from the cached fields, the others are requested via
        ``parse_product``.  Pagination links are followed, and a page with
        neither products nor categories is re-requested as a product page.
        """
        base_url = get_base_url(response)
        meta = response.meta.copy()
        categories_urls = response.xpath('//ul[@class="categoryList"]/li//a')
        for category in categories_urls:
            url = category.select('@href').extract()[0]
            name = category.select('text()').extract()[0].strip()
            if "/prl/results" not in url and 'webapp' not in url:
                url += "/prl/results"
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse,
                          meta={'category': name})

        products = response.xpath(
            '//table[@id="sProdList"]/tbody/tr[td[@class="productImage"]]')
        for product in products:
            try:
                identifier = product.select(
                    './/a[@class="sku"]/text()').extract()[0].strip()
                stock = int(
                    product.select(
                        './/td[@class="availability"]/input[@class="hVal"]/@value'
                    ).extract()[0])
                price = round(
                    Decimal(
                        product.css(
                            '.price input.hVal::attr(value)').extract()[0]), 2)
            except (IndexError, ValueError, ArithmeticError):
                # IndexError: a field is missing from the row.
                # ValueError: the stock value is not an integer.
                # ArithmeticError: base class of decimal.InvalidOperation,
                # raised for an unparsable price.
                # Any of these skips just this row; uncaught, they would
                # abort the generator and lose the rest of the page.
                continue
            if identifier in self.cache_data:
                # Static fields come from the cache; only price and stock
                # are taken from the listing row.
                product_cached = self.cache_data[identifier]
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('identifier', identifier)
                loader.add_value('name',
                                 product_cached['name'].decode('utf-8'))
                loader.add_value('url', product_cached['url'].decode('utf-8'))
                loader.add_value('sku', product_cached['sku'].decode('utf-8'))
                loader.add_value('category',
                                 product_cached['category'].decode('utf-8'))
                loader.add_value('image_url',
                                 product_cached['image_url'].decode('utf-8'))
                loader.add_value('brand',
                                 product_cached['brand'].decode('utf-8'))
                loader.add_value('price', price)
                loader.add_value('stock', stock)
                item = loader.load_item()

                # The URL has been seen, so it is no longer missing.
                try:
                    self.missing_urls.remove(item['url'])
                except ValueError:
                    pass

                yield item
            else:
                url = product.select(
                    './/a[@class="sku"]/@href').extract()[0].strip()
                url = url_query_cleaner(url)
                if url in self.missing_urls:
                    self.missing_urls.remove(url)
                yield Request(url, callback=self.parse_product, meta=meta)

        pages = response.css('.pages .pageIt a::attr(href)').extract()
        for url in pages:
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse,
                          meta=meta)

        if not products and not categories_urls:
            yield Request(url_query_cleaner(response.url),
                          dont_filter=True,
                          callback=self.parse_product,
                          meta=meta)
Exemplo n.º 28
0
 def parse(self, response):
     """Request the product listing behind each navigation link.

     The first navigation link is skipped; every other link gets
     ``?_artperpage=100`` appended and is handled by
     ``parse_products_list``.
     """
     selector = HtmlXPathSelector(response)
     site_base = get_base_url(response)
     nav_links = selector.select('//*[@id="navigation"]//a/@href').extract()
     for link in nav_links[1:]:
         listing_url = urljoin_rfc(site_base, link + '?_artperpage=100')
         yield Request(listing_url, callback=self.parse_products_list)
Exemplo n.º 29
0
 def parse(self, response):
     """Request every product whose title link appears in the results section.

     Links are resolved against the page's base URL and handed to
     ``parse_product``.
     """
     # The selector object is created but not used below; the links are
     # read through response.xpath instead.
     selector = HtmlXPathSelector(response)
     page_base = get_base_url(response)
     title_links = response.xpath(
         '//section[@id="results"]'
         '//a[contains(@class, "productCard__title")]/@href').extract()
     for link in title_links:
         product_url = urljoin_rfc(page_base, link)
         yield Request(product_url, callback=self.parse_product)
Exemplo n.º 30
0
    def parse_product(self, response):
        """Yield the page's product, or one item per configurable option.

        The identifier comes from the ``product`` form input, falling back
        to the numeric id in the add-to-cart form action.  When neither is
        present the page is re-requested a bounded number of times.  The
        price is the displayed price (plus pennies, when shown separately)
        plus any cashback amount.  If the page body contains a
        ``spConfig = new Product.Config(...)`` call, one item per configured
        product id is yielded with the option labels appended to the name and
        the option price added; otherwise a single item is yielded.
        """
        base_url = get_base_url(response)
        # Cap on re-requests of a page that has no product identifier.
        max_retries = 10

        image_url = response.xpath('//img[@itemprop="image"]/@src').extract()

        identifier_values = response.xpath(
            '//input[@name="product"]/@value').extract()
        if identifier_values:
            product_identifier = identifier_values[0].strip()
        else:
            fallback = response.xpath(
                '//form[@id="product_addtocart_form"]/@action').re(
                    r'/product/(\d+)')
            if not fallback:
                # The page is re-requested, but only a limited number of
                # times: the retry used to be unconditional and could loop
                # forever on a page that never carries an identifier.  The
                # brand is passed along so it is not lost on the retry.
                retry = int(response.meta.get('retry', 0)) + 1
                if retry < max_retries:
                    yield Request(response.url,
                                  callback=self.parse_product,
                                  dont_filter=True,
                                  meta={'retry': retry,
                                        'brand': response.meta.get(
                                            'brand', '')})
                else:
                    self.log('No product identifier found on %s' %
                             response.url)
                return
            product_identifier = fallback[0]

        product_name = response.xpath(
            '//h2[@itemprop="name"]/text()').extract()[0]

        brand = response.meta.get('brand', '')

        category = 'Used Equipment'
        sku = response.xpath('//div[@class="quickfind"]/text()').extract()
        sku = sku[0].replace('Quick find', '').strip() if sku else ''

        # The pennies may sit in a nested span; append them to the main
        # price text before parsing.
        price = response.xpath(
            '//*[@id="product-price-{}"]/div/span[@class="price"]/text()'.
            format(product_identifier)).extract()[0]
        price_pennies = response.xpath(
            '//*[@id="product-price-{}"]/div/span[@class="price"]/span[@class="price-pennies"]/text()'
            .format(product_identifier)).extract()
        if price_pennies:
            price += price_pennies[0]
        price = extract_price(price)
        cashback = response.xpath('//div[@class="cashback"]/text()').extract()
        if cashback:
            price += extract_price(cashback[0])

        options_config = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)
        if options_config:
            product_data = json.loads(options_config.groups()[0])
            # Per configured product id: accumulated option labels and the
            # price of the last option that lists the product.
            products = {}
            prices = {}
            for attr in product_data['attributes'].itervalues():
                for option in attr['options']:
                    option_price = extract_price(option['price'])
                    for product in option['products']:
                        products[product] = ' '.join(
                            (products.get(product, ''), option['label']))
                        prices[product] = option_price
            variants = [(product_identifier + '_' + identifier,
                         product_name + ' ' + option_name,
                         price + prices[identifier])
                        for identifier, option_name in products.iteritems()]
        else:
            variants = [(product_identifier, product_name, price)]

        # Both cases share every field except identifier, name and price.
        for item_identifier, item_name, item_price in variants:
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('identifier', item_identifier)
            product_loader.add_value('name', item_name)
            if image_url:
                product_loader.add_value('image_url',
                                         urljoin_rfc(base_url, image_url[0]))
            product_loader.add_value('url', response.url)
            product_loader.add_value('category', category)
            product_loader.add_value('brand', brand)
            product_loader.add_value('sku', sku)
            product_loader.add_value('price', item_price)
            yield product_loader.load_item()