def parse_product(self, response):
    """Parse a product detail page and yield one loaded ``Product`` item.

    Collects identifier, name, image, price, SKU, category, a shipping
    cost derived from the weight figure in the description, and stock
    status.

    :param response: the downloaded product page.
    :returns: generator yielding a single item.
    :raises IndexError: when the info-button link, the product title or
        both price elements are missing from the page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    image_url = hxs.select('//div[@class="firstPic"]/a/img/@src').extract()

    # The identifier is the ``pID`` query parameter of the link that wraps
    # the "button_info" image.
    info_href = hxs.select(
        '//div[@class="desc"]//img[contains(@src,"button_info")]/../@href'
    ).extract()[0]
    product_identifier = url_query_parameter(info_href, 'pID')
    product_name = hxs.select(
        '//div[@class="productInfo1"]/h1/text()').extract()[0].strip()
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('name', product_name)
    if image_url:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))

    # Prefer the "new" price element; fall back to the regular one.
    price = hxs.select('//span[@class="productNewPrice"]/text()').extract()
    if not price:
        price = hxs.select('//span[@class="price"]/text()').extract()
    price = extract_price(price[0])

    # The SKU is the text following the "Art.Nr.:" label, if present.
    sku = ''
    for txt in hxs.select('//p[@class="basicData"]//text()').extract():
        if 'Art.Nr.:' in txt:
            sku = txt.replace('Art.Nr.:', '').strip()
            break
    product_loader.add_value('sku', sku)
    product_loader.add_value('price', price)
    product_loader.add_value('url', response.url)
    category = hxs.select(
        '//*[@id="box_categories"]//li[@class="activeCat"]/a/text()'
    ).extract()
    product_loader.add_value('category', category)

    search_txt = ''.join(
        hxs.select('//div[@class="desc"]//text()').extract())
    match = re.search(r"Gewicht.*?(?::|kg)*.*?([\d,]+)", search_txt,
                      re.DOTALL | re.IGNORECASE)
    if match:
        try:
            weight = float(match.group(1).replace(',', '.'))
        except ValueError:
            # The capture can be commas only (e.g. ","), which is not a
            # number; in that case no shipping cost is recorded.
            weight = None
        if weight is not None:
            # (inclusive upper weight bound, shipping cost); anything
            # heavier than the last bound costs 49.
            shipping_bands = (
                (3, 4.90),
                (10, 8.90),
                (19, 13.90),
                (60, 22.90),
                (100, 29.90),
                (150, 39.90),
                (220, 42.90),
            )
            shipping_cost = 49
            for upper_bound, cost in shipping_bands:
                if weight <= upper_bound:
                    shipping_cost = cost
                    break
            product_loader.add_value('shipping_cost', shipping_cost)

    # A missing quantity input on the cart form is treated as out of stock.
    in_stock = hxs.select(
        '//*[@id="cart_quantity"]//input[@name="products_qty"]')
    if not in_stock:
        product_loader.add_value('stock', 0)
    yield product_loader.load_item()
예제 #2
0
    def parse_product_list(self, response):
        """Yield the products of a listing page and retry incomplete pages.

        The featured product (if any) is emitted first, followed by every
        regular product tile. When either the featured block or the
        regular tiles are missing, the same URL is requested again, up to
        three times (tracked through ``response.meta['retries']``).

        :param response: the listing page; non-HTML responses are ignored.
        :returns: generator of loaded items and, possibly, a retry request.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        featured_product = hxs.select(u'//div[@class="featuredProduct"]')
        featured_item = self._load_listing_product(
            response, featured_product,
            url_xpath=u'.//div[@class="fDescription"]/a/@href',
            name_xpath=u'.//div[@class="fDescription"]/a/strong/text()',
            image_xpath='.//div[@class="fIllustration"]//img/@src',
            productid_xpath='.//div[@class="fIllustration"]//img/@productid')
        if featured_item is not None:
            yield featured_item

        products = hxs.select(
            u'//div[contains(@class,"productsRow")]/div[contains(@class,"productItem")]'
        )
        for product in products:
            # "prodDecription" [sic] is the class name used by the page.
            item = self._load_listing_product(
                response, product,
                url_xpath=u'.//div[@class="prodDecription"]/a/@href',
                name_xpath=u'.//div[@class="prodDecription"]/a/text()',
                image_xpath='.//div[@class="illustration"]//img/@src',
                productid_xpath='.//div[@class="illustration"]//img/@productid')
            # The item is built before the ``yield`` so that no exception
            # handler surrounds the yield itself (a handler there would
            # also intercept GeneratorExit when the generator is closed).
            if item is not None:
                yield item

        if not products or not featured_product:
            log.msg('Retrying url: %s' % response.url, level=log.WARNING)
            retries = response.meta.get('retries', 0)
            if retries < 3:
                yield Request(response.url,
                              dont_filter=True,
                              meta={'retries': retries + 1})

    def _load_listing_product(self, response, node, url_xpath, name_xpath,
                              image_xpath, productid_xpath):
        """Build one item from a listing node, or return ``None``.

        ``None`` is returned when the node has no product link, when no
        identifier can be derived, or when loading the item fails (the
        latter two cases are logged).

        :param response: the listing page response.
        :param node: selector (or selector list) for the product block.
        :param url_xpath: XPath, relative to ``node``, of the product link.
        :param name_xpath: XPath, relative to ``node``, of the product name.
        :param image_xpath: XPath, relative to ``node``, of the image source.
        :param productid_xpath: XPath, relative to ``node``, of the fallback
            ``productid`` attribute.
        """
        url = node.select(url_xpath).extract()
        if not url:
            return None
        base_url = get_base_url(response)
        url = urljoin_rfc(base_url, url[0])

        product_loader = ProductLoader(item=Product(), selector=node)
        # Store the URL without its ";..." path parameters and query string.
        product_loader.add_value('url', (url.split(';')[0]).split('?')[0])
        product_loader.add_xpath('name', name_xpath)

        # Identifier sources, in order: cart form input, image attribute,
        # and finally the "-<id>.html" suffix of the URL.
        identifier = node.select(
            u'.//input[@name="/com/castorama/CastShoppingCartFormHandler.productId"]/@value'
        ).extract()
        if not identifier:
            identifier = node.select(productid_xpath).extract()
        if not identifier or not identifier[0].strip():
            url_match = re.search(r'-([\w]*)\.html', url)
            if url_match is None:
                self.log('>>> WARNING: no identifier found for => %s' % url)
                return None
            identifier = url_match.groups()
        product_loader.add_value('identifier', identifier[0])

        image_src = node.select(image_xpath).extract()
        if image_src:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_src[0]))

        # Try the discounted price element first, then the regular one.
        for tag, css_class in (('span', 'newprice'), ('div', 'price')):
            price = node.select(
                u'.//' + tag + '[@class="' + css_class + '"]/text()'
            ).re(u'([0-9,.\xa0]+)')
            if price:
                price = price[0].replace(u'\xa0', '').replace(',', '.')
                product_loader.add_value('price', price)
                break
        product_loader.add_value('stock', 1)

        try:
            return product_loader.load_item()
        except Exception:
            self.log('>>> WARNING: load item error in => %s' %
                     response.url)
            return None
예제 #3
0
    def parse_product(self, response):
        """Yield one item per priced option, or a single item without options.

        Option items are named ``"<name> <option label>"``, priced at the
        base price plus the option's ``price`` attribute, and identified
        as ``"<product id>.<n>"`` where ``n`` counts the options actually
        emitted. Options lacking a label or a price attribute are skipped.

        :param response: the downloaded product page.
        :raises IndexError: when the name, price or product id is missing.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name = hxs.select(
            '//div[@class="product-name"]/h1/text()')[0].extract()

        # Regular price first, special price as the fallback.
        price_texts = hxs.select(
            '//div[@class="product-main-info"]//div[@class="price-box"]/'
            'span[contains(@id, "product-price")]/span[@class="price"]/text()'
        ).extract()
        if not price_texts:
            price_texts = hxs.select(
                '//div[@class="product-main-info"]//div[@class="price-box"]/'
                'p[@class="special-price"]/span[@class="price"]/text()'
            ).extract()
        price = extract_price(price_texts[0].strip())

        identifier = hxs.select('//p[@class="product-ids"]/text()').re(
            'Product ID: (.*)')[0]
        image_links = hxs.select('//a[@id="main-image"]/@href').extract()
        image_url = (urljoin_rfc(base_url, image_links[0])
                     if image_links else image_links)
        category = hxs.select(
            '//div[@class="breadcrumbs"]//a/text()').extract()

        def build_item(item_name, item_price, item_identifier):
            # Shared field population for both the option and plain cases.
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', item_name)
            loader.add_value('price', item_price)
            loader.add_value('sku', identifier)
            loader.add_value('identifier', item_identifier)
            if image_url:
                loader.add_value('image_url', image_url)
            if category:
                loader.add_value('category', category[-1])
            if not loader.get_output_value('price'):
                loader.add_value('stock', 0)
            return loader.load_item()

        options = hxs.select('//ul[contains(@class, "options-list")]/li')
        if not options:
            yield build_item(name, price, identifier)
            return

        emitted = 0  # only advances for options that produce an item
        for opt in options:
            label = opt.select(
                './span[@class="label"]/label/text()').extract()
            if not label:
                continue
            surcharge = opt.select('./input/@price').extract()
            if not surcharge:
                continue
            yield build_item(name + ' ' + label[0].strip(),
                             price + extract_price(surcharge[0]),
                             '%s.%s' % (identifier, emitted))
            emitted += 1
예제 #4
0
    def parse(self, response):
        """Build items from the two newest feed files on the client SFTP server.

        Downloads and unzips the latest ``CRC_PRICEFEED_UK`` file (SKU ->
        list price from the ``UKRP`` price list) and the latest
        ``PriceMonitorHandler`` file (SKU -> product details), then yields
        one item for every priced SKU that also has product details.
        A warning is appended to ``self.errors`` when the price feed is
        72 hours old or more.

        :param response: response that triggered the callback; it is only
            passed on to the ``ProductLoader``.
        :returns: generator of loaded items.
        """
        transport = paramiko.Transport((CLIENTS_SFTP_HOST, CLIENTS_SFTP_PORT))
        try:
            password = "******"
            username = "******"
            transport.connect(username=username, password=password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            files = sftp.listdir_attr()

            last = get_last_file("CRC_PRICEFEED_UK", files)

            date_file = datetime.fromtimestamp(last.st_mtime)
            hours_diff = (datetime.now() - date_file).total_seconds() / 3600

            # Check file updates
            if hours_diff >= 72:
                self.errors.append('WARNING: No Update for 3 days')

            # Disabled check, kept for reference:
            # usa_file = get_last_file("CRC_PRICEFEED_USA", files)
            # if usa_file:
            #     usa_date_file = datetime.fromtimestamp(usa_file.st_mtime)
            #     hours_diff = (datetime.now() - usa_date_file).total_seconds() / 3600
            #
            #     if hours_diff <= 32:
            #         self.errors.append('WARNING: Invalid File Name, USA feed uploaded recently')

            zip_path = HERE + '/CRC_PRICEFEED_UK.zip'
            xml_path = HERE + '/CRC_PRICEFEED_UK.xml'
            sftp.get(last.filename, zip_path)
            unzip(zip_path, xml_path)
            with open(xml_path) as f:
                xmlfeed_sku = f.read()

            last = get_last_file("PriceMonitorHandler", files)

            zip_path = HERE + '/PriceMonitorHandler.zip'
            xml_path = HERE + '/PriceMonitorHandler.xml'
            sftp.get(last.filename, zip_path)
        finally:
            # Closing the transport also closes the SFTP channel opened on
            # it; without this the connection stayed open after every run.
            transport.close()

        unzip(zip_path, xml_path)
        with open(xml_path) as f:
            xmlfeed_products = f.read()

        sku_prices = {}
        tree = et.fromstring(xmlfeed_sku)
        for item in tree.find('priceList[@id="UKRP"]').find('prices').findall(
                'price'):
            sku_prices[item.find('skuId').text] = item.find('listPrice').text

        sku_products = {}
        tree = et.fromstring(xmlfeed_products)
        for item in tree.find('skus').findall('sku'):
            sku_id = item.find('skuID').text
            sku_products[sku_id] = {
                'identifier': sku_id,
                'category': item.find('CategoryDescription').text,
                'brand': item.find('BrandDescription').text,
                'image_url': item.find('ImageURL').text,
                'url': item.find('ProductURL').text,
                'name': item.find('SkuDescription').text,
                'sku': sku_id,
                'stock': item.find('SkuQuantity').text
            }

        for sku, price in sku_prices.iteritems():
            try:
                product = sku_products[sku]
            except KeyError:
                # Priced SKUs without product details are skipped.
                log.msg('SKU not found:' + sku)
                continue

            product['price'] = price
            product = Product(product)

            loader = ProductLoader(response=response, item=product)
            yield loader.load_item()
예제 #5
0
    def load_item(self, item, name, identifier, price, response):
        """Return a ``ProductLoader`` populated from an item page.

        :param item: selector for the item page; must provide ``select``
            and a ``response`` attribute.
        :param name: product name to store.
        :param identifier: product identifier to store.
        :param price: raw price text; when ``None`` the price comes from
            ``self._get_item_price(item)`` instead.
        :param response: response whose ``meta['item_meta']`` supplies the
            values for ``self._match_fields`` and an optional brand.
        :returns: the populated loader (``load_item()`` is not called here).
        """
        item_meta = response.meta['item_meta']

        # Use the last breadcrumb link as the category, if there is one.
        breadcrumbs = item.select(
            '//*[@id="vi-VR-brumb-lnkLst"]//a/text()').extract()
        category = breadcrumbs[-1] if breadcrumbs else ''
        seller_id = ''.join(
            item.select('.//*[contains(@class, "si-content")]'
                        '//a/*[@class="mbg-nw"]/text()').extract())

        # Brand: prefer the value passed in meta, else the first non-blank
        # text next to the "Brand" label, looked up directly, then inside
        # an <h2>, then inside an <h3>.
        brand = item_meta.get('brand')
        if not brand:
            brand_cell = (
                '//*[@class="attrLabels" and contains(text(), "Brand")]'
                '/following-sibling::*[1]')
            for suffix in ('/text()', '/h2/text()', '/h3/text()'):
                # A list (not ``filter``) so the emptiness test below is
                # meaningful on every Python version.
                brand = [text
                         for text in item.select(brand_cell + suffix).extract()
                         if text.strip() != '']
                if brand:
                    break

        product_loader = ProductLoader(item=Product(), selector=item)
        for field in self._match_fields:
            product_loader.add_value(field, item_meta.get(field, None))
        product_loader.add_value('name', name)
        product_loader.add_value('category', category)
        product_loader.add_value('dealer', 'eBay - ' + seller_id)
        product_loader.add_value('identifier', identifier)

        sku = item.select(
            '//tr[td[contains(text(), "Modell")]]/td/span/text()').extract()
        product_loader.add_value('sku', sku[-1] if sku else '')
        if brand:
            product_loader.add_value(
                'brand', brand[0] if isinstance(brand, list) else brand)
        product_loader.add_xpath('image_url', '//img[@id="icImg"]/@src')
        product_loader.add_value('url', item.response.url)
        price = extract_price(
            price) if price is not None else self._get_item_price(item)
        product_loader.add_value('price', price)

        # stock amount
        if self._extract_stock_amount:
            in_stock = ''.join(
                item.select('//*[@id="qtySubTxt"]//text()').extract())
            if 'More than' in in_stock:
                stock = 11
            else:
                # Longest run of digits wins; the first one on ties.
                digit_runs = re.findall(r"\d+", in_stock)
                stock = max(digit_runs, key=len) if digit_runs else ''
            if stock:
                product_loader.add_value('stock', stock)

        # shipping cost
        shipping_cost = item.select(
            '//*[@id="shippingSection"]//td/div/text()').extract()
        if shipping_cost and shipping_cost[0]:
            if 'free' in shipping_cost[0].lower():
                product_loader.add_value('shipping_cost', 0)
            else:
                product_loader.add_value('shipping_cost',
                                         extract_price(shipping_cost[0]))

        return product_loader
예제 #6
0
    def parse_item(self, response):
        '''Yield one item per SKU declared via ``skuArray.push({...})``.

        The page source contains one block like the following per SKU;
        each ``key: value`` line is parsed into a dict entry:

                skuArray.push({
                    productexternalid: 72833,
                    colour: 'Light Grey/Grey',
                    size: '49',
                    skuNopId: 91684,
                    skuId: 227272,
                    price: '£90.00',
                    priceAsDecimal: 90.0000,
                    stockquantity: 0,
                    preorder: true,
                    outofstock: true,
                    issubscribed: false,
                    availableDate: 'Due in 02/07/2015'
                    });

        :param response: the downloaded product page.
        :raises IndexError: when the page has no ``<h1 itemprop="name">``.
        :raises KeyError: when a block lacks one of the fields used below.
        '''
        # Local import: literal parsing replaces eval() on page content.
        import ast

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products_data = []
        current_product = None  # dict while inside a push block
        for line in response.body.split('\n'):
            if 'skuArray.push({' in line:
                current_product = {}
                continue
            if current_product is None:
                continue
            if '});' in line:
                products_data.append(current_product)
                current_product = None
                continue

            # Split on the first colon only, so values may contain colons;
            # lines without a colon (e.g. blank lines) are ignored.
            key, separator, raw_value = line.partition(':')
            if not separator:
                continue
            raw_value = raw_value.strip().rstrip(',').strip()
            if raw_value == 'true':
                value = True
            elif raw_value == 'false':
                value = False
            else:
                try:
                    value = ast.literal_eval(raw_value)
                except (ValueError, SyntaxError):
                    # Not a Python-style literal: keep the raw text.
                    value = raw_value
            current_product[key.strip()] = value

        main_name = hxs.select(
            '//h1[@itemprop="name"]/text()').extract()[0].strip()
        # The first breadcrumb entry is dropped.
        categories = hxs.select(
            '//div[@id="breadcrumb"]//span[@itemprop="title"]/text()').extract(
            )[1:]

        for p in products_data:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_xpath(
                'image_url', '//img[@itemprop="image"]/@src',
                lambda a: urljoin_rfc(base_url, a[0]) if a else '')
            loader.add_value('identifier', p['skuId'])
            loader.add_value('sku', p['productexternalid'])
            loader.add_value('price', p['priceAsDecimal'])
            loader.add_value('stock', p['stockquantity'])
            loader.add_value('category', categories)
            loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
            loader.add_value('url', response.url)
            loader.add_value(
                'name', main_name + ' - ' + p['colour'] + ' - ' + p['size'])

            yield loader.load_item()
예제 #7
0
    def parse_product(self, response):
        """Yield items for a product page in one of three layouts.

        1. Rows with class ``Multi-Child_Background``: one item per row.
        2. ``<select>`` elements in the options table: one item per option
           of the first select.
        3. Otherwise: a single item for the page.

        All items share the page's category, image, brand and the
        "Call for best price!" out-of-stock marker.

        :param response: the downloaded product page.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        og_image = hxs.select(
            '//meta[@property="og:image"]/@content').extract()
        image_url = urljoin_rfc(base_url, og_image[0]) if og_image else og_image

        breadcrumb_links = hxs.select(
            '//td[contains(@class,"breadcrumb")]//a/text()').extract()
        category = (breadcrumb_links[-1].strip()
                    if breadcrumb_links else breadcrumb_links)

        manufacturer = hxs.select(
            '//div[contains(@id, "ProductDetail_Tech")]//table//tr/td[contains(text(),"Manufacturer")]/following-sibling::td/text()'
        ).extract()
        brand = manufacturer[0].strip() if manufacturer else manufacturer

        out_of_stock = hxs.select(
            '//td[contains(@id,"productdetail-action-wrapper")]//span[contains(text(),"Call for best price!")]/text()'
        )

        # Layout 1: one table row per child product.
        child_rows = hxs.select('//tr[@class="Multi-Child_Background"]')
        if child_rows:
            for row in child_rows:
                loader = ProductLoader(item=Product(), selector=row)
                loader.add_xpath(
                    'name',
                    'td[@class="productnamecolorSMALL colors_productname"]/text()'
                )
                loader.add_xpath('sku',
                                 'td[@class="smalltext colors_text"]/text()')
                loader.add_xpath('identifier',
                                 'td[@class="smalltext colors_text"]/text()')
                loader.add_value('url', response.url)
                loader.add_xpath(
                    'price',
                    'td[@class="smalltext colors_text"]/b/div/div/span/text()')
                loader.add_value('category', category)
                loader.add_value('image_url', image_url)
                loader.add_value('brand', brand)
                if out_of_stock:
                    loader.add_value('stock', 0)
                yield loader.load_item()
            return

        # Layout 2: option drop-downs.
        option_selects = hxs.select('//table[@id="options_table"]//select')
        if option_selects:
            option_lists = [
                select.select('option/text()').extract()
                for select in option_selects
            ]
            name = hxs.select(
                '//span[@itemprop="name"]/text()').extract()[0]
            # NOTE(review): every option of the second and later selects is
            # appended to each first-select option (not a cartesian
            # product) -- confirm this is intended.
            trailing = ''.join(' ' + option
                               for options in option_lists[1:]
                               for option in options)
            for full_name in [first + trailing for first in option_lists[0]]:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('name', name + ' ' + full_name)
                loader.add_xpath('sku',
                                 '//span[@class="product_code"]/text()')
                loader.add_xpath('identifier',
                                 '//span[@class="product_code"]/text()')
                loader.add_value('url', response.url)
                price_texts = hxs.select(
                    '//span[@itemprop="price"]/text()').extract()
                loader.add_value('price',
                                 price_texts[0] if price_texts else 0)
                loader.add_value('category', category)
                loader.add_value('image_url', image_url)
                loader.add_value('brand', brand)
                if out_of_stock:
                    loader.add_value('stock', 0)
                yield loader.load_item()
            return

        # Layout 3: plain single product.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//span[@itemprop="name"]/text()')
        loader.add_xpath('sku', '//span[@class="product_code"]/text()')
        loader.add_xpath('identifier',
                         '//span[@class="product_code"]/text()')
        loader.add_value('url', response.url)
        loader.add_value('category', category)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        if out_of_stock:
            loader.add_value('stock', 0)
        price_texts = hxs.select(
            '//span[@itemprop="price"]/text()').extract()
        loader.add_value('price', price_texts[0] if price_texts else 0)
        yield loader.load_item()
예제 #8
0
    def parse_product(self, response):
        """Yield items for a product page; sold-out pages yield nothing.

        When the page has an options drop-down, one item is produced per
        option (identifier ``"<sku>-<option value>"``); otherwise a single
        item is produced with the SKU as identifier.

        :param response: the downloaded product page; ``response.meta``
            must carry ``cat_name`` or ``_product['category']``.
        :raises IndexError: when the product title is missing.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        sold_out = hxs.select(
            "//form/img[contains(concat('',@src,''), 'soldout')]"
        ).extract()

        url = response.url
        name = hxs.select("//div[@class='product-order']/h1/text()").extract()[0]
        shipping_cost = ''
        if sold_out:
            return

        # Price sources in order: sale price, table cell, hidden base price.
        price = (
            hxs.select(
                "//div[@id='pit']//li[@class='rbsalep']/text()").extract()
            or hxs.select(
                "//div[@id='pit']//ul/table//table//tr[2]/td[2]/text()"
            ).extract()
            or ''
        )
        if not price:
            log.msg(' ::::: Base price :::::')
            log.msg(response.url)
            base_price = hxs.select(
                '//input[@id="baseprice"]/@value').extract()
            price = base_price[0] if base_price else ''

        if 'cat_name' in response.meta:
            category = response.meta["cat_name"]
        else:
            category = response.meta['_product']['category']

        image_url = hxs.select("//div[@class='details-left']/table/tr/td/a/img/@src").extract()

        brand = hxs.select('//div[@class="about-item"]/ul/li[contains(b/text(),"Manufacturer:")]/text()').extract()
        if not brand:
            self.log("ERROR brand not found")
            brand = ''

        sku_texts = hxs.select('//div[@class="about-item"]/ul/li[contains(b/text(),"SKU:")]/text()').extract()
        if sku_texts:
            sku = sku_texts[0]
        else:
            self.log("ERROR sku not found")
            sku = ''

        loader = ProductLoader(response=response, item=Product())

        # An "addtocart" block means in stock; a soldout image means not.
        instock = hxs.select('//form[@id="cartForm"]//div[@id="addtocart"]/@id').extract()
        if instock:
            loader.add_value("stock", int(1))
        else:
            outofstock = hxs.select('//form[@id="cartForm"]/img[contains(@src,"soldout.gif")]/@src').extract()
            if outofstock:
                loader.add_value("stock", int(0))
            else:
                self.log("ERROR outofstock not found, instock not found")

        options = hxs.select('//select[@id="Options"]/option[@value!="Select Options"]')
        if not options:
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('sku', sku)
            loader.add_value("identifier", sku)
            loader.add_value('category', category)
            loader.add_value('image_url', image_url)
            loader.add_value('shipping_cost', shipping_cost)
            if brand:
                loader.add_value('brand', brand)
            yield loader.load_item()
            return

        for option in options:
            # Each option gets its own loader; the one above is not reused.
            option_loader = ProductLoader(response=response, item=Product())
            option_loader.add_value('url', url)
            option_name = option.select('text()').extract()[0]
            option_id = option.select('@value').extract()[0]
            option_loader.add_value('name', name + ' - ' + option_name)
            option_loader.add_value('price', price)
            option_loader.add_value('sku', sku)
            option_loader.add_value("identifier", sku + '-' + option_id)
            option_loader.add_value('category', category)
            option_loader.add_value('image_url', image_url)
            option_loader.add_value('shipping_cost', shipping_cost)
            option_loader.add_value("stock", int(1) if instock else int(0))
            if brand:
                option_loader.add_value('brand', brand)
            yield option_loader.load_item()
 def _start_requests(self):
     """Yield one request for a fixed product URL.

     The request is handled by ``parse_product`` and carries an empty
     ``Product`` under the ``product`` meta key.
     """
     product_url = 'http://www.advantage-catering-equipment.co.uk/sterling-pro-triple-door-bottle-cooler.html'
     request_meta = {'product': Product()}
     request = Request(product_url,
                       callback=self.parse_product,
                       meta=request_meta)
     yield request
예제 #10
0
    def parse_product(self, response):
        """Yield one item per variant found in the page's inline script.

        Variant data is read from the ``allVariants`` / ``variantsAray``
        script variables and per-variant images from ``mediaJSON``. When
        no variant data is found, the base product alone is yielded.
        Variants priced below 50 get a shipping cost of 5.99.

        :param response: the downloaded product page.
        """
        # Local import: literal parsing replaces eval() on page content.
        import ast

        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        name = ''.join(hxs.select('//h1//text()').extract()).strip()
        product_loader.add_value('name', name)
        product_loader.add_value('brand', 'CamelBak')
        # The first breadcrumb entry is dropped.
        category = hxs.select(
            '//div[@class="breadcrumb"]/ul/li/a/text()').extract()[1:]
        product_loader.add_value('category', category)

        variants_script = hxs.select(
            '//script[contains(text(), "var allVariants={")]/text()')
        # Both captures are parsed as literals rather than executed. JSON
        # parsing is not used because the second pattern requires a
        # trailing comma before the closing bracket.
        options_values = variants_script.re(r'var variantsAray=(\[.*\]);')
        if options_values:
            options_values = ast.literal_eval(options_values[0])
        options = variants_script.re(
            r'allVariants={"variants":(\[.*\,])\}\;')
        if options:
            options = ast.literal_eval(options[0])

        option_images = {}
        media_json = re.findall("var mediaJSON='(.*)';if", response.body)
        if media_json and media_json[0]:
            images = json.loads(media_json[0])
            for image in images["imageList"]:
                sku_id = image.get('skuId', None)
                if not sku_id:
                    continue
                option_image = hxs.select('//div[@data-value="' +
                                          image['colour'] +
                                          '"]/img/@src').extract()
                if option_image:
                    # Request the 500x500 rendition of the swatch image.
                    image_url = add_or_replace_parameter(
                        option_image[0], 'wid', '500')
                    image_url = add_or_replace_parameter(
                        image_url, 'hei', '500')
                else:
                    image_url = ''
                option_images[sku_id] = image_url

            initial_image = images['initialImage']['imageURL']
            product_loader.add_value('image_url', initial_image)

        product = product_loader.load_item()

        if options and options_values:
            for option in options:
                prod = Product(product)
                sku = option['skuId']
                prod['identifier'] = sku
                prod['sku'] = sku
                # Compare by value: identity (``is not``) against a string
                # literal is implementation-dependent.
                prod['name'] = prod['name'].strip() + ' ' + ' '.join(
                    option[k] for k in options_values
                    if option[k] != 'null').decode('utf-8')
                prod['price'] = extract_price(option['RP'])
                if option['isInStock'] != 'true':
                    prod['stock'] = 0
                if option_images.get(sku):
                    prod['image_url'] = option_images[sku]

                if prod['price'] < 50:
                    prod['shipping_cost'] = 5.99
                yield prod
        else:
            yield product
예제 #11
0
    def parse_product_option(self, response):
        """Yield one product item per option combination on an option page.

        Reads ``option_name``, ``option_id``, ``category``, ``name``, ``url``
        and ``size`` from ``response.meta``.  Every ``<select>`` inside a
        ``prodpageoptionvalue`` div contributes one choice per item:

        * more than one select: the cartesian product of all selects;
        * exactly one select: one item per option of that select;
        * no select: a single item for the base option.

        A choice whose label contains "do not add" still extends the
        identifier but leaves the name and price of the item untouched.
        Nothing is yielded when the page reports the item as unavailable.
        """
        if "The item is not currently available." in response.body:
            return

        meta = response.meta
        category = meta.get('category')
        url = meta.get('url')
        name = meta.get('name') + ' ' + meta.get('option_name')
        sku = response.xpath(
            '//span[@itemprop="productID"]/text()').extract_first()
        price = extract_price(response.xpath(
            '//input[@name="ActProdPrice"]/@value').extract_first())
        image_url = response.xpath('//*[@id="main_img"]/@src').extract_first()
        brand = response.xpath(
            '//input[@name="ProdMfgName"]/@value').extract_first()
        out_of_stock = response.xpath(
            '//div[@class="outofstockdiv itemgroup-outofstock"]'
        ).extract_first()
        identifier = response.xpath(
            '//input[@name="ProdID"]/@value').extract_first()
        identifier += '_' + meta.get('option_id')

        def find_player(product_name):
            """Return ``(name, number)`` of the first player in ``self.teams``
            whose name occurs in ``product_name``, or None."""
            product_name = product_name.upper()
            for players in self.teams.itervalues():
                for player in players.itervalues():
                    player_name = player['name'].decode('utf')
                    # A "first word equals the player name" check would be
                    # redundant here: it implies this substring test.
                    if player_name.upper() in product_name:
                        return player_name, player['number']
            return None

        def build_item(item_name, item_identifier, item_price):
            """Load a product for one option combination and attach metadata."""
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', item_name)
            loader.add_value('identifier', item_identifier)
            loader.add_value('sku', sku)
            loader.add_value('category', category)
            loader.add_value('url', url)
            loader.add_value('image_url', response.urljoin(image_url))
            loader.add_value('price', item_price)
            loader.add_value('brand', brand)
            if out_of_stock:
                loader.add_value('stock', 0)
            if item_price < self.free_shipping_over:
                loader.add_value('shipping_cost', self.shipping_cost)
            item = loader.load_item()

            metadata = KitBagMeta()
            metadata['size'] = response.meta['size']
            player = find_player(item['name'])
            if player is not None:
                metadata['player'], metadata['number'] = player
            item['metadata'] = metadata
            return item

        # One list of (value, label, surcharge) tuples per <select>.
        selects = response.xpath('//div[@class="prodpageoptionvalue"]/select')
        choices_per_select = []
        for select in selects:
            choices = []
            for option in select.xpath('./option[@value!=""]'):
                choice_id = option.xpath('./@value').extract_first()
                label = option.xpath('./text()').extract_first()
                label, surcharge = extract_option_price(label)
                choices.append((choice_id, label, surcharge))
            choices_per_select.append(choices)

        if len(selects) > 1:
            for combination in itertools.product(*choices_per_select):
                o_name, o_price, o_identifier = name, price, identifier
                for choice_id, label, surcharge in combination:
                    o_identifier = o_identifier + '_' + choice_id
                    if 'do not add' not in label.lower():
                        o_name = o_name + ' ' + label
                        o_price = o_price + surcharge
                yield build_item(o_name, o_identifier, o_price)
        elif choices_per_select:
            for choice_id, label, surcharge in choices_per_select[0]:
                # Start from the base values for every choice so that a
                # "do not add" choice never inherits the name/price computed
                # for the previous choice.
                o_name, o_price = name, price
                if 'do not add' not in label.lower():
                    o_name = name + ' ' + label
                    o_price = price + surcharge
                yield build_item(o_name, identifier + '_' + choice_id, o_price)
        else:
            yield build_item(name, identifier, price)
예제 #12
0
 def _start_requests(self):
     """Yield a single request for one fixed product page.

     The request carries an empty ``Product`` under the ``product`` meta key
     and is handled by ``parse_product``.
     """
     product_url = 'http://www.notebooksbilliger.de/logitech+k830+illuminated+living+room+keyboard/eqsqid/dc034145-ba5e-417d-b751-99748adbb8b8'
     request_meta = {'product': Product()}
     yield Request(product_url,
                   meta=request_meta,
                   callback=self.parse_product)
예제 #13
0
    def parse_product(self, response):
        """Scrape a product page and yield the product or a stock request.

        The identifier is taken from the URL; pages whose identifier is
        already in ``self.seen_ids`` are skipped.  When the page has
        ``<option>`` elements whose class contains the identifier, one item
        per option is prepared and handed to ``parse_stock`` through the
        ``items`` meta key of a stock request; otherwise the single product
        is yielded directly.
        """
        log.msg(response.url)
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        # Last dash-separated token of the final URL segment, minus extension.
        identifier = response.url.split('/')[-1].split('-')[-1].split('.')[0]
        log.msg('Identifier: %s' % identifier)
        log.msg(repr(self.seen_ids))
        if identifier in self.seen_ids:
            return
        self.seen_ids.append(identifier)
        loader.add_value('identifier', identifier)
        sku = hxs.select('//p[@class="pmeta"]/text()').re(r'(\d+)')
        loader.add_value('sku', sku)
        name = hxs.select('//div[@class="prod-box"]/h1//text()').extract()
        # A second text node under the <h1>, when present, is appended to
        # the product / option names below.
        extra_data = name[1].strip() if len(name) > 1 else ''
        loader.add_value('name', name[0])
        # price
        price = re.sub(
            '[\r\n\t]+', ' ',
            hxs.select(
                '//h5[@class="product-price"]//div[contains(@id,"StaticPrice")]/span/text()[normalize-space()]'
            )[0].extract())
        loader.add_value('price', price)
        # image_url
        image_url = hxs.select('//img[@class="product-image"]/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        # brand
        loader.add_value('brand', 'Le Creuset')
        # category: breadcrumb links after the first two
        category = hxs.select('//ul[@class="breadcrumbs"]')[0].select(
            './/a/text()').extract()
        loader.add_value('category', ' > '.join(category[2:]))
        # shipping_cost: no value is set for prices of 40.00 and above
        price = Decimal(loader.get_output_value('price'))
        if price < 20.00:
            loader.add_value('shipping_cost', '2.00')
        elif price < 40.00:
            loader.add_value('shipping_cost', '4.99')

        product = loader.load_item()

        options = hxs.select('.//select/option[contains(@class,"%s")]' %
                             identifier)
        if not options:
            # Strip like the options branch does, so an empty ``extra_data``
            # does not leave a trailing space in the name.
            product['name'] = ('%s %s' % (product['name'], extra_data)).strip()
            yield product
            return

        sid = hxs.select(
            '//input[@type="hidden" and @name="SID"]/@value')[0].extract()
        stock_url = 'http://www.hartsofstur.com/cgi-bin/st000001.pl?ACTION=GETSTOCK&REF=%(identifier)s&SID=%(sid)s&timestamp=%(timestamp)s'
        items = []
        for option in options:
            item = copy.deepcopy(product)
            option_name = option.select('./text()')[0].extract().strip()
            option_identifier = option.select('./@class').re(r'_(\d+)_')[0]
            self.seen_ids.append(option_identifier)
            item['identifier'] = "%s_%s" % (identifier,
                                            option_identifier.strip())
            item['name'] += ' %s %s' % (option_name, extra_data)
            item['name'] = item['name'].strip()
            items.append(item)
        yield Request(stock_url % {
            'identifier': identifier,
            'sid': sid,
            'timestamp': int(time.time())
        },
                      meta={'items': items},
                      callback=self.parse_stock)
 def parse_product(self, response):
     """Yield one product for the page.

     The name comes from ``response.meta['name']`` and the price from the
     ``<span>`` inside the element with id ``price-text``.
     """
     # The loader builds its own selector from ``response``; the separate
     # HtmlXPathSelector that used to be created here was never used.
     loader = ProductLoader(item=Product(), response=response)
     loader.add_value('name', response.meta['name'])
     loader.add_xpath('price', '//*[@id="price-text"]/span/text()')
     yield loader.load_item()
예제 #15
0
    def parse(self, response):
        """Yield the product on this page whose SKU matches the requested one.

        ``response.meta`` supplies ``sku``, ``notes`` (an alternative SKU) and
        ``name``.  The main product is checked first, then the sub-products of
        the main container, then those of the secondary container.  The first
        product whose SKU equals either requested SKU is yielded with the
        requested ``sku`` and ``name``; the search stops there, or when a
        matching product has no price.
        """
        hxs = HtmlXPathSelector(response)

        url = response.url
        sku = response.meta['sku']
        sec_sku = response.meta['notes']
        name = response.meta['name'].encode('ascii', 'ignore')
        wanted_skus = (sku, sec_sku)

        def build_item(price_text):
            """Load the product item for the matched price text."""
            loader = ProductLoader(item=Product(), response=response,
                                   selector=hxs)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price_text.strip())
            loader.add_value('sku', sku)
            return loader.load_item()

        main_product = hxs.select("//div[@id='Product-MainProduct']")
        main_product_sku = main_product.select(
            ".//div[@id='Product-lblItem']/span[@id='lblItem']/text()"
        ).extract()
        if not main_product_sku:
            logging.error("NO MAIN SKU! %s" % url)
        elif main_product_sku[0] in wanted_skus:
            price = main_product.select(
                ".//div[@class='Product-Price']/span[@id='lblClubPrice']/b/font/text()"
            ).re(r"\$(.*)")
            if not price:
                logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' % (sku, name, url))
                return
            yield build_item(price[0])
            return

        # (container xpath, sku xpath, price xpath, missing-sku log message)
        sub_product_groups = (
            ("//div[@id='Product-MainProductContainer']//div[@class='Product-SubProduct']",
             ".//div[@class='Product-SubProductNumber']/font/text()",
             ".//span[contains(@id, 'lblClubPrice')]/b/font/text()",
             "NO MAIN SKU! %s"),
            ("//div[@id='Product-SubProductContainer']//div[@class='Product-SubProduct']",
             ".//div[@class='Product-SubProductNumber']/text()",
             ".//span[contains(@id, 'lblClubPrice2')]/b/font/text()",
             "NO SECONDARY SKU! %s"),
        )

        # Both containers are searched in turn: a main container holding
        # only non-matching sub-products must not hide a match in the
        # secondary container.
        found_sub_products = False
        for container_xpath, sku_xpath, price_xpath, no_sku_msg in sub_product_groups:
            for sub_product in hxs.select(container_xpath):
                found_sub_products = True
                product_sku = sub_product.select(sku_xpath).re(r"#(.+)")
                if not product_sku:
                    logging.error(no_sku_msg % url)
                    continue
                if product_sku[0] not in wanted_skus:
                    continue

                price = sub_product.select(price_xpath).re(r"\$(.*)")
                if not price:
                    logging.error('ERROR!! NO SEC PRICE!! %s "%s" "%s"' % (sku, name, url))
                    return
                yield build_item(price[0])
                return

        if not found_sub_products:
            logging.error("No products found!")
예제 #16
0
    def parse_product(self, response):
        """Yield the product on the page, or one product per colour.

        When the page has more than one colour ``<input>``, the ``spConfig``
        script is parsed as JSON and one item is yielded per option of
        attribute ``'76'``, named ``"<product name> <colour label>"`` and
        priced from the option's first child product.  Otherwise a single
        item is yielded from the regular-price markup.

        NOTE: only url, name, price, category and image_url are populated;
        identifier, sku, brand, metadata and shipping_cost are not set here.
        """
        hxs = HtmlXPathSelector(response)

        url = response.url
        category = response.meta["cat_name"]
        image_url = hxs.select(
            "//div[contains(@class, 'product-img-box')]"
            "/p[contains(@class, 'product-image')]/img/@src").extract()

        def build_item(name, price):
            """Load one product with the fields shared by both branches."""
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('category', category)
            loader.add_value('image_url', image_url)
            return loader.load_item()

        names = hxs.select("//div[@class='product-name']/h1/text()").extract()

        colours = hxs.select("//div[@class='colours']//input").extract()
        if len(colours) > 1:
            # Keep everything after the first "(" of the script and cut it
            # just before the "priceFromLabel" key, then re-close the object.
            script_parts = hxs.select(
                "//script[contains(text(), 'spConfig')]/text()"
                ).extract()[0].split("(")
            config_text = "".join(script_parts[1:]).split(
                ',"priceFromLabel"')[0] + '}'
            config = json.loads(config_text)
            for option in config['attributes']['76']['options']:
                child = config['childProducts'].get(option['products'][0])
                # A child product missing from the config is treated like a
                # missing price instead of raising on ``None[...]``.
                price = child['finalPrice'] if child else ""
                if not price:
                    price = ""
                yield build_item(names[0] + " " + option['label'], price)
        else:
            price = hxs.select(
                "//div[@class='price-box']//span[@class='regular-price']"
                "/span/text()").extract()
            if not price:
                price = ""
            yield build_item(names, price)
예제 #17
0
    def parse_product(self, response):
        """Yield the product on the page, expanded into its variants.

        The base item is built from the page; its category mirrors the brand
        passed in ``response.meta``.  Then:

        * if a ``spConfig`` ``Product.Config(...)`` JSON is present, one item
          is yielded per configured product, with the option labels appended
          to the name and the option prices added to the base price;
        * else if a ``Product.Bundle(...)`` JSON is present, one item is
          yielded per combination of bundle selections (price unchanged);
        * otherwise the base item is yielded as is.

        Items priced at 49.99 or less get a shipping cost of 2.95.  Pages
        without a product identifier or without a price are logged and
        skipped.
        """
        hxs = HtmlXPathSelector(response)

        brand = response.meta.get('brand', '')

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//div[@id="productname"]/text()')
        loader.add_value('url', response.url)
        loader.add_value('brand', brand)
        loader.add_value('category', brand)

        identifier = response.xpath(
            '//input[@name="product"]/@value').extract()
        if not identifier:
            log.msg('PRODUCT WHIOUT IDENTIFIER: ' + response.url)
            return

        loader.add_value('sku', identifier[0])
        loader.add_value('identifier', identifier[0])
        image_url = response.css('.main-image img::attr(src)').extract()
        if image_url:
            loader.add_value('image_url', image_url[0])

        # Try the three known price layouts in order.
        price = response.xpath('//span[@id="product-price-' + identifier[0] +
                               '" and @class="price"]/text()').extract()
        if not price:
            price = response.xpath('//span[@id="product-price-' +
                                   identifier[0] +
                                   '"]/span[@class="price"]/text()').extract()
        if not price:
            price = hxs.select(
                '//div[@id="product_price"]//span[@class="price"]/text()'
            ).extract()
        if not price:
            log.msg('PRODUCT WITHOUT PRICE: ' + response.url)
            return

        loader.add_value('price', price[-1])

        in_stock = response.xpath('//p[@class="availability in-stock"]')
        if not in_stock:
            loader.add_value('stock', '0')

        if loader.get_output_value('price') <= 49.99:
            loader.add_value('shipping_cost', 2.95)

        item = loader.load_item()

        def finish_variant(option_item):
            """Apply the stock and shipping rules to a variant item."""
            if not option_item['price']:
                option_item['stock'] = 0

            if option_item['price'] <= 49.99:
                option_item['shipping_cost'] = 2.95
            elif 'shipping_cost' in option_item:
                # The variant was copied from a cheaper base item; do not
                # keep the base shipping cost once the price is above the
                # threshold.
                del option_item['shipping_cost']
            return option_item

        options_config = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)
        if options_config:
            product_data = json.loads(options_config.groups()[0])
            products = {}
            prices = {}
            for attr in product_data['attributes'].itervalues():
                for option in attr['options']:
                    for product in option['products']:
                        # Labels accumulate as " - <label> - <label>".
                        products[product] = ' - '.join(
                            (products.get(product, ''), option['label']))
                        prices[product] = prices.get(
                            product, 0) + extract_price(option['price'])

            for option_identifier, option_name in products.iteritems():
                option_item = deepcopy(item)
                option_item['identifier'] += '-' + option_identifier
                option_item['name'] += option_name
                option_item['price'] += prices[option_identifier]
                yield finish_variant(option_item)
            return

        options_bundle = re.search(r'new Product.Bundle\((.*)\)',
                                   response.body)
        if not options_bundle:
            yield item
            return

        log.msg('OPTION BUNDLE: ' + response.url)
        product_data = json.loads(options_bundle.groups()[0])
        # One list of (selection id, selection name) pairs per bundle option.
        selections_per_option = []
        for bundle_option in product_data['options'].itervalues():
            selections_per_option.append(
                [(selection_id, selection['name'])
                 for selection_id, selection
                 in bundle_option['selections'].iteritems()])

        for combination in itertools.product(*selections_per_option):
            option_item = deepcopy(item)
            for selection_id, selection_name in combination:
                option_item['identifier'] += '-' + selection_id
                option_item['name'] += ' ' + selection_name
            yield finish_variant(option_item)
예제 #18
0
    def parse_product(self, response):
        """Yield one product per purchasable option on the page.

        Three page layouts are handled, in this order:

        1. table rows whose class contains ``item-row``: one item per row;
        2. hidden inputs whose id contains ``HidStockOptionDetails``: one
           item per input, read from the JSON in its ``value`` attribute;
        3. neither: a single item identified by the page URL.

        The page-level shipping cost (0 when absent) is only attached to
        items priced below 50.
        """
        hxs = HtmlXPathSelector(response)

        category = response.xpath(
            '//table[@class="history-menu-table"]//a/text()').extract()[1:-2]
        image_url = response.xpath(
            '//img[@id="ModelsDisplayStyle1_ImgModel"]/@src').extract()
        image_url = response.urljoin(image_url[0]) if image_url else ''

        product_brand = response.xpath(
            '//a[@class="brand-image-link"]/@title').extract()
        product_brand = product_brand[0].strip() if product_brand else ''

        shipping_cost = response.xpath(
            '//span[@id="ModelsDisplayStyle1_LblPostageCostValue"]/text()'
        ).extract()
        shipping_cost = extract_price(shipping_cost[0]) if shipping_cost else 0

        name = ' '.join(
            response.xpath(
                '//h1/span[not(@class="models-page-title-price")]/text()').
            extract())

        rows = response.xpath('//tr[contains(@class, "item-row")]')
        if rows:
            for row in rows:
                name2 = row.xpath(
                    './/a[contains(@class, "option-text")]/text()').extract()
                if not name2:
                    name2 = row.xpath(
                        './/span[contains(@class, "option-text")]/text()'
                    ).extract()
                option_name = name + ' ' + name2[0].strip() if name2 else name

                price = row.xpath(
                    './/span[contains(@class, "price-label")]/text()').extract(
                    )[0]
                sku = row.xpath(
                    './/td[contains(@class, "item-part-code")]/text()'
                ).extract()[0].strip()

                # The stock id sits in the basket link, or in the stock
                # alert link when the basket link is absent.
                identifier = row.xpath(
                    './/a[@class="add-to-basket-button"]/@href').re(
                        r'StockID=(\d+)')
                if not identifier:
                    identifier = row.xpath(
                        './/a[@class="request-stock-alert-link"]/@onclick').re(
                            r'StockID=(\d+)')
                identifier = identifier[0]
                loader = ProductLoader(item=Product(), selector=row)
                # ``identifier`` is an extracted value, not an XPath
                # expression, so it must be added with add_value.
                loader.add_value('identifier', identifier)
                loader.add_value('sku', sku)
                loader.add_value('url', response.url)
                loader.add_value('name', option_name)
                loader.add_value('price', price)
                loader.add_value('category', category)
                loader.add_value('image_url', image_url)
                loader.add_value('brand', product_brand)
                in_stock = row.xpath(
                    './/td[contains(@class, "item-in-stock")]')
                if not in_stock:
                    loader.add_value('stock', 0)
                else:
                    stock_level = in_stock.re(r'\d+')
                    if stock_level:
                        loader.add_value('stock', int(stock_level[0]))
                if loader.get_output_value('price') < 50:
                    loader.add_value('shipping_cost', shipping_cost)

                yield loader.load_item()
            return

        hidden_options = response.xpath(
            '//input[contains(@id, "HidStockOptionDetails")]')
        if hidden_options:
            for hidden_option in hidden_options:
                option_data = json.loads(
                    hidden_option.xpath('@value').extract()[0])
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', response.url)
                loader.add_value('name',
                                 name + ' ' + option_data['option'])
                loader.add_value('price',
                                 extract_price(str(option_data['price'])))
                loader.add_value('identifier', option_data['stockID'])
                loader.add_value('image_url', image_url)
                loader.add_value('category', category)
                loader.add_value('sku', option_data['partcode'])
                loader.add_value('brand', product_brand)
                stock_level = re.findall(r'\d+',
                                         option_data['stockLevelText'])
                if stock_level:
                    loader.add_value('stock', int(stock_level[0]))
                else:
                    self.log('POSSIBLE OUT OF STOCK : ' + response.url)
                if loader.get_output_value('price') < 50:
                    loader.add_value('shipping_cost', shipping_cost)
                yield loader.load_item()
            return

        self.log(' >>> NO OPTIONS FOUND: ' + response.url)
        price = "".join(
            hxs.select(".//span[@class='bigprice']/text()").re(
                r'([0-9\,\. ]+)')).strip()
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('identifier', response.url)
        loader.add_value('image_url', image_url)
        loader.add_value('category', category)
        # NOTE(review): this XPath is relative to the whole response, not to
        # a table row, so it presumably matches nothing here -- confirm.
        loader.add_xpath('sku', './td[position()=2]/text()')
        loader.add_value('brand', product_brand)
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', shipping_cost)
        yield loader.load_item()
예제 #19
0
    def parse_product(self, response):
        """Yield the product on the page, or one product per configured variant.

        Redirected responses are logged and skipped.  When a ``spConfig``
        ``Product.Config(...)`` JSON is present, one item is yielded per
        configured product, identified as ``<product id>_<variant id>`` and
        priced at the config's ``basePrice``.  Otherwise a single item is
        yielded with the price read from the ``product-price-<id>`` element,
        falling back to ``old-price-<id>``.

        Shipping cost is 2.99 for prices below 25 and 0 otherwise.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        redirected_urls = response.meta.get('redirect_urls', None)
        if redirected_urls:
            log.msg('Skips product, redirected url: ' +
                    str(redirected_urls[0]))
            return

        image_url = hxs.select('//a[@id="cloud_zoom"]/img/@src').extract()
        try:
            product_identifier = hxs.select(
                '//input[@name="product"]/@value').extract()[0].strip()
        except IndexError:
            # No hidden "product" input: take the id from the form action.
            product_identifier = hxs.select(
                '//form[@id="product_addtocart_form"]/@action').re(
                    r'/product/(\d+)')[0]
        product_name = hxs.select(
            '//div[@class="product-name"]/h1/text()').extract()[0].strip()
        category = response.meta.get('category')
        brand = hxs.select(
            '//div[contains(@class, "product-shop")]/a/img/@title').extract()
        brand = brand[0].strip() if brand else ''
        out_of_stock = hxs.select(
            '//p[@class="availability out-of-stock"]').extract()

        def build_item(identifier, name, price, shipping_cost):
            """Load one product; ``identifier`` doubles as the sku."""
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('identifier', identifier)
            product_loader.add_value('sku', identifier)
            product_loader.add_value('name', name)
            if image_url:
                product_loader.add_value('image_url',
                                         urljoin_rfc(base_url, image_url[0]))
            product_loader.add_value('price', price)
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)
            product_loader.add_value('shipping_cost', shipping_cost)
            if out_of_stock:
                product_loader.add_value('stock', 0)
            return product_loader.load_item()

        options_config = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)
        if options_config:
            product_data = json.loads(options_config.groups()[0])
            products = {}
            for attr in product_data['attributes'].itervalues():
                for option in attr['options']:
                    for product in option['products']:
                        # Labels accumulate as " - <label> - <label>".
                        products[product] = ' - '.join(
                            (products.get(product, ''), option['label']))

            # Every variant shares the base price, so parse it once.
            base_price = float(product_data['basePrice'])
            shipping_cost = 2.99 if base_price < 25 else 0
            for identifier, option_name in products.iteritems():
                yield build_item(product_identifier + '_' + identifier,
                                 product_name + option_name,
                                 round(base_price, 2),
                                 shipping_cost)
            return

        price = hxs.select('//*[@id="product-price-{}"]//text()'.format(
            product_identifier)).extract()
        price = ''.join(price).strip()
        if price == '':
            price = hxs.select('//*[@id="old-price-{}"]//text()'.format(
                product_identifier)).extract()
            price = ''.join(price).strip()
        price = extract_price(price)
        yield build_item(product_identifier, product_name, price,
                         2.99 if price < 25 else 0)
예제 #20
0
    def parse_product(self, response):
        """Yield product items scraped from a product page.

        When the page embeds a ``spConfig`` ``Product.Config(...)`` JSON
        payload, one item is built from the first ``productsData`` row of
        every attribute option.  Otherwise a single item is built from the
        page markup.  An item is only yielded when its identifier is not
        already in ``self.identifiers``; new identifiers are appended to it.

        Raises:
            ValueError: if the embedded ``spConfig`` payload is not valid
                JSON (previously masked by a bare ``except``).
        """
        hxs = HtmlXPathSelector(response)
        brand = hxs.select('//meta[@property="og:brand"]/@content').extract()
        brand = brand[0] if brand else ''
        category = response.meta.get('category', '')

        # The config assignment is searched for with and without spaces
        # around "=", anchored on the "original_product_name" declaration.
        config_match = re.search(
            r'var spConfig=new Product.Config\((.*)\).*var original_product_name;',
            response.body, flags=re.DOTALL)
        if not config_match:
            config_match = re.search(
                r'var spConfig = new Product.Config\((.*)\).*var original_product_name;',
                response.body, flags=re.DOTALL)

        if config_match:
            config_json = config_match.group(1)
        else:
            # Last resort: same-line match without the trailing anchor.
            found = re.findall(
                r'var spConfig = new Product.Config\((.*)\).*', response.body)
            config_json = found[0] if found else ''

        if config_match or config_json:
            products = json.loads(config_json)

            for attribute in products[u'attributes'].values():
                for option in attribute['options']:
                    # Only the first productsData row of each option is used.
                    # Row layout as consumed below: 0 identifier, 1 sku,
                    # 2 price, 3 image list, 4 stock, 5 name, 6 shipping text.
                    row = option['productsData'][0]
                    loader = ProductLoader(item=Product(), response=response)
                    loader.add_value('identifier', row[0])
                    loader.add_value('url', response.url)
                    loader.add_value('image_url', row[3][0] if row[3] else '')
                    loader.add_value('brand', brand)
                    loader.add_value('category', category)
                    loader.add_value('stock', row[4])
                    loader.add_value('name', row[5])
                    loader.add_value('shipping_cost', extract_price(row[6]))
                    loader.add_value('price', row[2])
                    loader.add_value('sku', row[1])

                    item = loader.load_item()

                    if item['identifier'] not in self.identifiers:
                        self.identifiers.append(item['identifier'])
                        yield item

        else:
            stock = hxs.select('//span[@class="stock_value"]/span/text()').re(r'(\d+)')

            # Price: nested price span, then the product-price span itself,
            # then the old-price block.
            price = hxs.select('//span[contains(@id, "product-price-")]/span[@class="price"]/text()').extract()
            if not price:
                price = hxs.select('//span[contains(@id, "product-price-") and @class="price"]/text()').extract()
            if not price:
                price = hxs.select('//span[contains(@class, "old-price")]/span[@class="price"]/text()').extract()

            shipping_cost = hxs.select('//span[contains(@id, "product-price-")]/span[contains(@class, "price-delivery")]/text()').extract()
            if not shipping_cost:
                shipping_cost = hxs.select('//span[contains(@id, "product-price-") and contains(@class, "price")]'
                    '/following-sibling::span[contains(@class, "price-delivery")]/text()').extract()

            loader = ProductLoader(item=Product(), response=response)
            loader.add_xpath('identifier', '//input[@name="product"]/@value')
            loader.add_value('url', response.url)
            loader.add_xpath('image_url', '//div[@id="product-images"]//img[@class="img-responsive"]/@src')
            loader.add_value('brand', brand)
            loader.add_value('category', category)
            # No stock figure on the page is recorded as zero stock.
            loader.add_value('stock', stock[0] if stock else 0)
            loader.add_xpath('name', '//div[contains(@class, "product-name")]/*[self::h1 or self::h2]/text()')
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('price', price)
            loader.add_xpath('sku', '//div[contains(@class, "product-name")]//span[@class="sku_value"]/text()')

            item = loader.load_item()
            if 'identifier' not in item:
                self.log("Warning: no identifier found, skiping product")
                return
            if item['identifier'] not in self.identifiers:
                self.identifiers.append(item['identifier'])
                yield item
예제 #21
0
    def parse_product(self, response):
        """Yield items for every size variation of a product, plus printed variants.

        Product data is read from JSON objects embedded in the page's
        ``<script>`` text.  For each entry in ``data['Variations']`` this
        yields the plain size item, optionally a custom-personalisation item
        (printing type 1, base product only) and, when printing type 3
        exists, one extra item per previously built item with that printing
        added.

        Reads ``self.free_delivery_over`` and ``self.shipping_cost`` to decide
        the shipping cost.  May also yield follow-up ``Request`` objects or
        delegate to ``self.parse``.
        """
        # The URL contains 'aspxerrorpath': re-request the first URL of the
        # redirect chain.
        # NOTE(review): there is no ``return`` here, so processing continues
        # on the current response as well — confirm this is intended.
        if 'aspxerrorpath' in response.url:
            yield Request(response.request.meta['redirect_urls'][0],
                          self.parse_product,
                          dont_filter=True)
        # If the page links to a "?cur=AUD" variant of itself, follow that
        # link and stop processing this response.
        aud_url = response.xpath(
            '//a[contains(@href, "?cur=AUD")]/@href').extract_first()
        if aud_url:
            yield Request(response.urljoin(aud_url),
                          self.parse_product,
                          dont_filter=True)
            return
        base_product = True
        add_custom_personalization = False
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('url', '//link[@rel="canonical"]/@href')
        loader.add_value('category', 'Kits')
        # "hero" product objects: variable name is 'product' + 7 digits.
        heros_data = response.xpath('//script/text()').re(
            'product\d{7} =(.+?});var')
        # Base product object: variable name is 'product' + 6 word characters.
        base_product_data = response.xpath('//script/text()').re(
            'product\w{6} =(.+?});var')
        if not base_product_data:
            # No embedded product data: hand the response to self.parse.
            for p in self.parse(response):
                yield p
            return
        # Pick the object describing the product shown on this page.
        if not heros_data:
            data = json.loads(base_product_data[0])
        elif len(heros_data) == 1:
            data = json.loads(heros_data[0])
            base_product = False
        else:
            # Several hero objects: use the one selected in the
            # 'select.heroShirts' dropdown, else fall back to the base product.
            data = [json.loads(x) for x in heros_data]
            data = {x['ProductID']: x for x in data}
            heros = response.css('select.heroShirts')
            hero = heros.xpath('option[@selected]')
            if not hero:
                data = json.loads(base_product_data[0])
            else:
                data = data[int(hero.xpath('@value').extract_first())]
                base_product = False

        base_product_data = json.loads(base_product_data[0])

        # Index the base product's printing items by PrintingTypeID.  Type 1
        # is treated as custom personalisation and is only offered when the
        # base product itself is being parsed.
        printings = {
            p['PrintingTypeID']: p
            for p in base_product_data['printingitems']
        }
        custom_printings = printings.get(1)
        if custom_printings and base_product:
            add_custom_personalization = True

        loader.add_value('name', data['Description'])
        loader.add_xpath('sku', '//script/text()', re='sku":"(.+?)"')
        if data['Brand']:
            loader.add_value('brand', data['Brand']['Name'])
        loader.add_value('image_url', response.urljoin(data['ImageURL']))
        # Template item holding the fields shared by every variation.
        product = loader.load_item()
        # Try to pull "<player name> <number>" out of a description of the
        # form "... with <name> <number>".
        player_from_name = re.search('(?!Sponsor).*with *([\w\ \.\-]+?) (\d+)',
                                     data.get('Description', ''), re.UNICODE)
        if player_from_name:
            player_name, number = player_from_name.groups()

        # One item per size variation.
        for variation in data['Variations']:
            size = variation['Description']
            loader = ProductLoader(item=Product(), response=response)
            # Replay the template item's fields into a fresh loader.
            loader.add_value(None, product)
            loader.replace_value('identifier', variation['VariationId'])
            loader.add_value('name', size)
            loader.replace_value('price', variation['PriceActual'])
            # Shipping is charged only below the free-delivery threshold.
            if self.free_delivery_over is not None and self.free_delivery_over > loader.get_output_value(
                    'price'):
                loader.replace_value('shipping_cost', self.shipping_cost)
            loader.replace_value('stock', int(variation['IsInStock']))
            item = loader.load_item()
            if player_from_name:
                item['metadata'] = {
                    'player': player_name,
                    'number': number,
                    'size': size
                }
            else:
                item['metadata'] = {'size': size}
            yield item
            # Items of this size that the badge printing is later applied to.
            base_size_items = [item]

            # Custom printing: a variant of the size item with a fixed
            # placeholder player name and number added.
            if add_custom_personalization:
                team_player_name = 'WILLIAMS'
                team_player_number = '10'
                team_player_id = 'WILLIAMS'
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, item)
                loader.add_value('name', team_player_name)
                loader.add_value('name', team_player_number)
                price = Decimal(item['price']) + Decimal(
                    str(custom_printings['PriceActual']))
                loader.replace_value('price', price)
                if self.free_delivery_over is not None and price >= self.free_delivery_over:
                    loader.replace_value('shipping_cost', 0)
                identifier = '-'.join(
                    (item['identifier'], str(custom_printings['PrintingID']),
                     team_player_id))
                loader.replace_value('identifier', identifier)
                custom_item = loader.load_item()
                custom_item['metadata'] = {
                    'player': team_player_name,
                    'number': team_player_number,
                    'size': size
                }
                yield custom_item
                base_size_items.append(custom_item)

            # Badges: printing type 3, applied on top of every item built for
            # this size (plain and, if present, custom-printed).
            printing = printings.get(3)
            if not printing:
                continue
            for base_item in base_size_items:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value(None, base_item)
                loader.add_value('name', printing['PrintingDescription'])
                price = Decimal(base_item['price']) + Decimal(
                    str(printing['PriceActual']))
                loader.replace_value('price', price)
                if self.free_delivery_over is not None and price >= self.free_delivery_over:
                    loader.replace_value('shipping_cost', 0)
                identifier = base_item['identifier'] + '-' + str(
                    printing['PrintingID'])
                loader.replace_value('identifier', identifier)
                badge_item = loader.load_item()
                # Copy so the badge item does not share the metadata dict.
                badge_item['metadata'] = base_item['metadata'].copy()
                yield badge_item
예제 #22
0
    def parse_product(self, response):
        """Scrape a product page, falling back to BeautifulSoup and a retry.

        The page is first parsed with XPath.  If that produces no ``name``
        the page is parsed again with BeautifulSoup.  Whichever item results
        is yielded when it carries an identifier.  If the name is still
        missing, ``self.retry`` is asked for a retry request, which is
        yielded when one is returned.

        Missing elements in the BeautifulSoup fallback are skipped instead
        of raising, so that the retry is actually reached on broken pages.
        """
        hxs = HtmlXPathSelector(text=response.body_as_unicode())

        loader = ProductLoader(response=response, item=Product())

        loader.add_value('url', response.url)
        identifier = hxs.select('//input[@id="catentryId"]/@value').extract()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

        price = ''.join(
            hxs.select('//div[@itemprop="price"]//span[@class="price"]//text()'
                       ).extract()).strip()
        loader.add_value('price', price)

        categories = hxs.select(
            '//ul[@class="breadcrumbs"]//li[not(@class="home")]/a/span/text()'
        ).extract()[1:]
        loader.add_value('category', categories)

        image_url = hxs.select('//img[@id="productMainImage"]/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), image_url[0]))

        brand = hxs.select(
            '//li[contains(text(), "BRAND")]/span/text()').extract()
        loader.add_value('brand', brand)

        item = loader.load_item()

        if not item.get('name'):
            # XPath parsing found no name: re-parse with BeautifulSoup.
            log.msg('Using BeautifulSoup: ' + response.url)
            loader = ProductLoader(response=response, item=Product())
            soup = BeautifulSoup(response.body)

            loader.add_value('url', response.url)
            identifier = soup.find('input', attrs={'id': 'catentryId'})
            identifier = _soup_el_get_attr(identifier, 'value')
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)

            # soup.find() returns None when nothing matches, so every lookup
            # is checked before it is dereferenced.
            name_el = soup.find('h1', attrs={'itemprop': 'name'})
            if name_el is not None:
                loader.add_value('name', name_el.text)

            breadcrumbs = soup.find('ul', attrs={'class': 'breadcrumbs'})
            if breadcrumbs is not None:
                categories = [
                    li.a.span.text
                    for li in breadcrumbs.findAll('li')
                    if li.a is not None and li.a.span is not None
                ][2:]
                loader.add_value('category', categories)

            price_box = soup.find('div', attrs={'itemprop': 'price'})
            price_el = None
            if price_box is not None:
                price_el = price_box.find('span', attrs={'class': 'price'})
            if price_el is not None:
                loader.add_value('price', price_el.text)

            image_el = soup.find('img', attrs={'id': 'productMainImage'})
            if image_el:
                loader.add_value(
                    'image_url',
                    urljoin_rfc(get_base_url(response),
                                _soup_el_get_attr(image_el, 'src')))

            brand = ''
            for li in soup.findAll('li'):
                if 'BRAND' in li.text.upper() and li.span is not None:
                    brand = li.span.text
                    break
            loader.add_value('brand', brand)

            item = loader.load_item()

        # .get() avoids a KeyError when the identifier was never populated.
        if item.get('identifier'):
            yield item

        if not item.get('name'):
            request = self.retry(response,
                                 "No name for product: " + response.url)
            if request:
                yield request
예제 #23
0
    def parse_product(self, response):
        """Scrape a product page and yield one item per variant.

        When the product-ID input is missing, the same URL is requested
        again with a fresh cookie jar, as long as fewer than
        ``self.max_retry_times`` retries have been made.  Pages without a
        variant selector yield the single base item.
        """
        sale_tail = ('/div[contains(@class, "sale") and contains(@class, "inc-vat")]'
                     '/span[@itemprop="price"]/text()')
        regular_tail = ('/div[contains(@class, "regular") and '
                        'contains(@class, "inc-vat")]/text()')

        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        def inc_vat_price(wrap_xpath):
            # Sale price when present, otherwise the regular price.
            found = selector.select(wrap_xpath + sale_tail).extract()
            if not found:
                found = selector.select(wrap_xpath + regular_tail).extract()
            return extract_price(found[0])

        title = selector.select('//h1[@class="product-name"]/text()').extract()

        id_values = selector.select(
            '//input[contains(@name, "ProductID")]/@value').extract()
        if not id_values:
            attempts = int(response.meta.get('retries', 0))
            if attempts < self.max_retry_times:
                retry_meta = {
                    'retries': attempts + 1,
                    'dont_merge_cookies': True,
                    'cookiejar': int(time.time()),
                    'dont_redirect': True,
                }
                yield Request(response.url,
                              meta=retry_meta,
                              dont_filter=True,
                              callback=self.parse_product)
            return
        product_id = id_values[0]

        sku_matches = selector.select(
            '//div[contains(@class, "list-item-sku-wrap")]/text()').re(
                'SKU: (.*)')
        sku_value = sku_matches[0].strip() if sku_matches else ''

        base_price = inc_vat_price('//div[@class="price-wrap"]')

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('price', base_price)

        stock_hint = ''.join(
            selector.select(
                '//span[contains(@class, "stock-hint")]/text()').extract())
        if 'IN STOCK' not in stock_hint.strip().upper():
            loader.add_value('stock', 0)

        loader.add_value('identifier', product_id)
        loader.add_value('name', title)
        loader.add_value(
            'category',
            selector.select(
                '//span[@class="SectionTitleText"]/a/text()').extract())

        brand_matches = selector.select(
            '//ul/li[contains(text(), "Brand:")]/text()').re('Brand: (.*)')
        loader.add_value('brand',
                         brand_matches[0].strip() if brand_matches else '')

        loader.add_value('sku', sku_value)
        loader.add_value('url', response.url)

        images = selector.select(
            '//img[contains(@class, "product-image")]/@src').extract()
        if images:
            loader.add_value('image_url', urljoin_rfc(page_base, images[0]))

        # Shipping is free once the price reaches 50.
        shipping = 0 if loader.get_output_value('price') >= 50 else 3.99
        loader.add_value('shipping_cost', shipping)

        item = loader.load_item()

        variants = selector.select('//select[@id="variantSelector"]/option')
        if not variants:
            yield item
            return

        for variant in variants:
            variant_item = deepcopy(item)
            variant_id = variant.select('@value').extract()[0]
            labels = variant.select('text()').extract()
            base_name = variant_item['name']
            if labels and base_name.upper() in labels[0].upper():
                variant_name = labels[0]
            else:
                # Label missing or not containing the product name: use the
                # product name followed by the promo text instead.
                promo = ''.join(
                    selector.select(
                        '//div[@class="misc-text-promo"]/text()').extract())
                variant_name = base_name + ' ' + promo.strip()

            variant_price = inc_vat_price(
                '//div[@id="variant-info-' + variant_id +
                '"]/div[@class="price-wrap"]')
            variant_item['price'] = (
                variant_price / Decimal('1.2')).quantize(Decimal('1.00'))
            variant_item['name'] = variant_name.strip()
            variant_item['identifier'] += '-' + variant_id
            yield variant_item
예제 #24
0
 def _start_requests(self):
     """Yield one request for a fixed product URL, handled by parse_product."""
     product_url = 'http://www.comtech.de/Computer-und-Zubehoer/Eingabegeraete/Maeuse/Logitech-Performance-Maus-MX'
     request_meta = {'product': Product()}
     yield Request(product_url, meta=request_meta,
                   callback=self.parse_product)
예제 #25
0
    def parse_product(self, response):
        """Scrape a product page and yield one item per purchasable option.

        Three page layouts are handled:

        * bundle products (``new Product.Bundle(...)`` JSON plus selectable
          bundle options): one item per combination of available selections,
          with the selection prices added to the base price;
        * configurable products (``spConfig`` JSON): one item per child
          product, priced from ``childProducts[...]['finalPrice']``;
        * anything else: the single base item.

        If a bundle page has no selection matching the available options,
        the base item is yielded instead of raising ``KeyError``.
        """
        hxs = HtmlXPathSelector(response=response)

        name = hxs.select('//h1/span[@itemprop="name"]/text()').extract()[0]

        # Special price first, regular price as fallback; all whitespace is
        # stripped out before parsing.
        price = ''.join(''.join(
            hxs.select(
                '//form//p[@class="special-price"]//span[@class="price"]/text()'
            ).extract()).split())
        if not price:
            price = ''.join(''.join(
                hxs.select(
                    '//span[@class="regular-price"]//span[@class="price"]/text()'
                ).extract()).split())
        price = extract_price(price)

        brand = ''
        categories = hxs.select(
            '//div[@itemprop="breadcrumb"]/a/text()').extract()[1:]

        loader = ProductLoader(item=Product(), response=response)

        image_url = hxs.select(
            '//ul[@id="product-img-main"]//img/@src').extract()
        image_url = image_url[0] if image_url else ''

        loader.add_value('image_url', image_url)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('brand', brand)
        loader.add_value('category', categories)
        sku = hxs.select('//span[@itemprop="sku"]/text()').extract()
        sku = sku[0] if sku else ''
        loader.add_value('sku', sku)

        identifier = hxs.select('//input[@name="product"]/@value').extract()
        loader.add_value('identifier', identifier[0])

        item = loader.load_item()

        # NOTE(review): the "//span" inside the predicate is an absolute
        # path, so it matches a "Was" span anywhere in the document rather
        # than inside the price group; ".//span" may have been intended.
        promotions = hxs.select(
            '//div[@class="bb-price-group" and //span[contains(text(), "Was")]]//span/text()'
        ).extract()

        metadata = MetaData()
        metadata['Promotions'] = ' '.join(promotions)
        item['metadata'] = metadata

        available_options = hxs.select(
            '//select[contains(@name, "bundle_option")]/option[not(@value="")]/@value'
        ).extract()
        if not available_options:
            available_options = hxs.select(
                '//input[contains(@id, "bundle-option") and not(@value="0" or @value="1")]/@value'
            ).extract()

        options_bundle = re.search(r'new Product.Bundle\((.*)\)',
                                   response.body)
        if options_bundle and available_options:
            log.msg('OPTION BUNDLE: ' + response.url)
            available = set(available_options)
            product_data = json.loads(options_bundle.groups()[0])

            # One list of (id, label, price) tuples per bundle option, keeping
            # only the selections actually offered on the page.
            option_groups = []
            for bundle_option in product_data['options'].itervalues():
                group = []
                for option_id, option in bundle_option['selections'].iteritems():
                    if option_id not in available:
                        continue

                    option_name = hxs.select('//option[@value="' + option_id +
                                             '"]/text()').extract()
                    if not option_name:
                        option_name = hxs.select(
                            '//li[input[@value="' + option_id +
                            '"]]//label/text()').extract()
                    # Keep only the label text before the first
                    # non-breaking space.
                    option_name = option_name[0].split(u'\xa0')[0].strip()

                    group.append(
                        (option_id, option_name, option['priceInclTax']))
                if group:
                    option_groups.append(group)

            if not option_groups:
                # Nothing selectable survived the filter; fall back to the
                # base item rather than failing on an empty combination.
                yield item
                return

            for combo in itertools.product(*option_groups):
                option_item = deepcopy(item)
                option_item['identifier'] += ''.join(
                    '-' + opt_id for opt_id, _, _ in combo)
                option_item['name'] += ''.join(
                    ' ' + opt_name for _, opt_name, _ in combo)
                extra = sum(opt_price for _, _, opt_price in combo)
                option_item['price'] += extract_price(str(extra))

                yield option_item
        else:
            options_config = re.search(
                r'var spConfig = new Product.Config\((.*)\)', response.body)
            if not options_config:
                yield item
                return

            product_data = json.loads(options_config.groups()[0])
            # child product id -> " - label1 - label2 ..." name suffix
            products = {}
            for attr in product_data['attributes'].itervalues():
                for option in attr['options']:
                    for product in option['products']:
                        products[product] = ' - '.join(
                            (products.get(product, ''), option['label']))

            for option_identifier, option_name in products.iteritems():
                option_item = deepcopy(item)

                option_item['identifier'] += '-' + option_identifier
                option_item['name'] += option_name
                option_item['price'] = extract_price(
                    product_data['childProducts'][option_identifier]
                    ['finalPrice'])

                yield option_item
예제 #26
0
    def parse_products(self, response):
        """Yield tyre products parsed from a JSON listing response.

        The response body is a JSON object whose ``d`` field is itself a
        JSON-encoded list of product records.  Winter tyres and products
        rejected by ``is_product_correct`` are skipped.  Each yielded product
        carries a ``MicheldeverMeta`` metadata object describing the tyre.
        """
        json_data = json.loads(response.body)
        products = json.loads(json_data.get('d'))

        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)

            # A missing or null manufacturer record means "no brand".
            try:
                brand = product_el[u'ProductManufacturer'][
                    u'TyreManufacturerName']
            except (KeyError, TypeError):
                brand = ''

            # skip winter tyres
            if product_el[u'ProductAttributes'][u'IsWinter']:
                continue

            # Use the spelling from self.brands on a case-insensitive match.
            for tyre_brand in self.brands:
                if tyre_brand.upper() == brand.strip().upper():
                    brand = tyre_brand

            # A missing or null tread pattern record means "no name".
            try:
                full_name = product_el[u'ProductTreadPattern'][u'TreadName']
            except (KeyError, TypeError):
                full_name = ''
            # Fix name changes
            if full_name in self.new_old_names:
                full_name = self.new_old_names[full_name]

            loader.add_value('name', full_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_el.get('TyreID')
            loader.add_value('url', 'http://www.tyresonthedrive.com')
            image_url = 'http://www.tyresonthedrive.com/img/treads/' + product_el[
                u'ProductTreadPattern'][u'TreadPatternImage'] + '.jpg'
            loader.add_value('image_url', image_url)
            loader.add_value('identifier', identifier)

            price = product_el[u'CheapestPriceTwoDay'][u'OneTyrePriceIncVat']
            if not price:
                # No price is recorded as out of stock.
                loader.add_value('stock', 0)
            loader.add_value('price', price)

            attributes = product_el[u'ProductAttributes']
            metadata = MicheldeverMeta()

            metadata['aspect_ratio'] = str(attributes[u'Profile'])
            metadata['rim'] = str(attributes[u'Rim'])
            metadata['speed_rating'] = str(attributes[u'Speed'])
            metadata['load_rating'] = str(attributes[u'Load'])
            metadata['width'] = str(attributes[u'Section'])
            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if attributes[u'IsExLoad'] else 'No'
            metadata['run_flat'] = 'Yes' if attributes[u'IsRunFlat'] else 'No'

            man_mark = attributes[u'OEMFitment']
            metadata['manufacturer_mark'] = find_man_mark(
                man_mark) if man_mark else ''

            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
                 metadata['load_rating'], metadata['speed_rating']))
            product = loader.load_item()
            product['metadata'] = metadata

            if not is_product_correct(product):
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            # Alternative speed rating: the computed alternative if there is
            # one, else the original rating when it differs from the new
            # rating, else empty.
            original_speed_rating = product['metadata']['speed_rating']
            if new_alt_speed:
                alternative = new_alt_speed
            elif original_speed_rating != new_speed_rating:
                alternative = original_speed_rating
            else:
                alternative = ''
            product['metadata']['alternative_speed_rating'] = alternative
            product['metadata']['speed_rating'] = new_speed_rating

            yield product
예제 #27
0
    def parse_products(self, response):
        hxs = HtmlXPathSelector(response)

        product_data = response.meta['product_data']
        width = product_data['Width']
        aspect_ratio = product_data['Aspect Ratio']
        rim = product_data['Rim']
        speed_rating = product_data['Speed rating']
        alt_speed = product_data['Alt Speed']

        name_reg = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (width, rim, speed_rating.upper())
        name_reg2 = r'(.+?)\s*%s.+%s.?[\s]*([\d+ /]+)%s\s*(.*)' % (width, rim, alt_speed.upper())
        name_reg3 = r'(.+?)\s*%s.+%s.?[\s]*(.*)' % (width, rim)
        products = hxs.select('//div[@id="product-listing"]//div[@class="product"]/..')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)
            try:
                url = product_el.select('.//div[@class="title"]/a/@href')[0].extract()
            except:
                continue
            loader.add_value('url', url)
            loader.add_value('identifier',
                             product_el.select(".//span[@class='addcompare']/input/@id").extract()[0].split(":")[1])
            # loader.add_value('identifier', re.search('productId_(\d+)_', url).groups()[0])
            loader.add_xpath('price', './/span[@class="prodPirce"]/text()')
            try:
                name = product_el.select('.//div[@class="title"]/a/text()')[0].extract()
            except:
                continue
            if not re.search(r'(\(.*\))', name):
                # name = name.replace('/', '')
                m = re.search(name_reg, name)
                if not m:
                    m = name_parts = re.search(name_reg2, name)
                if not m:
                    m = name_parts = re.search(name_reg3, name)

                if m:
                    name_parts = m.groups()
                else:
                    self.log('Failed parsing ' + name)
                    self.log('URL: ' + response.url)
                    self.log('Params: ' + ", ".join(map(str, [width, rim, speed_rating.upper()])))
                    continue
            else:
                name_parts = []
                name_parts.append(name.split()[0])
                load_rating_reg = re.search(r'(\d+)%s' % speed_rating.upper(), name)
                if not load_rating_reg:
                    load_rating_reg = re.search(r'(\d+)%s' % alt_speed.upper(), name)
                if not load_rating_reg:
                    self.log('Failed parsing ' + name)
                    self.log('URL: ' + response.url)
                    self.log('Params: ' + ", ".join(map(str, [width, rim, speed_rating.upper()])))
                    continue
                name_parts.append(load_rating_reg.groups()[0])
                name_parts.append(' '.join(name.split()[1:]).split('(')[0])

            loader.add_value('name', name_parts[-1].replace('XL', '').replace('ROF', '').replace('RFT', ''))
            brand = name_parts[0]
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
            loader.add_xpath('image_url', './/a[contains(@class, "tyre")]/img/@src')
            m = MicheldeverMeta()
            m['aspect_ratio'] = aspect_ratio
            m['rim'] = rim
            m['width'] = width
            m['speed_rating'] = speed_rating.upper()
            m['load_rating'] = name_parts[1]
            if 'ROF' in name.upper() or 'RFT' in name.upper():
                m['run_flat'] = 'Yes'
            else:
                m['run_flat'] = 'No'

            if 'XL' in name.upper():
                m['xl'] = 'Yes'
            else:
                m['xl'] = 'No'

            m['full_tyre_size'] = '/'.join((m['width'],
                                            m['aspect_ratio'],
                                            m['rim'],
                                            m['load_rating'],
                                            m['speed_rating']))
                                            # m['alternative_speed_rating']))

            m['fitting_method'] = 'Fitted'
            m['manufacturer_mark'] = self._get_manufacturer_code(name_parts[-1])

            product = loader.load_item()
            product['metadata'] = m

            if not is_product_correct(product):
                self.log('The product is not correct: %r' % product)
                continue

            product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)

            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
                product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
            product['metadata']['speed_rating'] = new_speed_rating

            yield product

        next_page = hxs.select('//span[@class="nextlink"]/a/@href')
        if next_page:
            yield Request(next_page.extract()[0], callback=self.parse_products, meta=response.meta)
예제 #28
0
    def parse_product(self, response):
        """Parse a product page and yield one item per purchasable variant.

        When the page body contains ``addCombination(...)`` JavaScript calls,
        one item is yielded per call: its price is rebuilt from the
        ``productPriceTaxExcluded``, ``currencyRate``, ``taxRate``,
        ``reduction_percent`` and ``reduction_price`` JavaScript variables
        found in the body, and its name is the product name followed by the
        labels of the selected options.  Otherwise a single item is yielded
        using the price displayed on the page.

        Nothing is yielded when the ``primary_block`` element is missing.

        :param response: the downloaded product page.
        :returns: generator of loaded product items.
        :raises IndexError: if the product id, the ``<h1>`` name or the short
            description text is missing from the page.
        """
        hxs = HtmlXPathSelector(response)

        product_found = hxs.select('//div[@id="primary_block"]')
        if not product_found:
            return

        product_id = hxs.select('//input[@name="id_product"]/@value').extract()[0]
        name = hxs.select('//div[@id="dfCenter"]//h1/text()').extract()[0]
        category = hxs.select('//div[@class="breadcrumb"]/a/text()').extract()[1:]
        # image_url stays an empty list when the main picture is missing.
        image_url = hxs.select('//img[@id="bigpic"]/@src').extract()
        if image_url:
            image_url = image_url[0]
        product_url = response.url

        # The brand is guessed from the first text node of the short
        # description: ' di ' and ' by ' are normalised to ' da ' and the text
        # following the last ' da ' is kept.
        product_brand = hxs.select('//div[@id="short_description_content"]//p[1]//text()').extract()[0]
        product_brand = product_brand.replace(' di ', ' da ')
        product_brand = product_brand.replace(' by ', ' da ')
        if len(product_brand) > 20:
            # Long sentence: keep only the ' da ...' clause up to the first
            # comma or full stop, when there is one.
            brand_clause = re.search(' da.+?[,.]', product_brand)
            if brand_clause:
                product_brand = brand_clause.group(0)
        product_brand = product_brand.split(' da ')[-1]
        product_brand = product_brand.strip().strip('.,')
        if len(product_brand) > 20:
            # Still too long: keep the longest run shared with the page title.
            title = hxs.select('//title/text()').extract()[0]
            s = SequenceMatcher(a=product_brand.title(), b=title.title())
            m = s.find_longest_match(0, len(s.a), 1, len(s.b))
            product_brand = s.a[m[0]:m[0]+m[-1]].strip()
        # Candidates shorter than 7 characters or made of a single word are
        # discarded.
        if len(product_brand) < 7 or ' ' not in product_brand:
            product_brand = None

        def js_number(var_name, default):
            """Return the first number following ``var <var_name>`` in the body, or *default*."""
            found = re.search(r'var %s\D+([\d\.]+)' % var_name, response.body)
            if found:
                return Decimal(found.group(1))
            return default

        currencyRate = js_number('currencyRate', 1)
        taxRate = js_number('taxRate', 0)
        reduction_percent = js_number('reduction_percent', 0)
        reduction_price = js_number('reduction_price', 0)
        productPriceTaxExcluded = js_number('productPriceTaxExcluded', 0)

        # None when the page does not declare a default image id.
        idDefaultImage = re.search(r'var idDefaultImage = (\d+)', response.body)
        if idDefaultImage:
            idDefaultImage = idDefaultImage.group(1)

        if re.search('addCombination.*?;', response.body):
            # here we parse option tags for more product options.
            option_value_xpath = '//div[@id="attributes"]//select/option/@value'
            option_values = hxs.select(option_value_xpath).extract()
            option_text_xpath = '//div[@id="attributes"]//select/option//text()'
            option_labels = hxs.select(option_text_xpath).extract()

            # build the lookup table (option value -> visible label); zip()
            # tolerates a shorter label list instead of raising IndexError.
            options = dict(zip(option_values, option_labels))

            # addCombination(5631, new Array('259'), 11, 109.99, 0, -1, 'GGT3050', 0.00, 1);
            for x in re.finditer(r'addCombination\((.*?)\);', response.body):
                s = x.group(0).split(',')
                offset = Decimal(s[-6])

                # determining place of options keys
                option_key_start = 1
                option_key_end = len(s) - 7

                # parsing option keys; an unknown key contributes no label
                # (it must not reuse the label of a previous key).
                option_texts = []
                for i in range(option_key_start, option_key_end):
                    opt = re.sub(r'[^\d]+', '', s[i])
                    option_text = options.get(opt, '')
                    if len(option_text) > 0:
                        option_texts.append(option_text.strip())

                # base price + converted offset, then tax, then reductions.
                price = productPriceTaxExcluded + offset * currencyRate
                tax = (taxRate / Decimal('100')) + 1
                price = price * tax
                reduction = Decimal('0')
                if reduction_price or reduction_percent:
                    reduction = price * (reduction_percent / Decimal('100')) + reduction_price
                    price = price - reduction
                price = round(price, 2)
                loader = ProductLoader(response=response, item=Product())
                loader.add_value('url', product_url)
                loader.add_value('name', name + ' ' + ' '.join(option_texts))

                # Swap the default image id in the URL for the combination's
                # own image id; this is only possible when both ids are known.
                image_id = s[-4].strip(" '")
                if image_url and idDefaultImage and image_id != "-1" and image_id != idDefaultImage:
                    loader.add_value('image_url', image_url.replace('-' + idDefaultImage + '-', '-' + image_id + '-'))
                else:
                    loader.add_value('image_url', image_url)

                loader.add_value('brand', product_brand)
                loader.add_value('price', price)
                loader.add_value('category', category)
                # identifier = product id + first number of the call
                # (the first addCombination argument).
                loader.add_value('identifier', '%s-%s' % (product_id, re.search(r'(\d+)', s[0]).group(1)))
                loader.add_value('sku', s[-3].strip("' ").decode('utf8'))

                yield loader.load_item()
        else:
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('url', product_url)
            loader.add_value('name', name)
            loader.add_value('image_url', image_url)
            loader.add_xpath('price', '//*[@id="our_price_display"]/text()', lambda x: extract_price_eu(x[0]) if x else Decimal('0'))
            loader.add_value('category', category)
            loader.add_value('identifier', product_id)
            loader.add_xpath('sku', '//*[@id="product_reference"]/span/text()')
            loader.add_value('brand', product_brand)

            yield loader.load_item()
예제 #29
0
 def _start_requests(self):
     """Yield a single request for one fixed product page, handled by ``parse_product``."""
     product_page = 'http://www.banneke.com/Whisky/Whiskey/International/Amrut_Malt_Whisky_aus_Indien_46_0.70'
     yield Request(
         product_page,
         callback=self.parse_product,
         meta={'product': Product()},
     )
예제 #30
0
    def parse_product(self, response):
        """Parse a product page, following listing links or retrying as needed.

        If the response contains product-listing links, a request is yielded
        for each of them (handled by this same callback) and nothing else.
        If no price can be found, the page is re-requested while the ``tries``
        counter in ``response.meta`` is below 3; after that the page is
        given up on.  Otherwise a single product item is yielded.
        """
        hxs = HtmlXPathSelector(response)

        # A listing page: fan out to every linked product and stop.
        listing_links = hxs.select(
            '//div[contains(@class,"product-listing")]//h3/a/@href').extract()
        if not listing_links:
            listing_links = hxs.select(
                '//div[contains(@class,"listing-product")]//h3/a/@href'
            ).extract()
        if listing_links:
            for link in listing_links:
                target = urljoin(get_base_url(response), link)
                yield Request(target, callback=self.parse_product)
            return

        price_texts = hxs.select(
            '//span[@class="now"]/span[@itemprop="price"]/text()').extract()
        if not price_texts:
            price_texts = hxs.select(
                '//div[@class="product-price"]//span[@itemprop="price"]/text()'
            ).extract()

        if not price_texts:
            attempts = response.meta.get('tries', 0)
            if attempts < 3:
                self.log("Try: %s. Retrying page: %s" %
                         (attempts + 1, response.url))
                retry_meta = {
                    'handle_httpstatus_list': [404],
                    'tries': attempts + 1
                }
                # dont_filter so the duplicate filter does not drop the retry.
                yield Request(response.url,
                              callback=self.parse_product,
                              dont_filter=True,
                              meta=retry_meta)
                return
            self.log('Gave up trying: %s' % response.url)
            self.log('No price found on page: %s' % response.url)
            return

        price = price_texts[0]

        manu_xpath = 'substring(//h2[@id="manu"]/@content, 5)'
        mpn_xpath = '//strong[@itemprop="mpn"]/text()'

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_xpath('identifier', manu_xpath)
        loader.add_xpath('identifier', mpn_xpath)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1/text()')
        loader.add_value('price', extract_price(price))
        loader.add_xpath('sku', manu_xpath)
        loader.add_xpath('sku', mpn_xpath)
        loader.add_xpath(
            'category', '//div[contains(@class, "breadcrumb")]//a/span/text()')

        image_srcs = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_srcs:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_srcs[0]))

        loader.add_xpath(
            'brand',
            '//div[@itemprop="brand"]/meta[@itemprop="name"]/@content')

        # Loaded prices below 50 carry a 2.99 shipping cost, others ship free.
        shipping_cost = '2.99' if loader.get_output_value('price') < 50 else 0
        loader.add_value('shipping_cost', shipping_cost)

        loader.add_xpath('stock', '//span[@itemprop="quantity"]/text()')

        yield loader.load_item()
예제 #31
0
    def parse(self, response):
        """Yield the product on the page whose SKU matches the requested one.

        The requested SKU (``sku``), an alternate SKU (``notes``) and the
        product name are read from ``response.meta``.  The page's main
        product is checked first.  If it does not match, the sub-products of
        the main container are scanned or, only when that container has none,
        those of the secondary container.  At most one item is yielded; it
        carries the page URL, the name and SKU from the meta, and the price
        text scraped from the matching element.
        """
        hxs = HtmlXPathSelector(response)

        url = response.url
        sku = response.meta['sku']
        sec_sku = response.meta['notes']
        name = response.meta['name'].encode('ascii', 'ignore')

        main_product = hxs.select("//div[@id='Product-MainProduct']")
        main_products = hxs.select("//div[@id='Product-MainProductContainer']//div[@class='Product-SubProduct']")
        secondary_products = hxs.select("//div[@id='Product-SubProductContainer']//div[@class='Product-SubProduct']")

        def is_wanted(candidate):
            # True when the scraped SKU equals the primary or alternate SKU.
            return candidate == sku or candidate == sec_sku

        def build_item(price_text):
            # Every branch produces the same item; only the price differs.
            loader = ProductLoader(item=Product(), response=response, selector=hxs)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price_text)

            loader.add_value('sku', sku)
            return loader.load_item()

        # When the SKU is missing this stays an empty list, which never
        # matches a string SKU below.
        main_sku = main_product.select("div[@id='Product-lblItem']/span[@id='lblItem']/text()").extract()
        if main_sku:
            main_sku = main_sku[0]
        else:
            logging.error("NO MAIN SKU! %s" % url)

        if is_wanted(main_sku):
            # extract main product
            found = main_product.select(".//div[@class='Product-Price']/span[@id='lblClubPrice']/b/font/text()").re("\$(.*)")
            if not found:
                logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' % (sku, name, url))
                return
            yield build_item(found[0].strip())
            return

        # Pick which sub-product list to scan, together with the XPaths and
        # the log message that belong to it.
        sku_xpath = missing_sku_msg = price_xpath = None
        if main_products:
            variants = main_products
            sku_xpath = "div[@class='Product-SubProductNumber']/font/text()"
            missing_sku_msg = "NO MAIN SKU! %s"
            price_xpath = ".//span[contains(@id, 'lblClubPrice')]/b/font/text()"
        elif secondary_products:
            variants = secondary_products
            sku_xpath = "div[@class='Product-SubProductNumber']/text()"
            missing_sku_msg = "NO SECONDARY SKU! %s"
            price_xpath = ".//span[contains(@id, 'lblClubPrice2')]/b/font/text()"
        else:
            logging.error("No products found!")
            variants = []

        for variant in variants:
            variant_sku = variant.select(sku_xpath).re("#(.+)")
            if variant_sku:
                variant_sku = variant_sku[0]
            else:
                logging.error(missing_sku_msg % url)

            if not is_wanted(variant_sku):
                continue

            # extract the matching sub-product
            found = variant.select(price_xpath).re("\$(.*)")
            if not found:
                logging.error('ERROR!! NO SEC PRICE!! %s "%s" "%s"' % (sku, name, url))
                return
            yield build_item(found[0].strip())
            return