Exemplo n.º 1
0
    def parse_product(self, response):
        """Yield the product on this page, one item per combination.

        The base item is built by ``self.parse_product_base``.  If a script on
        the page contains ``var combinations={...};`` the JSON object is
        decoded and one item is yielded per entry, with identifier, sku,
        stock, price, name and image overridden from the combination data.
        Without that object the base item is yielded unchanged.
        """
        prod = self.parse_product_base(response)

        # Id of the default picture; used below to derive per-combination
        # image URLs from the base item's image URL.
        default_image = re.search(r'var idDefaultImage=(\d+)', response.body)
        default_image_id = default_image.group(1) if default_image else None

        data = response.xpath('//script/text()').re_first('var combinations=({.+?});')
        if not data:
            yield prod
            return

        combinations = json.loads(data)
        for identifier, combination in combinations.items():
            loader = ProductLoader(Product(), response=response)
            # Start from a copy of every field of the base item.
            loader.add_value(None, prod)
            loader.replace_value('identifier', '-'.join((prod['identifier'], identifier)))
            loader.replace_value('sku', combination['reference'])
            loader.replace_value('stock', combination['quantity'])

            option_price = Decimal(combination['price'])
            if option_price != 0:
                # NOTE(review): the 1.2 factor presumably adds 20% VAT - confirm.
                price = (option_price * Decimal('1.2')).quantize(Decimal('0.01'))
                loader.replace_value('price', price)

            attr_values = combination['attributes_values']
            for attr in sorted(attr_values):
                loader.add_value('name', attr_values[attr])

            # Swap the default image id for the combination's own image id.
            # The original computed this URL but never stored it, and crashed
            # when no default image id was present on the page.
            base_image = prod.get('image_url')
            image_id = str(combination['id_image'])
            # NOTE(review): '-1' is treated as "no combination image", which
            # is presumably the shop's convention - confirm.
            if base_image and default_image_id and image_id != '-1':
                loader.replace_value(
                    'image_url', base_image.replace(default_image_id, image_id))

            yield loader.load_item()
            
Exemplo n.º 2
0
    def string_to_decimal(self, price):
        """Extract a euro amount from a price label.

        Recognised forms are an optional ``Was`` prefix followed by either a
        euro figure (euro sign then ``1.50`` or ``2``) or a cent figure
        (``90c``).  Cent figures are converted to euros by dividing by 100.

        :param price: label text, may be empty or ``None``.
        :returns: whatever ``self.decimal_to_float`` returns for the parsed
            ``Decimal``; it receives ``None`` when *price* is empty or no
            pattern matches.
        """
        # Tried in order; the first pattern that matches wins.
        # \u20ac is the euro sign.
        patterns = (
            u"Was\\s\u20ac(\\d+\\.\\d*)",
            u"Was\\s(\\d+c)",
            u"Was\\s\u20ac(\\d+)",
            u"^\u20ac(\\d+\\.\\d*)",
            u"^\u20ac(\\d+)",
            u"^(\\d+c)",
        )

        amount = None
        if price:
            match = None
            for pattern in patterns:
                match = re.search(pattern, price, re.I)
                if match:
                    break

            if match:
                text = match.group(1)
                if text[-1:].lower() == u"c":
                    # Cents, e.g. "90c" -> 0.90 and "5c" -> 0.05.  Building
                    # the string "0.<n>" was only right for two-digit values.
                    amount = Decimal(text[:-1]) / 100
                else:
                    amount = Decimal(text)

        return self.decimal_to_float(amount)
Exemplo n.º 3
0
    def parse_product(self, response):
        """Parse a product page and yield its items.

        For ``unlimited-telecom.fr`` pages that contain ``addCombination(...)``
        JavaScript calls, one item is yielded per call, with the option names
        appended to the product name and the price computed from the page's
        JavaScript variables.  For other ``.fr`` pages and for
        ``unlimited-telecom.com`` pages the item built by
        ``self.parse_product_base`` is yielded when it is truthy.  Any other
        URL is logged as an error and yields nothing.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        breadcrumb = hxs.select(
            '//div[@class="breadcrumb"]/a[last()]/text()').extract()
        category = breadcrumb.pop().strip() if breadcrumb else 'No category'

        name = hxs.select(
            '//div[@id="primary_block"]/h1//text()').extract().pop().strip()
        image = hxs.select('//div[@id="image-block"]/img/@src').extract().pop()
        product_url = urljoin_rfc(base_url, response.url)
        image_url = urljoin_rfc(base_url, image)

        currency_match = re.search(r'var currencyRate\D+([\d\.]+)', response.body)
        currencyRate = Decimal(currency_match.group(1)) if currency_match else 1

        base_price_match = re.search(
            r"var productPriceTaxExcluded\D+([\d\.]+)", response.body)
        productPriceTaxExcluded = (
            Decimal(base_price_match.group(1)) if base_price_match else 0)

        default_image_match = re.search(
            r'var idDefaultImage = (\d+)', response.body)
        idDefaultImage = (
            default_image_match.group(1) if default_image_match else None)

        is_fr = "unlimited-telecom.fr" in response.url
        is_com = "unlimited-telecom.com" in response.url
        if not is_fr and not is_com:
            self.log("ERROR unknown url: " + response.url)
            return

        # .com pages, and .fr pages without combinations, yield the base item.
        if not is_fr or not re.search('addCombination.*?;', response.body):
            prod = self.parse_product_base(response, hxs)
            if prod:
                yield prod
            return

        # Lookup table from <option> value to its visible text.  zip() pairs
        # the two lists without raising if their lengths differ.
        option_values = hxs.select(
            '//div[@id="attributes"]//select/option/@value').extract()
        option_labels = hxs.select(
            '//div[@id="attributes"]//select/option//text()').extract()
        options = dict(zip(option_values, option_labels))

        for combination in re.finditer('addCombination.*?;', response.body):
            s = combination.group(0).split(',')
            offset = Decimal(s[-6])

            # Fields 1 .. len(s) - 8 hold the option keys; the trailing seven
            # fields are fixed-position values read by negative index below.
            option_names = []
            opt = ''
            for i in range(1, len(s) - 7):
                opt = re.sub(r'[^\d]+', '', s[i])
                # Unknown keys are skipped.  The original reused the text of
                # the previous key (or hit an unbound name) in that case.
                option_text = options.get(opt, '')
                if option_text:
                    option_names.append(option_text.strip())

            price = productPriceTaxExcluded + offset * currencyRate
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('url', product_url)
            loader.add_value('name', name + ' ' + ' '.join(option_names))

            image_id = s[-4].strip(" '")
            # Only rewrite the URL when both ids are usable.
            # NOTE(review): '-1' is presumably "no combination image" - confirm.
            if idDefaultImage and image_id != "-1" and image_id != idDefaultImage:
                loader.add_value(
                    'image_url',
                    image_url.replace('-' + idDefaultImage + '-',
                                      '-' + image_id + '-'))
            else:
                loader.add_value('image_url', image_url)

            loader.add_value('price', price)
            loader.add_value('category', category)
            sku = s[-3].strip("' ")
            # The identifier uses the last option key parsed for this call.
            loader.add_value('identifier', '%s_%s' % (sku, opt))
            loader.add_value('sku', sku)
            yield loader.load_item()
Exemplo n.º 4
0
    def parse_product(self, response):
        """Yield one item per product, or per priced option, on the page.

        Each ``<table>`` containing a ``td[@colspan="2"]`` with an ``<h2>`` is
        treated as a product.  A general price is read from the
        ``span.actlarge`` texts: with several texts the second is used as is,
        with a single text it is multiplied by 1.2.  If that yields nothing,
        a "Price inc UK Mainland Carriage" figure is looked up instead.

        Products with ``<option>`` elements and no general price yield one
        item per option whose text contains a price.  All other products
        yield a single item, or nothing when no price was found.  Non-HTML
        responses yield nothing.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        # (pattern, multiplier) pairs tried in order against an option label.
        # NOTE(review): the 1.2 multiplier presumably adds VAT to prices not
        # marked as including it - confirm.
        option_price_patterns = (
            (u'\xa3([\d\.,]+)inc vat', Decimal("1")),
            (u'\xa3([\d\.,]+)inc', Decimal("1")),
            (u'\(\xa3([\d\.,]+)\)?', Decimal("1")),
            (u'\xa3([\d\.,]+)', Decimal("1.2")),
        )

        products = hxs.select(
            u'//table[child::tr[child::td[@colspan="2" and child::h2]]]')
        for product in products:
            multiple_options = product.select(u'.//select/option')

            price_texts = product.select(
                u'.//span[@class="actlarge"]/text()').extract()
            if price_texts:
                if len(price_texts) > 1:
                    multiplier = Decimal("1")
                    price_text = price_texts[1]
                else:
                    multiplier = Decimal("1.2")
                    price_text = price_texts[0]
                price_text = price_text.replace(u'\xa3', '').replace(",", "")
                general_price = Decimal(price_text) * multiplier
            else:
                general_price = None

            if not general_price:
                carriage_prices = product.select(u'.//*/text()').re(
                    u'Price inc UK Mainland Carriage.*?\:.*?\xa3([\d\.,]*)')
                general_price = carriage_prices[0] if carriage_prices else None
                # Log only when such a price was actually found.
                if general_price:
                    log.msg(u'Product with: Price inc UK Mainland Carriage')

            if multiple_options and general_price:
                options_text = u' '.join(
                    product.select(u'.//select/option/text()').extract())
                if u'\xa3' in options_text:
                    log.msg(
                        u'Product with both option and general price: [%s]' %
                        response.url)

            name = product.select(u'.//h2/text()')[0].extract().strip()

            if multiple_options and not general_price:
                # Counter for synthetic sku suffixes of options whose text
                # yields no usable code.
                idx = 1
                for option in multiple_options:
                    option_texts = option.select(u'./text()').extract()
                    if not option_texts:
                        # An <option> without text carries no price.
                        continue
                    option_text = option_texts[0]

                    price = None
                    for pattern, multiplier in option_price_patterns:
                        found = re.search(pattern, option_text, re.I)
                        if found:
                            price = Decimal(
                                found.group(1).replace(",", "")) * multiplier
                            break
                    if price is None:
                        continue

                    loader = ProductLoader(item=Product(), selector=product)
                    loader.add_value('name',
                                     name + u' %s' % option_text.strip())
                    loader.add_value('url', response.url)
                    loader.add_value('price', price)
                    m = re.search(r'\(Ref:\s*([^\)]+)\)', name, re.I)
                    if m:
                        optsku = option_text.strip().lower().replace(
                            'code', '').strip('-. ').split('-')[0]
                        if optsku:
                            loader.add_value('sku', m.group(1) + optsku)
                        else:
                            loader.add_value('sku',
                                             m.group(1) + ".inc" + str(idx))
                            idx += 1
                        loader.add_value('identifier',
                                         loader.get_output_value('sku'))
                    yield loader.load_item()
            else:
                if not general_price:
                    # Nothing to report for a product without any price.
                    continue
                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('url', response.url)
                loader.add_value('name', name)
                loader.add_value('price', general_price)
                m = re.search(r'\(Ref:\s*([^\)]+)\)',
                              loader.get_output_value('name'), re.I)
                if m:
                    loader.add_value('sku', m.group(1))
                    loader.add_value('identifier',
                                     loader.get_output_value('sku'))
                yield loader.load_item()
Exemplo n.º 5
0
    def parse_product(self, response):
        """Yield the product on this page, expanded over its option selects.

        A base item is loaded from the page (name, price, stock, identifier,
        sku, image, url, categories, brand).  If the page has option
        ``<select>`` boxes, other than those whose options mention V.A.T.,
        VAT or Delivery, one item is yielded per combination of their
        options, with the option names appended to the name, the option
        surcharges added to the price and the sorted option ids appended to
        the identifier.  Otherwise the base item is yielded as is.  Pages
        without an ``<h1>`` yield nothing.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        name = hxs.select('//h1/text()').extract()
        if not name:
            return
        name = name[0]

        identifier = hxs.select(
            '//input[@name="product_id"]/@value').extract()[0]
        price = hxs.select(
            '//div[@class="price"]/div[@id="myoc-lpu"]/text()').extract()
        if price:
            price = extract_price2uk(price[0])
            stock = 1
        else:
            # No displayed price: report a zero price and zero stock.
            price = Decimal(0)
            stock = 0

        loader = ProductLoader(selector=hxs, item=Product())
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('stock', stock)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_xpath('image_url', '//a[@class="thumbnail"]/img/@src')
        loader.add_value('url', response.url)
        loader.add_value('shipping_cost', 0)
        # The last breadcrumb entry is skipped.
        for category in hxs.select(
                '//ul[@class="breadcrumb"]/li/a/text()')[:-1].extract():
            loader.add_value('category', category)
        loader.add_xpath('brand', '//li[contains(text(), "Brand")]/a/text()')
        product = loader.load_item()

        option_boxes = hxs.select(
            '//select[@class="form-control" and contains(@id, "option")\
                        and not(contains(./option/., "V.A.T."))\
                        and not(contains(./option/., "VAT"))\
                        and not(contains(./option/., "Delivery"))]')

        # option id -> {'name': cleaned label, 'price': surcharge}
        options_dict = dict()
        # One list of option ids per select box.
        option_groups = []
        for option_box in option_boxes:
            option_group = []
            for option in option_box.select(
                    './option[@value!="" and not(contains(.,"VAT Exempt"))]'):
                option_id = option.select('./@value')[0].extract()
                option_name = option.select('./text()')[0].extract()
                surcharge = re.search(u'\(\+\xa3(.*)\)', option_name)
                option_price = Decimal(
                    surcharge.group(1)) if surcharge else Decimal('0.00')

                option_name = re.sub('VAT Payable ?-? ?', '', option_name)
                option_name = re.sub(u'\(\+\xa3(.*)\)', '',
                                     option_name).strip()
                options_dict[option_id] = {
                    'name': option_name,
                    'price': option_price
                }
                option_group.append(option_id)
            # An empty group would make the cartesian product below empty and
            # silently drop the whole product, so it is left out.
            if option_group:
                option_groups.append(option_group)

        if not option_groups:
            yield product
            return

        for combination in itertools.product(*option_groups):
            option_name = ' '.join(
                options_dict[option_id]['name'] for option_id in combination)
            option_price = sum(
                options_dict[option_id]['price'] for option_id in combination)
            option_identifier = '-'.join(sorted(combination))

            # Yield a fresh copy per variant.  Mutating and re-yielding one
            # item would make every reference held downstream show the last
            # variant's values.
            variant = Product(product)
            variant['identifier'] = '-'.join((identifier, option_identifier))
            variant['price'] = price + option_price
            variant['name'] = fix_spaces(' '.join((name, option_name)))
            yield variant
Exemplo n.º 6
0
    def parse_product(self, response):
        """Parse a product page and yield one item per combination.

        Pages without ``div#primary_block`` yield nothing.  When the page
        contains ``addCombination(...)`` JavaScript calls, one item is yielded
        per call: its price is the tax-excluded base price plus the
        combination offset times the currency rate, with tax applied and any
        reduction subtracted, rounded to two places.  Otherwise a single item
        is yielded with the price taken from ``#our_price_display``.

        The brand is guessed from the first text node of the short
        description and dropped when the guess looks implausible.
        """
        hxs = HtmlXPathSelector(response)

        if not hxs.select('//div[@id="primary_block"]'):
            return

        product_id = hxs.select('//input[@name="id_product"]/@value').extract()[0]
        name = hxs.select('//div[@id="dfCenter"]//h1/text()').extract()[0]
        # The first breadcrumb link is skipped.
        category = hxs.select('//div[@class="breadcrumb"]/a/text()').extract()[1:]
        image_url = hxs.select('//img[@id="bigpic"]/@src').extract()
        if image_url:
            image_url = image_url[0]
        product_url = response.url

        # Brand heuristic: normalise " di " / " by " to " da " and keep what
        # follows the last " da ".
        brand_texts = hxs.select(
            '//div[@id="short_description_content"]//p[1]//text()').extract()
        # A missing short description simply means no brand.
        product_brand = brand_texts[0] if brand_texts else ''
        product_brand = product_brand.replace(' di ', ' da ')
        product_brand = product_brand.replace(' by ', ' da ')
        if len(product_brand) > 20:
            brand_match = re.search(' da.+?[,.]', product_brand)
            if brand_match:
                product_brand = brand_match.group(0)
        product_brand = product_brand.split(' da ')[-1]
        product_brand = product_brand.strip().strip('.,')
        if len(product_brand) > 20:
            # Still long: keep the longest run it shares with the page title.
            title = hxs.select('//title/text()').extract()[0]
            s = SequenceMatcher(a=product_brand.title(), b=title.title())
            m = s.find_longest_match(0, len(s.a), 1, len(s.b))
            product_brand = s.a[m[0]:m[0] + m[-1]].strip()
        if len(product_brand) < 7 or ' ' not in product_brand:
            product_brand = None

        def js_number(var_name, default):
            # Value of a numeric JavaScript variable in the page source, or
            # *default* when the variable is absent.
            found = re.search('var %s\\D+([\\d\\.]+)' % var_name, response.body)
            return Decimal(found.group(1)) if found else default

        currencyRate = js_number('currencyRate', 1)
        taxRate = js_number('taxRate', 0)
        reduction_percent = js_number('reduction_percent', 0)
        reduction_price = js_number('reduction_price', 0)
        productPriceTaxExcluded = js_number('productPriceTaxExcluded', 0)

        default_image_match = re.search(
            r'var idDefaultImage = (\d+)', response.body)
        idDefaultImage = (
            default_image_match.group(1) if default_image_match else None)

        if not re.search('addCombination.*?;', response.body):
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('url', product_url)
            loader.add_value('name', name)
            loader.add_value('image_url', image_url)
            loader.add_xpath('price', '//*[@id="our_price_display"]/text()', lambda x: extract_price_eu(x[0]) if x else Decimal('0'))
            loader.add_value('category', category)
            loader.add_value('identifier', product_id)
            loader.add_xpath('sku', '//*[@id="product_reference"]/span/text()')
            loader.add_value('brand', product_brand)
            yield loader.load_item()
            return

        # Lookup table from <option> value to its visible text.  zip() pairs
        # the two lists without raising if their lengths differ.
        option_values = hxs.select(
            '//div[@id="attributes"]//select/option/@value').extract()
        option_labels = hxs.select(
            '//div[@id="attributes"]//select/option//text()').extract()
        options = dict(zip(option_values, option_labels))

        tax = (taxRate / Decimal('100')) + 1

        # addCombination(5631, new Array('259'), 11, 109.99, 0, -1, 'GGT3050', 0.00, 1);
        for combination in re.finditer('addCombination\((.*?)\);', response.body):
            s = combination.group(0).split(',')
            offset = Decimal(s[-6])

            # Fields 1 .. len(s) - 8 hold the option keys; the trailing seven
            # fields are fixed-position values read by negative index below.
            option_names = []
            for i in range(1, len(s) - 7):
                opt = re.sub(r'[^\d]+', '', s[i])
                # Unknown keys are skipped.  The original reused the text of
                # the previous key (or hit an unbound name) in that case.
                option_text = options.get(opt, '')
                if option_text:
                    option_names.append(option_text.strip())

            price = (productPriceTaxExcluded + offset * currencyRate) * tax
            if reduction_price or reduction_percent:
                reduction = price * (reduction_percent / Decimal('100')) + reduction_price
                price = price - reduction
            price = round(price, 2)

            loader = ProductLoader(response=response, item=Product())
            loader.add_value('url', product_url)
            loader.add_value('name', name + ' ' + ' '.join(option_names))

            image_id = s[-4].strip(" '")
            # Rewrite the image URL only when a default id is known and the
            # combination has an image of its own that differs from it.
            if (image_url and idDefaultImage and image_id != "-1"
                    and image_id != idDefaultImage):
                loader.add_value('image_url', image_url.replace('-' + idDefaultImage + '-', '-' + image_id + '-'))
            else:
                loader.add_value('image_url', image_url)

            loader.add_value('brand', product_brand)
            loader.add_value('price', price)
            loader.add_value('category', category)
            loader.add_value('identifier', '%s-%s' % (product_id, re.search(r'(\d+)', s[0]).group(1)))
            loader.add_value('sku', s[-3].strip("' ").decode('utf8'))

            yield loader.load_item()