예제 #1
0
def get_rue_du_commerce_prices(response):
    """Extract (price, price_old, currency) from a Rue du Commerce page.

    The current price is split into an integer text node and a
    "<currency sign><cents>" <sup> node; both parts are re-joined with
    "," before u.string_to_float converts them.  Any element of the
    returned tuple may be None when the matching node is missing.
    """
    x_price = response.xpath('//div[' + u.x_class('productBuy') + ']')

    # Crossed-out ("old") price, e.g. "1 299€": drop the trailing
    # currency sign, then remove every kind of space.
    price_old = x_price.xpath('.//div[' + u.x_class('discount-prices') +
                              ']//p[' + u.x_class('price') +
                              ']/text()').extract_first()
    if price_old is not None:
        price_old = u.string_to_float(
            price_old[:-1].encode('utf-8').strip().replace(" ", "").replace(
                " ", ""))

    # Integer part and "<currency sign><cents>" superscript.
    price = x_price.xpath('.//div[' + u.x_class('price main') +
                          ']/p/text()').extract_first()
    price_cent = x_price.xpath('.//div[' + u.x_class('price main') +
                               ']/p/sup/text()').extract_first()

    # BUG FIX: the guard used to test `price` but index `price_cent`,
    # raising a TypeError whenever the <sup> node was absent.  Guard on
    # `price_cent` itself, as get_fnac_prices does.
    currency = None
    if price_cent is not None:
        currency = u.get_currency_code(price_cent[:1].strip())

    if price is not None:
        if price_cent is not None:
            # price_cent is "<currency sign><cents>" -> skip the sign.
            price = u.string_to_float(
                (price.encode('utf-8').strip() +
                 "," + price_cent[1:].encode('utf-8').strip()).replace(
                     " ", "").replace(" ", ""))
        else:
            price = u.string_to_float(
                price.encode('utf-8').strip().replace(" ",
                                                      "").replace(" ", ""))

    return price, price_old, currency
예제 #2
0
def get_ldlc_prices(response):
    """Extract (price, price_old, currency) from an LDLC product page.

    Prices look like "1 299€95": the integer text ends with the
    currency sign and the cents sit in a <sup>.  Integer and cents are
    re-joined with "," before u.string_to_float converts them.  Any
    element of the returned tuple may be None when missing.
    """
    x_price = response.xpath('//span[' + u.x_class('blocprix') + ']')

    # Crossed-out reference price, e.g. "1 299€" -> drop trailing sign.
    price_old = x_price.xpath('.//span[' + u.x_class('refPrice') +
                              ']/text()').extract_first()
    if price_old is not None:
        # NOTE(review): the two chained .replace(" ", "") calls presumably
        # strip a normal and a non-breaking space — confirm the literals.
        price_old = u.string_to_float(
            price_old[:-1].encode('utf-8').strip().replace(" ", "").replace(
                " ", ""))

    # Integer part (ends with the currency sign) and cents superscript.
    price = x_price.xpath('.//span[' + u.x_class('price') +
                          ']/text()').extract_first()
    price_cent = x_price.xpath('.//span[' + u.x_class('price') +
                               ']/sup/text()').extract_first()

    currency = None
    if price is not None:
        # Last character of the integer text is the currency sign.
        currency = u.get_currency_code(price[-1:].strip())

    if price is not None:
        if price_cent is not None:
            # Join "1 299" + "," + "95", minus the currency sign.
            price = u.string_to_float(
                (price[:-1].encode('utf-8').strip() + "," +
                 price_cent.encode('utf-8').strip()).replace(" ", "").replace(
                     " ", ""))
        else:
            price = u.string_to_float(
                price[:-1].encode('utf-8').strip().replace(" ", "").replace(
                    " ", ""))

    return price, price_old, currency
예제 #3
0
def get_materiel_net_prices(response):
    """Extract (price, price_old, currency) from a Materiel.net page.

    Trailing " <currency>" suffixes are removed with a regex before
    numeric conversion.  Any element of the returned tuple may be None
    when the matching node is missing.
    """
    x_price = response.xpath('//div[@id="ProdInfoPrice"]')

    price_old = x_price.xpath('./div[' + u.x_class('prixReference') +
                              ']/text()').extract_first()
    if price_old is not None:
        # Raw string for the pattern: '\D' in a plain literal is a
        # deprecated unknown escape sequence.  Behavior is unchanged.
        price_old = u.string_to_float(
            re.sub(r' \D*$', '',
                   price_old.encode('utf-8').strip()).replace(" ", "").replace(
                       " ", ""))

    price = x_price.xpath('./span[' + u.x_class('hidden') +
                          ']/text()').extract_first()

    currency = None
    if price is not None:
        # Keep only the currency token: drop everything up to the last
        # digit-then-space, and the final space-separated word.
        currency = u.get_currency_code(
            re.sub(r'^.*\d | [^ ]*$', '', price.strip()))

    if price is not None:
        price = u.string_to_float(
            re.sub(r' \D*$', '',
                   price.encode('utf-8').strip()).replace(" ",
                                                          "").replace(" ", ""))

    return price, price_old, currency
예제 #4
0
def get_fnac_prices(response):
    """Extract (price, price_old, currency) from a Fnac product page.

    Both old and current prices are split into an integer text node and
    a "<currency sign><cents>" <sup> node; the parts are re-joined with
    "," before conversion.  Any element may be None when missing.
    """
    x_price = response.xpath('//div[' + u.x_class('f-priceBox') + ']')
    x_price_old = x_price.xpath(
        './/span[' + u.x_class('f-priceBox-price f-priceBox-price--old') + ']')
    x_price_new = x_price.xpath(
        './/span[' + u.x_class('f-priceBox-price f-priceBox-price--reco') +
        ']')

    price_old = x_price_old.xpath('./text()').extract_first()
    price_cent_old = x_price_old.xpath('./sup/text()').extract_first()
    if price_old is not None:
        if price_cent_old is not None:
            # Skip the currency sign (first char of the <sup> text).
            price_old = u.string_to_float(
                (price_old.encode('utf-8').strip() +
                 "," + price_cent_old[1:].encode('utf-8').strip()).replace(
                     " ", "").replace(" ", ""))
        else:
            # No cents: the integer text itself ends with the sign.
            price_old = u.string_to_float(
                price_old[:-1].encode('utf-8').strip().replace(" ",
                                                               "").replace(
                                                                   " ", ""))

    price = x_price_new.xpath('./text()').extract_first()
    price_cent = x_price_new.xpath('./sup/text()').extract_first()

    # Currency sign sits at the start of the <sup> when cents exist,
    # otherwise at the end of the main price text.
    currency = None
    if price_cent is not None:
        currency = u.get_currency_code(price_cent[:1].strip())
    elif price is not None:
        currency = u.get_currency_code(price[-1:].strip())

    if price is not None:
        if price_cent is not None:
            price = u.string_to_float(
                (price.encode('utf-8').strip() +
                 "," + price_cent[1:].encode('utf-8').strip()).replace(
                     " ", "").replace(" ", ""))
        else:
            price = u.string_to_float(
                price[:-1].encode('utf-8').strip().replace(" ", "").replace(
                    " ", ""))

    return price, price_old, currency
예제 #5
0
def get_darty_prices(response):
    """Extract (price, price_old, currency) from a Darty product page.

    Current price and currency come from itemprop <meta> tags; the
    crossed-out old price is scraped from its display markup.  Any
    element of the returned tuple may be None when missing.
    """
    x_price = response.xpath('//div[' + u.x_class('product_infos') + ']')

    price_old = x_price.xpath('.//span[' + u.x_class('darty_prix_barre_cont') +
                              ']/span[' + u.x_class('darty_prix_barre') +
                              ']/text()').extract_first()
    price_old_cent = x_price.xpath('.//span[' +
                                   u.x_class('darty_prix_barre_cont') +
                                   ']/span[' +
                                   u.x_class('darty_cents darty_prix_barre') +
                                   ']/text()').extract_first()

    if price_old is not None:
        # Raw strings for the patterns: '\D' in a plain literal is a
        # deprecated unknown escape sequence (behavior unchanged).
        # Non-digits are blanked to spaces, then every space removed.
        if price_old_cent is not None:
            price_old = u.string_to_float(
                (re.sub(r'\D', ' ',
                        price_old.encode('utf-8').strip()) + "," +
                 re.sub(r'\D', ' ',
                        price_old_cent.encode('utf-8').strip())).replace(
                            " ", "").replace(" ", ""))
        else:
            price_old = u.string_to_float(
                re.sub(r'\D', ' ',
                       price_old.encode('utf-8').strip()).replace(" ",
                                                                  "").replace(
                                                                      " ", ""))

    price = x_price.xpath(
        './/meta[@itemprop="price"]/@content').extract_first()
    if price is not None:
        price = u.string_to_float(price.encode('utf-8').strip())

    currency = x_price.xpath(
        './/meta[@itemprop="priceCurrency"]/@content').extract_first()
    if currency is not None:
        currency = currency.strip()

    return price, price_old, currency
예제 #6
0
def get_cdiscount_prices(response):
    """Extract (price, price_old, currency) from a Cdiscount page.

    The current price comes from the price span's @content attribute,
    the currency from an itemprop <meta>.  Any element of the returned
    tuple may be None when the matching node is missing.
    """
    x_price = response.xpath('//div[@id="fpBlocPrice"]')

    # Crossed-out price: keep only the text before the first space.
    price_old = x_price.xpath('.//span[' + u.x_class('fpStriked') +
                              ']/text()').extract_first()
    if price_old is not None:
        price_old = u.string_to_float(
            re.sub(' .*$', '',
                   price_old.encode('utf-8').strip()).replace(" ", "").replace(
                       " ", ""))

    price = x_price.xpath(
        './/span[' + u.x_class('fpPrice price jsMainPrice jsProductPrice') +
        ']/@content').extract_first()
    if price is not None:
        price = u.string_to_float(
            price.encode('utf-8').strip().replace(" ", "").replace(" ", ""))

    currency = x_price.xpath(
        './/meta[@itemprop="priceCurrency"]/@content').extract_first()
    # BUG FIX: this guard used to test `price is not None`, raising an
    # AttributeError on .strip() when the currency meta was missing (and
    # skipping the strip when only the price was missing).  Guard on
    # `currency` itself, as the sibling scrapers do.
    if currency is not None:
        currency = currency.strip()

    return price, price_old, currency
예제 #7
0
def get_auchan_prices(response):
    """Extract (price, price_old, currency) from an Auchan product page.

    Current price and currency come from itemprop <meta> tags; the old
    price is scraped from a <del> element.  Any element of the returned
    tuple may be None when the matching node is missing.
    """
    x_price = response.xpath('//div[' + u.x_class('pricesBlock') + ']')

    price_old = x_price.xpath('.//del[' +
                              u.x_class('product-price--oldPrice') +
                              ']/text()').extract_first()
    if price_old is not None:
        # Drop the final space-separated token (the currency), then
        # remove the remaining spaces before conversion.
        price_old = u.string_to_float(
            re.sub(' [^ ]*$', '',
                   price_old.encode('utf-8').strip()).replace(" ", "").replace(
                       " ", ""))

    price = x_price.xpath(
        './/meta[@itemprop="price"]/@content').extract_first()
    if price is not None:
        price = u.string_to_float(
            price.encode('utf-8').strip().replace(" ", "").replace(" ", ""))

    currency = x_price.xpath(
        './/meta[@itemprop="priceCurrency"]/@content').extract_first()
    if currency is not None:
        currency = currency.strip()

    return price, price_old, currency
예제 #8
0
def get_boulanger_prices(response):
    """Extract (price, price_old, currency) from a Boulanger page.

    Integer part ("exponent") and cents ("fraction") are scraped from
    separate spans and re-joined with ","; the currency sign is the
    direct <sup> text.  Any element may be None when missing.
    """
    x_info = response.xpath('//div[' + u.x_class('informations') + ']')
    x_price = x_info.xpath('.//div[' + u.x_class('price') + ']')

    # Crossed-out price: integer and cents parts of the struck span.
    price_old = x_price.xpath('./span[' +
                              u.x_class('productStrikeoutPrice on') +
                              ']//span[' + u.x_class('exponent') +
                              ']/text()').extract_first()
    price_cent_old = x_price.xpath('./span[' +
                                   u.x_class('productStrikeoutPrice on') +
                                   ']//sup/span[' + u.x_class('fraction') +
                                   ']/text()').extract_first()
    if price_old is not None:
        if price_cent_old is not None:
            price_old = u.string_to_float(
                (price_old.encode('utf-8').strip() +
                 "," + price_cent_old.encode('utf-8').strip()).replace(
                     " ", "").replace(" ", ""))
        else:
            price_old = u.string_to_float(
                price_old.encode('utf-8').strip().replace(" ",
                                                          "").replace(" ", ""))

    # Current price: same exponent/fraction layout under the <p>.
    price = x_price.xpath('./p/span[' + u.x_class('exponent') +
                          ']/text()').extract_first()
    price_cent = x_price.xpath('./p/sup/span[' + u.x_class('fraction') +
                               ']/text()').extract_first()
    if price is not None:
        if price_cent is not None:
            price = u.string_to_float(
                (price.encode('utf-8').strip() + "," +
                 price_cent.encode('utf-8').strip()).replace(" ", "").replace(
                     " ", ""))
        else:
            price = u.string_to_float(
                price.encode('utf-8').strip().replace(" ",
                                                      "").replace(" ", ""))

    # The currency sign is the <sup>'s own text node (the cents live in
    # a nested <span>), so this works with or without a fraction part.
    currency = x_price.xpath('./p/sup/text()').extract_first()
    if currency is not None:
        currency = u.get_currency_code(currency.strip())

    return price, price_old, currency
예제 #9
0
    def parse(self, response):
        """Single Darty callback: follow listing pagination, queue
        product pages, and scrape product pages into Product items.

        Each branch is gated on a root node specific to one page type,
        so the same callback serves every request this spider issues.
        """

        # Yield list pages.
        x_pagination = response.xpath(
            '//body[@id="darty_liste_produit"]//div[@id="main_pagination_top"]/div['
            + u.x_class('darty_product_list_pages_list') + ']')
        if x_pagination:
            url_next_page = x_pagination.xpath(
                './a[text()=" Page suivante"][last()]/@href').extract_first()
            if url_next_page is not None:
                yield Request(self.base_url + url_next_page.strip(),
                              callback=self.parse)

        # Yield product pages.
        x_list = response.xpath(
            '//body[@id="darty_liste_produit"]//div[@id="main_products_list"]')
        if x_list:
            urls = x_list.xpath('.//div[' + u.x_class('infos_container') +
                                ']/h2/a/@href').extract()
            for url in urls:
                url = self.base_url + url.strip()
                # Deduplicate by URL hash so a product reachable from
                # several listings is requested only once.
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if open_ssl_hash not in self.already_crawled:
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//body[@id="page_product"]')
        if x_product:
            item = Product()

            # Categories: li[2] of the breadcrumb is the main category,
            # li[3..last-1] are sub-categories (last is the product).
            x_categories = response.xpath('//ul[@id="dartyCom_fil_ariane"]')

            main_category = x_categories.xpath(
                './li[2]/a/text()').extract_first()
            if main_category is not None:
                main_category = main_category.strip()

            categories = x_categories.xpath(
                './li[position() >= 3 and position() < last()]/a/text()'
            ).extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Brand
            brand = response.xpath(
                '//a[@id="darty_product_brand"]/text()').extract_first()
            if brand is not None:
                brand = brand.strip()

            # Name: join every text node, then collapse whitespace.
            name = re.sub(
                ' +', ' ', ''.join(
                    response.xpath('//h1[' + u.x_class('product_head') +
                                   ']//div[' + u.x_class('product_name') +
                                   ']/span//text()').extract()).replace(
                                       '\n', '').replace('\r', '').strip())

            # Price
            price, price_old, currency = p.get_darty_prices(response)

            # Image
            src = response.xpath(
                '//div[' +
                u.x_class('darty_product_picture_main_pic_container') +
                ']/div[1]//img/@src').extract_first()
            if src is not None:
                src = src.strip()

            # Avis (customer reviews)
            x_avis = response.xpath('//div[' +
                                    u.x_class('bloc_reviews_resume') + ']')

            rate = x_avis.xpath(
                '//meta[@itemprop="ratingValue"]/@content').extract_first()
            if rate is not None:
                rate = u.string_to_float(rate.strip())

            # NOTE(review): non-digits become spaces before int(); this
            # relies on the text containing one contiguous number —
            # confirm against live pages.
            max_rate = x_avis.xpath('//div[' + u.x_class('bloc_reviews_note') +
                                    ']/sub/text()').extract_first()
            if max_rate is not None:
                max_rate = int(re.sub('\D', ' ', max_rate.strip()))

            nb_avis = x_avis.xpath(
                '//meta[@itemprop="ratingCount"]/@content').extract_first()
            if nb_avis is not None:
                nb_avis = int(re.sub('\D', ' ', nb_avis.strip()))

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = main_category
            item['categories'] = categories
            item['brand'] = brand
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            item["max_rate"] = max_rate
            item["nb_avis"] = nb_avis
            # Seed the price history with today's snapshot.
            item["price_history"] = [{
                'date': time.strftime("%Y/%m/%d"),
                'price_old': price_old,
                'price': price,
                'currency': currency
            }]

            yield item
    def parse(self, response):
        """Single Fnac callback: follow listing pagination, queue
        product pages, and scrape product pages into Product items."""

        # Yield list pages.
        x_pagination = response.xpath('//ul[' +
                                      u.x_class('Pager bottom-toolbar') + ']')
        if x_pagination:
            url_next_page = x_pagination.xpath(
                './/a[' + u.x_class('prevnext actionNext') +
                ']/@href').extract_first()
            if url_next_page is not None:
                # Pagination links are absolute here (no base_url join).
                yield Request(url_next_page, callback=self.parse)

        # Yield product pages.
        x_list = response.xpath('//ul[' + u.x_class('articleList') + ']')
        if x_list:
            urls = x_list.xpath('.//p[' + u.x_class('Article-desc') +
                                ']/a/@href').extract()
            for url in urls:
                # Deduplicate by URL hash.
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if open_ssl_hash not in self.already_crawled:
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//div[' + u.x_class('f-productPage') + ']')
        if x_product:
            item = Product()

            # Categories: li[2] is the main category, li[3..] the rest.
            x_categories = response.xpath('//ul[' + u.x_class('f-breadcrumb') +
                                          ']')

            main_category = x_categories.xpath(
                './li[2]/a/text()').extract_first()
            if main_category is not None:
                main_category = main_category.strip()

            categories = x_categories.xpath(
                './li[position() >= 3]/a/text()').extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Name
            # NOTE(review): extract_first() may return None, which would
            # raise AttributeError on .strip() — confirm the title node
            # always exists on product pages.
            name = response.xpath('//h1[' +
                                  u.x_class('f-productHeader-Title') +
                                  ']/text()').extract_first().strip()

            # Price
            price, price_old, currency = p.get_fnac_prices(response)

            # Image
            src = response.xpath('//img[' +
                                 u.x_class('f-productVisuals-mainMedia') +
                                 ']/@src').extract_first()
            if src is not None:
                src = src.strip()

            # Avis (customer reviews)
            x_avis = response.xpath('//div[' + u.x_class('f-review-header') +
                                    ']')

            rate = x_avis.xpath('.//div[' + u.x_class('f-review-headerRate') +
                                ']/text()').extract_first()
            if rate is not None:
                rate = u.string_to_float(rate.strip())

            # Displayed as "/5" — strip the slash before converting.
            max_rate = x_avis.xpath('.//span[' +
                                    u.x_class('f-review-headerRateTotal') +
                                    ']/text()').extract_first()
            if max_rate is not None:
                max_rate = u.string_to_float(max_rate.strip().replace("/", ""))

            nb_avis = response.xpath('//div[' +
                                     u.x_class('f-productHeader-review') +
                                     ']//span[' +
                                     u.x_class('f-productHeader-reviewLabel') +
                                     ']/text()').extract_first()
            if nb_avis is not None:
                nb_avis = u.string_to_float(re.sub("\D", "", nb_avis.strip()))

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = main_category
            item['categories'] = categories
            item['brand'] = None
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            item["max_rate"] = max_rate
            item["nb_avis"] = nb_avis
            # Seed the price history with today's snapshot.
            item["price_history"] = [{
                'date': time.strftime("%Y/%m/%d"),
                'price_old': price_old,
                'price': price,
                'currency': currency
            }]

            # Placeholder detected: substitute the bundled default image
            # for the downloaded one.
            if src == self.src_no_image:
                copyfile(
                    "data/default.jpg", "data/" + self.name + "/img/" +
                    item["image_name"] + ".jpg")

            yield item
    def parse(self, response):
        """Single Auchan callback: follow listing pagination, queue
        product pages, and scrape product pages into Product items."""

        # Yield list pages.
        x_pagination = response.xpath('//nav[' + u.x_class('ui-pagination') + ']')
        if x_pagination:
            url_next_page = x_pagination.xpath('.//a[' + u.x_class('ui-pagination--next') + ']/@href').extract_first()
            if url_next_page is not None:
                yield Request(self.base_url + url_next_page.strip(), callback=self.parse)

        # Yield product pages.
        x_list = response.xpath('//div[' + u.x_class('product-list--container') + ']')
        if x_list:
            urls = x_list.xpath('.//div[' + u.x_class('product-item--wrapper') + ']/a/@href').extract()
            for url in urls:
                url = self.base_url + url.strip()
                # Deduplicate by URL hash.
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if open_ssl_hash not in self.already_crawled:
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//div[' + u.x_class('product-detail') + ']')
        if x_product:
            item = Product()

            # Categories come from the microdata breadcrumb: span[2] is
            # the main category, span[3..last-1] the sub-categories.
            x_categories = response.xpath('//div[' + u.x_class('ui-breadcrumb--scroller') + ']/nav')

            main_category = x_categories.xpath('./span[2]/meta[@itemprop="name"]/@content').extract_first()
            if main_category is not None:
                main_category = main_category.strip()

            categories = x_categories.xpath('./span[position() >= 3 and position() < last()]/meta[@itemprop="name"]/@content').extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Brand
            x_brand_name = response.xpath('//div[' + u.x_class('product-detail--wrapper') + ']')

            brand = x_brand_name.xpath('./meta[@itemprop="brand"]/@content').extract_first()
            if brand is not None:
                brand = brand.strip()

            # Name
            # NOTE(review): extract_first() may return None here, which
            # would raise on .replace() — confirm the title always exists.
            name = x_brand_name.xpath('./h1[' + u.x_class('product-detail--title') + ']/text()').extract_first().replace('\n', '').replace('\r', '').strip()

            # Price
            price, price_old, currency = p.get_auchan_prices(response)

            # Image
            src = response.xpath('//div[' + u.x_class('x-scroller') + ']/label[1]//img/@src').extract_first()
            if src is not None:
                src = src.strip()

            # Avis (customer reviews)
            x_avis = response.xpath('//div[' + u.x_class('product-detail--rating') + ']')

            rate = x_avis.xpath('.//meta[@itemprop="ratingValue"]/@content').extract_first()
            if rate is not None:
                rate = u.string_to_float(rate.strip())

            nb_avis = x_avis.xpath('.//meta[@itemprop="reviewCount"]/@content').extract_first()
            if nb_avis is not None:
                nb_avis = int(nb_avis.strip())

            # Max rate = number of star icons in the rating background.
            max_rate = x_avis.xpath('.//span[' + u.x_class('ui-rating--background') + ']/i[' + u.x_class('icon-auchan-82') + ']').extract()
            max_rate = len(max_rate) if max_rate else None

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = main_category
            item['categories'] = categories
            item['brand'] = brand
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            item["max_rate"] = max_rate
            item["nb_avis"] = nb_avis
            # Seed the price history with today's snapshot.
            item["price_history"] = [{'date': time.strftime("%Y/%m/%d"), 'price_old': price_old, 'price': price, 'currency': currency}]

            yield item
    def parse(self, response):
        """Single Cdiscount callback: fan out to every listing page once,
        queue product pages (capped at 300), and scrape product pages."""

        # Yield list pages.  Only the first listing response enumerates
        # the pagination, so page requests are not re-issued per page.
        x_pagination = response.xpath('//ul[@id="PaginationForm_ul"]')
        if x_pagination and self.first:
            self.first = False
            nb_page = x_pagination.xpath('./li[last()]/a/text()').extract_first()
            if nb_page is not None:
                for x in range(1, int(nb_page.strip())):
                    # NOTE(review): builds page URLs by splicing "-<n>"
                    # before the last 5 chars of the URL — assumes a
                    # 5-char suffix like ".html"; confirm.
                    yield Request(response.url[:-5] + "-" + str(x) + response.url[-5:], callback=self.parse)

        # Yield product pages.
        x_list = response.xpath('//ul[@id="lpBloc"]')
        if x_list:
            urls = x_list.xpath('.//div[' + u.x_class('prdtBILDetails') + ']/a/@href').extract()
            for url in urls:
                url = url.strip()
                # Deduplicate by URL hash and cap the crawl at 300 items.
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if open_ssl_hash not in self.already_crawled and self.nb_crawled < 300 :
                    self.nb_crawled += 1
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//h1[@itemprop="name"]')
        if x_product:
            item = Product()

            # Categories: li[3] of the breadcrumb is the main category,
            # li[4..last-1] are sub-categories.
            x_categories = response.xpath('//div[@id="bc"]')

            main_category = x_categories.xpath('.//li[3]//span/text()').extract_first()
            if main_category is not None:
                main_category = main_category.strip()

            categories = x_categories.xpath('.//li[position() >= 4 and position() < last()]//span/text()').extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Brand
            brand = response.xpath('//table[' + u.x_class('fpDescTb fpDescTbPub') + ']//span[@itemprop="brand"]//span[@itemprop="name"]/text()').extract_first()
            if brand is not None:
                brand = brand.strip()

            # Name: collapse repeated spaces in the <h1> text.
            name = re.sub(' +', ' ', x_product.xpath('./text()').extract_first().strip())

            # Price
            price, price_old, currency = p.get_cdiscount_prices(response)

            # Image
            src = response.xpath('//div[' + u.x_class('fpMainImg') + ']/a[@itemprop="image"]/@href').extract_first()
            if src is not None:
                src = src.strip()

            # Avis (customer reviews)
            x_avis = response.xpath('//div[' + u.x_class('topMainRating') + ']')

            rate = x_avis.xpath('//span[@itemprop="ratingValue"]/text()').extract_first()
            if rate is not None:
                rate = u.string_to_float(rate.strip())

            nb_avis = x_avis.xpath('//span[@itemprop="ratingCount"]/text()').extract_first()
            if nb_avis is not None:
                nb_avis = int(nb_avis.strip())

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = main_category
            item['categories'] = categories
            item['brand'] = brand
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            # Rating is out of 5 on Cdiscount (hard-coded).
            item["max_rate"] = 5
            item["nb_avis"] = nb_avis
            # Seed the price history with today's snapshot.
            item["price_history"] = [{'date': time.strftime("%Y/%m/%d"), 'price_old': price_old, 'price': price, 'currency': currency}]

            yield item
    def parse(self, response):
        """Single Boulanger callback: follow listing pagination, queue
        product pages, and scrape product pages into Product items."""

        # Yield list pages.
        x_pagination = response.xpath('//div[' + u.x_class('navigationListe') +
                                      ']')
        if x_pagination:
            url_next_page = x_pagination.xpath(
                './/span[' + u.x_class('navPage navPage-right') +
                ']/a/@href').extract_first()
            if url_next_page is not None:
                yield Request(self.base_url + url_next_page.strip(),
                              callback=self.parse)

        # Yield product pages.
        x_list = response.xpath('//div[' + u.x_class('productListe') + ']')
        if x_list:
            urls = x_list.xpath('.//div[' + u.x_class('designations') +
                                ']/h2/a/@href').extract()
            for url in urls:
                url = self.base_url + url.strip()
                # Deduplicate by URL hash.
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if open_ssl_hash not in self.already_crawled:
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//h1[@itemprop="name"]')
        if x_product:
            item = Product()

            # Categories: li[2] of the breadcrumb is the main category,
            # li[3..last] are sub-categories.
            x_categories = response.xpath('//div[@id="filAriane"]')

            main_category = x_categories.xpath(
                './/li[2]//a/text()').extract_first()
            if main_category is not None:
                main_category = main_category.strip()

            categories = x_categories.xpath(
                './/li[position() >= 3 and position() <= last()]//a/text()'
            ).extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Name: join all text nodes, then collapse whitespace.
            name = re.sub(
                ' +', ' ',
                ''.join(x_product.xpath('./text()').extract()).replace(
                    '\n', '').replace('\r', '').strip())

            # Price
            price, price_old, currency = p.get_boulanger_prices(response)

            # Image: built from the product's GTIN via the scene7 CDN.
            src = response.xpath(
                '//span[@itemprop="gtin13"]/text()').extract_first()
            if src is not None:
                src = "https://boulanger.scene7.com/is/image/Boulanger/" + src.strip(
                ) + "_h_f_l_0"

            # Avis (customer reviews)
            x_avis = response.xpath('//div[' + u.x_class('top') + ']/div[' +
                                    u.x_class('right') + ']//span[' +
                                    u.x_class('rating') + ']')

            # The rating is encoded as digits inside the CSS class name;
            # "0" means unrated.  Two digits are read as "X,Y" (e.g.
            # "45" -> 4,5) before conversion.
            rate = x_avis.xpath('./@class').extract_first()
            if rate is not None:
                rate = re.sub('\D', '', rate.strip())
                if rate != "0":
                    if len(rate) > 1:
                        rate = rate[:1] + "," + rate[1:]
                    rate = u.string_to_float(rate)
                else:
                    rate = None

            nb_avis = x_avis.xpath('./span[' + u.x_class('link') +
                                   ']/text()').extract_first()
            if nb_avis is not None:
                nb_avis = int(re.sub('\D', '', nb_avis.strip()))

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = main_category
            item['categories'] = categories
            item['brand'] = None
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            # Rating is out of 5 on Boulanger (hard-coded).
            item["max_rate"] = 5
            item["nb_avis"] = nb_avis
            # Seed the price history with today's snapshot.
            item["price_history"] = [{
                'date': time.strftime("%Y/%m/%d"),
                'price_old': price_old,
                'price': price,
                'currency': currency
            }]

            yield item
    def parse(self, response):
        """Handle pagination, product-list and product pages for the store.

        This single callback serves three page shapes, checked in order:

        1. paginated list pages  -> follow the "next page" link;
        2. product-list tables   -> follow each product URL (deduplicated
           via an openssl hash of the URL);
        3. product pages         -> build and yield a ``Product`` item.

        :param response: the Scrapy response being parsed.
        :yields: ``Request`` objects for further pages, and ``Product``
            items for product pages.
        """
        # --- Pagination: follow the "next page" link if present. ---
        x_pagination = response.xpath('//ul[' +
                                      u.x_class('pagination pagination-sm') +
                                      ']')
        if x_pagination:
            # The last <li> normally holds the "next" link; when it does
            # not (no href), fall back to the second-to-last <li>.
            url_next_page = x_pagination.xpath(
                './li[position() = last()]/a/@href').extract_first()
            if url_next_page is None:
                url_next_page = x_pagination.xpath(
                    './li[position() = (last() - 1)]/a/@href').extract_first()
            if url_next_page is not None:
                yield Request(self.base_url + url_next_page,
                              callback=self.parse)

        # --- Product list: queue each product page exactly once. ---
        x_list = response.xpath('//table[' + u.x_class('ProdList') + ']')
        if x_list:
            urls = x_list.xpath('.//td[' + u.x_class('Photo') +
                                ']/span/@data-href').extract()
            for url in urls:
                url = self.base_url + url
                open_ssl_hash = u.generate_open_ssl_hash(url)
                # NOTE(review): ``already_crawled`` is used as a list here,
                # so this membership test is O(n) per URL; a set would be
                # faster if the attribute's type can be changed where it is
                # initialised — verify other users of the attribute first.
                if open_ssl_hash not in self.already_crawled:
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # --- Product page: extract all fields and yield the item. ---
        x_product = response.xpath('//div[@id="prod"]')
        if x_product:
            item = Product()

            # Categories: breadcrumb entries between the brand (li[2]) and
            # the product itself (last li), whitespace-stripped.
            x_categories = response.xpath('//nav[@id="breadcrumb"]')
            categories = [
                category.strip() for category in x_categories.xpath(
                    './/li[position() >= 3 and position() < last()]/a/text()'
                ).extract()
            ]

            # Brand: second breadcrumb entry.
            brand = x_categories.xpath('.//li[2]/a/text()').extract_first()
            if brand is not None:
                brand = brand.strip()

            # Name: concatenated title text, with CR/LF removed and runs
            # of spaces collapsed to a single space.
            name = re.sub(
                ' +', ' ', ''.join(
                    response.xpath(
                        '//h1[@id="ProdTitle"]//text()').extract()).replace(
                            '\n', '').replace('\r', '').strip())

            # Prices: (current, old, currency) via the shared price helper.
            price, price_old, currency = p.get_materiel_net_prices(response)

            # Image: prefer the gallery zoom image; fall back to the
            # single-image container's zoom attribute.
            src = response.xpath('//div[' + u.x_class('swiper-wrapper') +
                                 ']//a/@data-zoom-image').extract_first()
            if src is None:
                src = response.xpath(
                    '//div[@id="container-image"]/@data-zoom-image'
                ).extract_first()
            if src is not None:
                src = src.strip()

            # Reviews: rating, rating scale and review count.
            x_avis = response.xpath('//div[' + u.x_class('headerAvisClients') +
                                    ']')

            rate = x_avis.xpath('.//span[' + u.x_class('noteUser') +
                                ']/text()').extract_first()
            if rate is not None:
                rate = u.string_to_float(rate.strip())

            max_rate = x_avis.xpath(
                './/span[' + u.x_class('noteUser') +
                ']/following-sibling::span[1]/text()').extract_first()
            if max_rate is not None:
                max_rate = u.string_to_float(max_rate.strip())

            nb_avis = x_avis.xpath(
                './/span[@id="avisCount"]/span/text()').extract_first()
            if nb_avis is not None:
                nb_avis = int(nb_avis.strip())

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = "Informatique"
            item['categories'] = categories
            item['brand'] = brand
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            # NOTE(review): when no image is found this stores [None];
            # presumably the image pipeline tolerates that — verify.
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            item["max_rate"] = max_rate
            item["nb_avis"] = nb_avis
            item["price_history"] = [{
                'date': time.strftime("%Y/%m/%d"),
                'price_old': price_old,
                'price': price,
                'currency': currency
            }]

            yield item