Python productSpiderItem примеры, proto_spider.items.productSpiderItem Python примеры использования

Пример #1

0

Показать файл

    def parse_product(self, response):
        """Scrape one One Kings Lane product page into a ``productSpiderItem``.

        Yields a single item when the page mentions FSC in a paragraph and
        exposes both an image and a price in the HTML that was downloaded;
        yields nothing otherwise.

        Args:
            response: Page response; only its ``xpath()`` method is used.
        """
        # Only products whose page text mentions FSC are kept.
        if not response.xpath('//p[text()[contains(.,"FSC")]]').extract():
            return

        product_name = response.xpath(
            '//div[@class="ml-product-name"]/div/text()').extract_first()
        product_url = response.xpath(
            '//link[@rel="alternate"]/@href').extract_first()
        image = response.xpath(
            '//div[@class="ml-product-alt-detail-image"]//img/@src'
        ).extract_first()
        # Some product pages render their images dynamically, so the image
        # can be absent from the downloaded HTML; skip those products.
        if image is None:
            return

        description = response.xpath(
            '//meta[@name="description"]/@content').extract_first()
        original_price = response.xpath(
            '//span[@class="ml-item-price"]/text()').extract_first()
        # extract_first() returns None when the price span is missing;
        # without a price the item cannot be built, so skip the product.
        if original_price is None:
            return

        # The price arrives as text that may contain '$' and thousands
        # separators; strip both and store it as a one-element list of floats.
        final_price = [float(original_price.replace(',', '').replace('$', ''))]

        # Every product that passed the FSC check above gets this certification.
        certifications = [{
            "certification":
            "Forest Stewardship (FSC) Certified",
            "title":
            "FSC Certified products help reduce deforestation by ensuring products have been manufactured with recycled wood materials or have originated from sustainably managed forests.",
            "url":
            "https://us.fsc.org/en-us/what-we-do/mission-and-vision"
        }]

        load_item = productSpiderItem()

        load_item['sitename'] = 'One Kings Lane'
        load_item['productname'] = product_name
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = final_price
        load_item['certifications'] = certifications
        load_item['description'] = description
        load_item['lowestprice'] = final_price[0]
        yield load_item

Пример #2

0

Показать файл

Файл: franceandson.py Проект: r-wirt/eco-furniture-crawlers

    def parse_product(self, response):
        """Scrape one France and Son product page into a ``productSpiderItem``.

        Yields a single item for products that are not marked sold out and
        that publish a price; yields nothing otherwise.

        Args:
            response: Page response; only its ``xpath()`` method is used.
        """
        sold_out = response.xpath(
            '//span[@class="product--badge badge--soldout"]/text()').extract()
        # A non-empty result means the sold-out badge is on the page:
        # stop here so that sold-out products are not emitted.
        if sold_out:
            return

        product_name = response.xpath(
            '//meta[@property="og:title"]/@content').extract_first()
        product_url = response.xpath(
            '//meta[@property="og:url"]/@content').extract_first()
        image = response.xpath(
            '//meta[@property="og:image"]/@content').extract_first()
        description = response.xpath(
            '//meta[@property="og:description"]/@content').extract_first()
        original_price = response.xpath(
            '//meta[@property="og:price:amount"]/@content').extract_first()
        # extract_first() returns None when the price meta tag is missing;
        # without a price the item cannot be built, so skip the product.
        if original_price is None:
            return

        # The price arrives as text that may contain thousands separators;
        # strip them and store the value as a one-element list of floats.
        final_price = [float(original_price.replace(',', ''))]

        # The FSC certification is attached to every item from this store;
        # this method does not check the page for it.
        certifications = [{
            "certification":
            "Forest Stewardship (FSC) Certified",
            "title":
            "FSC Certified products help reduce deforestation by ensuring products have been manufactured with recycled wood materials or have originated from sustainably managed forests.",
            "url":
            "https://us.fsc.org/en-us/what-we-do/mission-and-vision"
        }]

        load_item = productSpiderItem()

        load_item['sitename'] = 'France and Son'
        load_item['productname'] = product_name
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = final_price
        load_item['certifications'] = certifications
        load_item['description'] = description
        load_item['lowestprice'] = final_price[0]
        yield load_item

Пример #3

0

Показать файл

Файл: stemgoods.py Проект: r-wirt/eco-furniture-crawlers

    def parse_product(self, response):
        """Scrape one Stem product page into a ``productSpiderItem``.

        Yields a single item when the product detail columns mention FSC and
        the page publishes a price; yields nothing otherwise.

        Args:
            response: Page response; only its ``xpath()`` method is used.
        """
        # Only products whose detail columns mention FSC are kept.
        fsc_mentions = response.xpath(
            '//div[@class="row multi-column-row"]/div[contains(.,"FSC")]'
        ).extract()
        if not fsc_mentions:
            return

        product_name = response.xpath(
            '//meta[@property="og:title"]/@content').extract_first()
        product_url = response.xpath(
            '//meta[@property="og:url"]/@content').extract_first()
        image = response.xpath(
            '//meta[@property="og:image"]/@content').extract_first()
        description = response.xpath(
            '//meta[@property="og:description"]/@content').extract_first()
        string_of_price = response.xpath(
            '//meta[@property="og:price:amount"]/@content').extract_first()
        # extract_first() returns None when the price meta tag is missing;
        # without a price the item cannot be built, so skip the product.
        if string_of_price is None:
            return

        # Strip thousands separators and store the price as a one-element
        # list of floats.
        final_price = [float(string_of_price.replace(',', ''))]

        # Every product that passed the FSC check above gets this certification.
        certifications = [{
            "certification":
            "Forest Stewardship (FSC) Certified",
            "title":
            "FSC Certified products help reduce deforestation by ensuring products have been manufactured with recycled wood materials or have originated from sustainably managed forests.",
            "url":
            "https://us.fsc.org/en-us/what-we-do/mission-and-vision"
        }]
        # Only one price is scraped per product, so it is also the lowest.
        lowest_price = final_price[0]

        load_item = productSpiderItem()

        load_item['sitename'] = 'Stem'
        load_item['productname'] = product_name
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = final_price
        load_item['certifications'] = certifications
        load_item['description'] = description
        load_item['lowestprice'] = lowest_price
        yield load_item

Пример #4

0

Показать файл

Файл: a2modern.py Проект: r-wirt/eco-furniture-crawlers

    def parse_product(self, response):
        """Scrape one 2Modern product page into a ``productSpiderItem``.

        Yields a single item when the page publishes an image and at least
        one price; yields nothing otherwise.

        Args:
            response: Page response; only its ``xpath()`` method is used.
        """
        product_name = response.xpath(
            '//meta[@property="og:title"]/@content').extract_first()
        product_url = response.xpath(
            '//meta[@property="og:url"]/@content').extract_first()
        image_path = response.xpath(
            '//meta[@property="og:image"]/@content').extract_first()
        # extract_first() returns None when the tag is missing; the image URL
        # cannot be built in that case, so skip the product.
        if image_path is None:
            return
        # Prefix the scheme and swap the 'medium' rendition for the 'large' one.
        image = 'https:' + image_path.replace('medium', 'large')
        description = response.xpath(
            '//meta[@property="og:description"]/@content').extract_first()
        # One string per price meta tag; each may contain thousands separators.
        original_price_span = response.xpath(
            '//meta[@property="og:price:amount"]/@content').extract()

        final_price_span = [
            float(price.replace(',', '')) for price in original_price_span
        ]
        # No price tag at all: there is nothing to index for "lowestprice".
        if not final_price_span:
            return

        # The FSC certification is attached to every item from this store;
        # this method does not check the page for it.
        certifications = [{
            "certification":
            "Forest Stewardship (FSC) Certified",
            "title":
            "FSC Certified products help reduce deforestation by ensuring products have been manufactured with recycled wood materials or have originated from sustainably managed forests.",
            "url":
            "https://us.fsc.org/en-us/what-we-do/mission-and-vision"
        }]
        # Elasticsearch sorts products on a single number, so the first price
        # of the range is stored separately.
        lowest_price = final_price_span[0]

        load_item = productSpiderItem()

        load_item['sitename'] = '2Modern'
        load_item['productname'] = product_name
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = final_price_span
        load_item['certifications'] = certifications
        load_item['description'] = description
        load_item['lowestprice'] = lowest_price
        yield load_item

Пример #5

0

Показать файл

Файл: rominafurniture.py Проект: r-wirt/eco-furniture-crawlers

    def parse_product(self, response):
        """Scrape one Romina Furniture product page into a ``productSpiderItem``.

        Yields a single item when a description bullet mentions
        "GREENGUARD Gold" and the page shows a price; yields nothing otherwise.

        Args:
            response: Page response; only its ``xpath()`` method is used.
        """
        # The product may not be certified unless one of its description
        # bullets mentions GREENGUARD Gold, so skip it in that case.
        check_description = response.xpath('//div[@class="std"]//ul/li').extract()
        if not any("GREENGUARD Gold" in bullet for bullet in check_description):
            return

        productname = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
        product_url = response.xpath('//meta[@itemprop="url"]/@content').extract_first()
        image = response.xpath('//ul[@id="imageGallery"]/li//img/@src').extract_first()
        description = response.xpath('//div[@class="std"]/p/text()').extract_first()
        if description is not None:
            # Replace non-breaking spaces with ordinary spaces.
            description = description.replace(u'\xa0', u' ')
        # The site updates the displayed price dynamically, so the scraped
        # value may differ from what a visitor finally sees.
        original_price = response.xpath('//span[@class="amount"]/text()').extract_first()
        # extract_first() returns None when the price span is missing;
        # without a price the item cannot be built, so skip the product.
        if original_price is None:
            return

        # Strip thousands separators and '$', then store the price as a
        # one-element list of floats.
        final_price = [float(original_price.replace(',', '').replace('$', ''))]

        certifications = [{
            "certification": "GREENGUARD Certified",
            "title": (
                "GREENGUARD Certified products contain materials and finishes that have been verified to have low chemical emissions. "
                " As a result, this product improves overall indoor air quality by reducing the presence of harmful "
                " pollutants and airborne chemicals."
            ),
            "url": "http://greenguard.org/en/CertificationPrograms/CertificationPrograms_indoorAirQuality.aspx",
        }]

        load_item = productSpiderItem()

        load_item['sitename'] = 'Romina Furniture'
        load_item['productname'] = productname
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = final_price
        load_item['certifications'] = certifications
        load_item['description'] = description
        # "lowestprice" is a single number (not the price list) so it can be
        # used as a sort key.
        load_item['lowestprice'] = final_price[0]
        yield load_item

Пример #6

0

Показать файл

    def parse_product(self, response):
        """Scrape one Romina Furniture product page into a ``productSpiderItem``.

        Works on the raw page text with regular expressions. Yields a single
        item when the page links to a GREENGUARD title and the name, image and
        price patterns all match; yields nothing otherwise. No description is
        stored on the item.

        Args:
            response: Page response; its ``text`` attribute and ``xpath()``
                method are used.
        """
        page = response.text
        flags = re.I | re.S

        # Stop when no link with a GREENGUARD title is present on the page.
        greenguard_verified = re.search(
            r'<a\s*href="[^"]*?"\s*title="[^"]*?GREENGUARD[^"]*?"', page, flags=flags)
        if not greenguard_verified:
            print("No Greenguard Products Detected")
            return

        name_match = re.search(
            r'itemprop="name"\s*content="([^"]*?)">', page, flags=flags)
        image_match = re.search(
            r'data-src="([^"]*?)"[^>]*?id="ProductPhotoImg">', page, flags=flags)
        # The site updates the displayed price dynamically, so the scraped
        # value may differ from what a visitor finally sees.
        price_match = re.search(
            r'<meta property="og:price:amount"\s*content="([^"]*?)">', page, flags=flags)
        # re.search() returns None when a pattern is absent; the item cannot
        # be built without all three values, so skip the product.
        if name_match is None or image_match is None or price_match is None:
            return

        productname = name_match.group(1)
        product_url = response.xpath('//meta[@itemprop="url"]/@content').extract_first()
        # The captured image path is site-relative; prefix the host.
        image = 'https://rominafurniture.com' + image_match.group(1)

        # Strip thousands separators and '$', then store the price as a
        # one-element list of floats.
        final_price = [float(price_match.group(1).replace(',', '').replace('$', ''))]

        certifications = [{
            "certification": "GREENGUARD Certified",
            "title": (
                "GREENGUARD Certified products contain materials and finishes that have been verified to have low chemical emissions. "
                " As a result, this product improves overall indoor air quality by reducing the presence of harmful "
                " pollutants and airborne chemicals."
            ),
            "url": "http://greenguard.org/en/CertificationPrograms/CertificationPrograms_indoorAirQuality.aspx",
        }]

        load_item = productSpiderItem()

        load_item['sitename'] = 'Romina Furniture'
        load_item['productname'] = productname
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = final_price
        load_item['certifications'] = certifications
        # "lowestprice" is a single number (not the price list) so it can be
        # used as a sort key.
        load_item['lowestprice'] = final_price[0]
        yield load_item

Пример #7

0

Показать файл

Файл: smartfurniture.py Проект: r-wirt/eco-furniture-crawlers

    def parse_product(self, response):
        """Scrape one Smart Furniture product page into a ``productSpiderItem``.

        Yields a single item when the page exposes its relative URL, a
        square-image link and the price-details block; yields nothing
        otherwise. ``price`` and ``lowestprice`` are stored as the raw
        attribute strings, and ``certifications`` is a list of certification
        names.

        Args:
            response: Page response; its ``text`` attribute and ``xpath()``
                method are used.
        """
        productname = response.xpath('//title/text()').extract_first()
        relative_url = response.xpath('//body/div/@data-url').extract_first()
        # Captures the protocol-relative image URL without its leading '//'.
        image = re.search(
            r'<a\s*href="//([^"]*?)">\s*<div\s*class="square-image',
            response.text,
            flags=re.I | re.S)
        description = response.xpath(
            '//meta[@name="description"]/@content').extract_first()
        # Group 1 is the sale price, group 2 the original price.
        pricing = re.search(
            r'<div class="product-price-details\s*"[^>]*?data-sale-price="([^"]*?)"\s*data-original-price="([^"]*?)"',
            response.text,
            flags=re.I | re.S)
        # extract_first() and re.search() return None when nothing matches;
        # the item cannot be built without these values, so skip the product.
        if relative_url is None or image is None or pricing is None:
            return

        product_url = 'https://smartfurniture.com' + relative_url
        lowest_price = pricing.group(1)
        price = pricing.group(2)

        check_greenguard = response.xpath(
            '//*[text()[contains(.,"GREENGUARD")]]')
        check_fsc = response.xpath('//*[text()[contains(.,"FSC")]]')

        # A product may carry either certification, both, or neither;
        # GREENGUARD is listed first when both are mentioned.
        certifications = []
        if check_greenguard:
            certifications.append('GREENGUARD Certified')
        if check_fsc:
            certifications.append('Forest Stewardship Council (FSC) Certified')

        load_item = productSpiderItem()

        load_item['sitename'] = 'Smart Furniture'
        load_item['productname'] = productname
        load_item['producturl'] = product_url
        load_item['image'] = image.group(1)
        load_item['price'] = price
        load_item['certifications'] = certifications
        load_item['description'] = description
        load_item['lowestprice'] = lowest_price
        yield load_item

Пример #8

0

Показать файл

Файл: smartfurniture.py Проект: r-wirt/eco-furniture-crawlers

    def parse_product(self, response):
        """Scrape one Smart Furniture product page into a ``productSpiderItem``.

        Yields a single item when the page exposes its relative URL and at
        least nine links inside the main content block; yields nothing
        otherwise. ``price`` and ``lowestprice`` are stored as the raw
        attribute strings, and ``certifications`` is a list of certification
        names.

        Args:
            response: Page response; only its ``xpath()`` method is used.
        """
        productname = response.xpath('//title/text()').extract_first()
        relative_url = response.xpath('//body/div/@data-url').extract_first()
        # All link targets in the main content; the product image is the one
        # at zero-based index 8.
        content_links = response.xpath(
            '//div[@class="mainContent"]//a/@href').extract()
        # Without the URL or the image link the item cannot be built.
        if relative_url is None or len(content_links) <= 8:
            return

        product_url = 'https://smartfurniture.com' + relative_url
        image = 'https://smartfurniture.com' + content_links[8]
        description = response.xpath(
            '//meta[@name="description"]/@content').extract_first()
        # The page publishes a single price, used for both price fields.
        price = response.xpath(
            '//meta[@itemprop="price"]/@content').extract_first()
        lowest_price = price

        check_greenguard = response.xpath(
            '//*[text()[contains(.,"GREENGUARD")]]')
        check_fsc = response.xpath('//*[text()[contains(.,"FSC")]]')

        # A product may carry either certification, both, or neither;
        # GREENGUARD is listed first when both are mentioned. The selector
        # lists are tested for emptiness rather than indexed, because an
        # empty list has no element 0.
        certifications = []
        if check_greenguard:
            certifications.append('GREENGUARD Certified')
        if check_fsc:
            certifications.append('Forest Stewardship Council (FSC) Certified')

        load_item = productSpiderItem()

        load_item['sitename'] = 'Smart Furniture'
        load_item['productname'] = productname
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = price
        load_item['certifications'] = certifications
        load_item['description'] = description
        load_item['lowestprice'] = lowest_price
        yield load_item

Пример #9

0

Показать файл

Файл: davincibaby.py Проект: r-wirt/eco-furniture-crawlers

    def parse_product(self, response):
        """Scrape one Davinci Baby product page into a ``productSpiderItem``.

        Yields a single item when the first list in the product description
        mentions GREENGUARD and the page publishes a price; yields nothing
        otherwise.

        Args:
            response: Page response; only its ``xpath()`` method is used.
        """
        # extract_first() returns None when the description has no list, so
        # test for that before searching the markup for the certification.
        check_for_gg_cert = response.xpath('//div[@itemprop="description"]//ul').extract_first()
        if check_for_gg_cert is None or 'GREENGUARD' not in check_for_gg_cert:
            return

        product_name = response.xpath('//meta[@property="og:title"]/@content').extract_first()
        product_url = response.xpath('//meta[@property="og:url"]/@content').extract_first()
        image = response.xpath('//meta[@property="og:image"]/@content').extract_first()
        description = response.xpath('//meta[@property="og:description"]/@content').extract_first()
        original_price = response.xpath('//meta[@property="og:price:amount"]/@content').extract_first()
        # Without a price the item cannot be built, so skip the product.
        if original_price is None:
            return

        # The price arrives as text; strip any thousands separators and store
        # it as a one-element list of floats.
        final_price = [float(original_price.replace(',', ''))]

        # Every product that passed the GREENGUARD check above gets this
        # certification.
        certifications = [{
            "certification": "GREENGUARD Certified",
            "title": (
                "GREENGUARD Certified products contain materials and finishes that have been verified to have low chemical emissions. "
                " As a result, this product improves overall indoor air quality by reducing the presence of harmful "
                " pollutants and airborne chemicals."
            ),
            "url": "http://greenguard.org/en/CertificationPrograms/CertificationPrograms_indoorAirQuality.aspx",
        }]

        load_item = productSpiderItem()

        load_item['sitename'] = 'Davinci Baby'
        load_item['productname'] = product_name
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = final_price
        load_item['certifications'] = certifications
        load_item['description'] = description
        load_item['lowestprice'] = final_price[0]
        yield load_item

Пример #10

0

Показать файл

    def parse_product(self, response):
        """Scrape one Pottery Barn Kids product page into a ``productSpiderItem``.

        Prices come from the visible price spans when present, otherwise from
        the low/high offer prices in the page's JSON-LD script. Yields a
        single item when a price can be determined; yields nothing otherwise.

        Args:
            response: Page response; only its ``xpath()`` method is used.
        """
        # Function-scope import keeps this method self-contained.
        import json

        productname = response.xpath(
            '//div[@class="pip-summary"]/h1/text()').extract_first()
        product_url = response.xpath(
            '//head/meta[@property="og:url"]/@content').extract_first()
        image = response.xpath(
            '//head/meta[@property="og:image"]/@content').extract_first()
        description = response.xpath(
            '//head/meta[@property="twitter:description"]/@content'
        ).extract_first()
        # One string per visible price; each may contain thousands separators.
        original_price_span = response.xpath(
            '//section[@class="simple-subset"]//span[@class="price-amount"]/text()'
        ).extract()

        if original_price_span:
            # Four entries are an original price range followed by the
            # marked-down range; only the marked-down pair is kept.
            if len(original_price_span) == 4:
                current_prices = original_price_span[2:]
            else:
                current_prices = original_price_span
            final_price_span = [
                float(price.replace(',', '')) for price in current_prices
            ]
        else:
            # Products sold in several sizes show no single price; their low
            # and high prices are published in the JSON-LD script instead.
            pricing_json = response.xpath(
                '//script[@type="application/ld+json"]/text()'
            ).extract_first()
            try:
                # The script body is JSON, so it is parsed with json.loads;
                # a Python-literal parser rejects true/false/null.
                offers = json.loads(pricing_json)['offers']
                final_price_span = [
                    float(offers['lowPrice']),
                    float(offers['highPrice']),
                ]
            except (TypeError, ValueError, KeyError):
                # Missing script (None), malformed JSON, or no offer prices:
                # no price can be determined, so skip the product.
                return

        # The GREENGUARD certification is attached to every item from this
        # store; this method does not check the page for it.
        certifications = [{
            "certification":
            "GREENGUARD Certified",
            "title": (
                "GREENGUARD Certified products contain materials and finishes that have been verified to have low chemical emissions. "
                " As a result, this product improves overall indoor air quality by reducing the presence of harmful "
                " pollutants and airborne chemicals."
            ),
            "url":
            "http://greenguard.org/en/CertificationPrograms/CertificationPrograms_indoorAirQuality.aspx"
        }]
        # Elasticsearch sorts products on a single number, so the first price
        # of the range is stored separately.
        lowest_price = final_price_span[0]

        load_item = productSpiderItem()

        load_item['sitename'] = 'Pottery Barn Kids'
        load_item['productname'] = productname
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = final_price_span
        load_item['certifications'] = certifications
        load_item['description'] = description
        load_item['lowestprice'] = lowest_price
        yield load_item

Пример #11

0

Показать файл

Файл: potterybarn.py Проект: r-wirt/eco-furniture-crawlers

    def parse_product(self, response):
        """Scrape one Pottery Barn product page into a ``productSpiderItem``.

        The product name is prefixed with "Outdoor: ". Prices come from the
        visible price spans when present, otherwise from the low/high offer
        prices in the page's JSON-LD script. Yields a single item when a title
        and a price can be determined; yields nothing otherwise.

        Args:
            response: Page response; only its ``xpath()`` method is used.
        """
        # Function-scope import keeps this method self-contained.
        import json

        title = response.xpath('//head/meta[@property="og:title"]/@content').extract_first()
        # extract_first() returns None when the title tag is missing; the
        # name cannot be built in that case, so skip the product.
        if title is None:
            return
        # The registered-trademark sign arrives as a numeric character
        # reference; drop it from the name.
        product_name = ("Outdoor: " + title).replace("&#0174;", "")
        product_url = response.xpath('//head/meta[@property="og:url"]/@content').extract_first()
        image = response.xpath('//head/meta[@property="og:image"]/@content').extract_first()
        description = response.xpath('//head/meta[@property="twitter:description"]/@content').extract_first()
        # One string per visible price; each may contain thousands separators.
        original_price_span = response.xpath('//section[@class="simple-subset"]//span[@class="price-amount"]/text()').extract()

        if original_price_span:
            if len(original_price_span) == 2:
                # Two entries: the first is not the current marked-down
                # price, so only the second is kept.
                current_prices = original_price_span[1:]
            elif len(original_price_span) == 4:
                # Four entries: an original price range followed by the
                # marked-down range; only the marked-down pair is kept.
                current_prices = original_price_span[2:]
            else:
                current_prices = original_price_span
            final_price_span = [float(price.replace(',', '')) for price in current_prices]
        else:
            # Products sold in several sizes show no single price; their low
            # and high prices are published in the JSON-LD script instead.
            pricing_json = response.xpath('//script[@type="application/ld+json"]/text()').extract_first()
            try:
                # The script body is JSON, so it is parsed with json.loads;
                # a Python-literal parser rejects true/false/null.
                offers = json.loads(pricing_json)['offers']
                final_price_span = [float(offers['lowPrice']), float(offers['highPrice'])]
            except (TypeError, ValueError, KeyError):
                # Missing script (None), malformed JSON, or no offer prices:
                # no price can be determined, so skip the product.
                return

        # The FSC certification is attached to every item from this store;
        # this method does not check the page for it.
        certifications = [{"certification": "Forest Stewardship (FSC) Certified",
                          "title": "FSC Certified products help reduce deforestation by ensuring products have been manufactured with recycled wood materials or have originated from sustainably managed forests.",
                          "url": "https://us.fsc.org/en-us/what-we-do/mission-and-vision"}]
        # Elasticsearch sorts products on a single number, so the first price
        # of the range is stored separately.
        lowest_price = final_price_span[0]

        load_item = productSpiderItem()

        load_item['sitename'] = 'Pottery Barn'
        load_item['productname'] = product_name
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = final_price_span
        load_item['certifications'] = certifications
        load_item['description'] = description
        load_item['lowestprice'] = lowest_price
        yield load_item

Пример #12

0

Показать файл

    def parse_product(self, response):
        """Scrape one Williams Sonoma product page and yield it if FSC certified.

        The page is kept only when a list item in the description accordion
        mentions "FSC". Name, URL and image are read from the Open Graph
        meta tags, and the description from the "description" meta tag.
        Prices come from the ``price-amount`` spans of the summary block;
        when none are present the low/high prices are read from the JSON-LD
        ``<script>`` block instead.

        Args:
            response: The Scrapy response for the product page.

        Yields:
            A single ``productSpiderItem`` with ``price`` as a list of floats
            and ``lowestprice`` as its first element. Nothing is yielded when
            the page has no FSC mention or no usable price.
        """
        # Function-scope imports keep this method independent of the
        # module-level import block.
        import json
        import logging

        #Check the products description section on the page to make sure it is FSC Certified
        fsc_mentions = response.xpath(
            '//div[@class="accordion-tab-copy"]//ul/li[text()[contains(.,"FSC")]]'
        ).extract()
        if not fsc_mentions:
            return

        product_name = response.xpath(
            '//meta[@property="og:title"]/@content').extract_first()
        product_url = response.xpath(
            '//meta[@property="og:url"]/@content').extract_first()
        image = response.xpath(
            '//meta[@property="og:image"]/@content').extract_first()
        description = response.xpath(
            '//meta[@name="description"]/@content').extract_first()
        price_spans = response.xpath(
            '//div[@class="pip-summary"]//span[@class="price-amount"]/text()'
        ).extract()

        def to_float(raw_price):
            #Prices may arrive as numbers or as strings containing commas
            return float(str(raw_price).replace(',', ''))

        def price_range_from_json_ld():
            #No price spans indicates that a price range is in place for the
            #product; the range is stored in the JSON-LD script element
            raw_json = response.xpath(
                '//script[@type="application/ld+json"]/text()'
            ).extract_first()
            if not raw_json:
                return []
            try:
                #json.loads (not ast.literal_eval) because the payload is
                #JSON and may contain true/false/null
                offers = json.loads(raw_json)['offers']
                return [to_float(offers['lowPrice']),
                        to_float(offers['highPrice'])]
            except (ValueError, KeyError, TypeError):
                #Malformed JSON, unexpected structure or non-numeric price
                return []

        def current_prices(spans):
            if not spans:
                return price_range_from_json_ld()
            #Slicing (rather than del) leaves the caller's list untouched
            if len(spans) == 2:
                #Whenever there are two numbers in the list the first one
                #is dropped because it is not the current marked down price
                spans = spans[1:]
            elif len(spans) == 4:
                #Removes original price range, keeps the marked down range
                spans = spans[2:]
            try:
                return [to_float(span) for span in spans]
            except ValueError:
                return []

        final_price = current_prices(price_spans)
        if not final_price:
            #Without a price the item cannot be sorted by lowestprice, so
            #the product is skipped (and reported) instead of crashing
            logging.getLogger(__name__).warning(
                "No usable price found for %s; skipping product", product_url)
            return

        #Products that reach this point mention FSC in their description,
        #so the FSC certification is attached
        certifications = [{
            "certification":
            "Forest Stewardship (FSC) Certified",
            "title":
            "FSC Certified products help reduce deforestation by ensuring products have been manufactured with recycled wood materials or have originated from sustainably managed forests. For more information click here",
            "url":
            "https://us.fsc.org/en-us/what-we-do/mission-and-vision"
        }]

        load_item = productSpiderItem()

        load_item['sitename'] = 'Williams Sonoma'
        load_item['productname'] = product_name
        load_item['producturl'] = product_url
        load_item['image'] = image
        load_item['price'] = final_price
        load_item['certifications'] = certifications
        load_item['description'] = description
        #The first price is the lowest one when the product has a price range
        load_item['lowestprice'] = final_price[0]
        #Returns object with each load_item included
        yield load_item

Python productSpiderItem примеры использования