def parse_product(self, response):
    """Yield one item for an FSC-certified One Kings Lane product page.

    Nothing is yielded when no paragraph on the page mentions "FSC", when
    the product image is absent from the fetched markup, or when no price
    element is found.

    Args:
        response: Response for one product page (queried via
            ``response.xpath``).

    Yields:
        productSpiderItem: The populated item for this product.
    """
    # Only pages with a paragraph mentioning FSC are treated as certified.
    if not response.xpath('//p[text()[contains(.,"FSC")]]').extract():
        return

    product_name = response.xpath(
        '//div[@class="ml-product-name"]/div/text()').extract_first()
    product_url = response.xpath(
        '//link[@rel="alternate"]/@href').extract_first()

    # Some product pages render their images dynamically, after the page has
    # been fetched; those products are skipped.
    image = response.xpath(
        '//div[@class="ml-product-alt-detail-image"]//img/@src'
    ).extract_first()
    if image is None:
        return

    description = response.xpath(
        '//meta[@name="description"]/@content').extract_first()

    # A page without a price element used to fail with AttributeError on
    # None; it still produces no item, but no longer raises.
    original_price = response.xpath(
        '//span[@class="ml-item-price"]/text()').extract_first()
    if original_price is None:
        return

    # The price is a string that may contain commas and '$'; strip both and
    # keep the float in a one-element list, which is the shape stored in
    # the 'price' field.
    final_price = [float(original_price.replace(',', '').replace('$', ''))]

    # Every item emitted by this callback is tagged as FSC certified.
    certifications = [{
        "certification": "Forest Stewardship (FSC) Certified",
        "title": (
            "FSC Certified products help reduce deforestation by "
            "ensuring products have been manufactured with recycled wood "
            "materials or have originated from sustainably managed forests."
        ),
        "url": "https://us.fsc.org/en-us/what-we-do/mission-and-vision",
    }]

    load_item = productSpiderItem()
    load_item['sitename'] = 'One Kings Lane'
    load_item['productname'] = product_name
    load_item['producturl'] = product_url
    load_item['image'] = image
    load_item['price'] = final_price
    load_item['certifications'] = certifications
    load_item['description'] = description
    load_item['lowestprice'] = final_price[0]

    yield load_item
def parse_product(self, response):
    """Yield one item for an in-stock France and Son product page.

    Nothing is yielded when the page carries a sold-out badge or when no
    price is found.

    Args:
        response: Response for one product page (queried via
            ``response.xpath``).

    Yields:
        productSpiderItem: The populated item for this product.
    """
    # Stop when the sold-out badge is present. The previous condition was
    # inverted: it returned when the badge was *absent*, so only sold-out
    # products were ever emitted.
    sold_out = response.xpath(
        '//span[@class="product--badge badge--soldout"]/text()').extract()
    if sold_out:
        return

    product_name = response.xpath(
        '//meta[@property="og:title"]/@content').extract_first()
    product_url = response.xpath(
        '//meta[@property="og:url"]/@content').extract_first()
    image = response.xpath(
        '//meta[@property="og:image"]/@content').extract_first()
    description = response.xpath(
        '//meta[@property="og:description"]/@content').extract_first()

    # A page without a price used to fail with AttributeError on None; it
    # still produces no item, but no longer raises.
    original_price = response.xpath(
        '//meta[@property="og:price:amount"]/@content').extract_first()
    if original_price is None:
        return

    # The price is a string that may contain commas; strip them and keep
    # the float in a one-element list, the shape stored in 'price'.
    final_price = [float(original_price.replace(',', ''))]

    # Every item emitted by this callback is tagged as FSC certified.
    certifications = [{
        "certification": "Forest Stewardship (FSC) Certified",
        "title": (
            "FSC Certified products help reduce deforestation by "
            "ensuring products have been manufactured with recycled wood "
            "materials or have originated from sustainably managed forests."
        ),
        "url": "https://us.fsc.org/en-us/what-we-do/mission-and-vision",
    }]

    load_item = productSpiderItem()
    load_item['sitename'] = 'France and Son'
    load_item['productname'] = product_name
    load_item['producturl'] = product_url
    load_item['image'] = image
    load_item['price'] = final_price
    load_item['certifications'] = certifications
    load_item['description'] = description
    load_item['lowestprice'] = final_price[0]

    yield load_item
def parse_product(self, response):
    """Build and yield the item for an FSC-certified Stem product page.

    Pages on which no multi-column row block mentions "FSC" produce no
    item.

    Args:
        response: Response for one product page (queried via
            ``response.xpath``).

    Yields:
        productSpiderItem: The populated item for this product.
    """
    fsc_mentions = response.xpath(
        '//div[@class="row multi-column-row"]/div[contains(.,"FSC")]'
    ).extract()
    if not fsc_mentions:
        return

    title = response.xpath(
        '//meta[@property="og:title"]/@content').extract_first()
    page_url = response.xpath(
        '//meta[@property="og:url"]/@content').extract_first()
    picture = response.xpath(
        '//meta[@property="og:image"]/@content').extract_first()
    summary = response.xpath(
        '//meta[@property="og:description"]/@content').extract_first()
    price_text = response.xpath(
        '//meta[@property="og:price:amount"]/@content').extract_first()

    # The price string may contain commas; drop them, convert to float and
    # wrap the value in a one-element list for the 'price' field.
    prices = [float(price_text.replace(',', ''))]

    # Every item emitted by this callback is tagged as FSC certified.
    fsc_certification = {
        "certification": "Forest Stewardship (FSC) Certified",
        "title": (
            "FSC Certified products help reduce deforestation by "
            "ensuring products have been manufactured with recycled wood "
            "materials or have originated from sustainably managed forests."
        ),
        "url": "https://us.fsc.org/en-us/what-we-do/mission-and-vision",
    }

    item = productSpiderItem()
    # Fields are assigned in a fixed order; there is only ever one price,
    # so it doubles as the lowest price.
    field_values = (
        ('sitename', 'Stem'),
        ('productname', title),
        ('producturl', page_url),
        ('image', picture),
        ('price', prices),
        ('certifications', [fsc_certification]),
        ('description', summary),
        ('lowestprice', prices[0]),
    )
    for field, value in field_values:
        item[field] = value

    yield item
def parse_product(self, response):
    """Yield one item for a 2Modern product page.

    Nothing is yielded when the page has no og:image or no og:price:amount
    meta tag; both cases previously raised instead of being skipped.

    Args:
        response: Response for one product page (queried via
            ``response.xpath``).

    Yields:
        productSpiderItem: The populated item for this product.
    """
    product_name = response.xpath(
        '//meta[@property="og:title"]/@content').extract_first()
    product_url = response.xpath(
        '//meta[@property="og:url"]/@content').extract_first()
    description = response.xpath(
        '//meta[@property="og:description"]/@content').extract_first()

    og_image = response.xpath(
        '//meta[@property="og:image"]/@content').extract_first()
    # One string per og:price:amount tag; each may contain commas.
    original_price_span = response.xpath(
        '//meta[@property="og:price:amount"]/@content').extract()

    # Without an image there is nothing to prefix, and without a price
    # there is no first element to use as the lowest price.
    if og_image is None or not original_price_span:
        return

    # The og:image value is prefixed with the scheme and its 'medium'
    # token swapped for 'large'.
    image = 'https:' + og_image.replace('medium', 'large')

    final_price_span = [
        float(price.replace(',', '')) for price in original_price_span
    ]

    # Every item emitted by this callback is tagged as FSC certified.
    certifications = [{
        "certification": "Forest Stewardship (FSC) Certified",
        "title": (
            "FSC Certified products help reduce deforestation by "
            "ensuring products have been manufactured with recycled wood "
            "materials or have originated from sustainably managed forests."
        ),
        "url": "https://us.fsc.org/en-us/what-we-do/mission-and-vision",
    }]

    # In order for elasticsearch to sort the products by price amount it
    # needs the first one in the array when the product has a price range.
    lowest_price = final_price_span[0]

    load_item = productSpiderItem()
    load_item['sitename'] = '2Modern'
    load_item['productname'] = product_name
    load_item['producturl'] = product_url
    load_item['image'] = image
    load_item['price'] = final_price_span
    load_item['certifications'] = certifications
    load_item['description'] = description
    load_item['lowestprice'] = lowest_price

    yield load_item
def parse_product(self, response):
    """Yield one item for a GREENGUARD Gold Romina Furniture product page.

    Nothing is yielded when no description bullet mentions
    "GREENGUARD Gold" or when no price is found.

    Args:
        response: Response for one product page (queried via
            ``response.xpath``).

    Yields:
        productSpiderItem: The populated item for this product.
    """
    # Skip products whose description bullets never mention GREENGUARD
    # Gold. This check used to live in a nested helper that was never
    # called, so it filtered nothing.
    description_bullets = response.xpath(
        '//div[@class="std"]//ul/li').extract()
    if not any("GREENGUARD Gold" in bullet for bullet in description_bullets):
        return

    productname = response.xpath(
        '//h1[@itemprop="name"]/text()').extract_first()
    product_url = response.xpath(
        '//meta[@itemprop="url"]/@content').extract_first()
    image = response.xpath(
        '//ul[@id="imageGallery"]/li//img/@src').extract_first()

    # Replace non-breaking spaces; a missing description stays None
    # instead of raising.
    description = response.xpath(
        '//div[@class="std"]/p/text()').extract_first()
    if description is not None:
        description = description.replace(u'\xa0', u' ')

    # The price for the product changes dynamically at the last minute,
    # so this value may be off.
    original_price = response.xpath(
        '//span[@class="amount"]/text()').extract_first()
    if original_price is None:
        return

    # The price is a string that may contain commas and '$'; strip both
    # and keep the float in a one-element list.
    final_price = [float(original_price.replace(',', '').replace('$', ''))]

    # The title is assembled from adjacent literals so that source
    # indentation cannot leak into the text.
    certifications = [{
        "certification": "GREENGUARD Certified",
        "title": (
            "GREENGUARD Certified products contain materials and finishes "
            "that have been verified to have low chemical emissions. "
            "As a result, this product improves overall indoor air quality "
            "by reducing the presence of harmful pollutants and airborne "
            "chemicals."
        ),
        "url": "http://greenguard.org/en/CertificationPrograms/CertificationPrograms_indoorAirQuality.aspx",
    }]

    load_item = productSpiderItem()
    load_item['sitename'] = 'Romina Furniture'
    load_item['productname'] = productname
    load_item['producturl'] = product_url
    load_item['image'] = image
    load_item['price'] = final_price
    load_item['certifications'] = certifications
    load_item['description'] = description
    # Store the number itself, not the one-element list.
    load_item['lowestprice'] = final_price[0]

    yield load_item
def parse_product(self, response):
    """Yield one item for a GREENGUARD Romina Furniture product page.

    The page markup is searched with regular expressions. Nothing is
    yielded when no GREENGUARD link title is found, or when the product
    name, image or price cannot be located.

    Args:
        response: Response for one product page; ``response.text`` and
            ``response.xpath`` are used.

    Yields:
        productSpiderItem: The populated item for this product. The
        'description' field is not set by this callback.
    """
    page = response.text
    flags = re.I | re.S

    # Stop when no link title on the page mentions GREENGUARD.
    greenguard_verified = re.search(
        r'<a\s*href="[^"]*?"\s*title="[^"]*?GREENGUARD[^"]*?"',
        page, flags=flags)
    if not greenguard_verified:
        print("No Greenguard Products Detected")
        return

    name_match = re.search(
        r'itemprop="name"\s*content="([^"]*?)">', page, flags=flags)
    image_match = re.search(
        r'data-src="([^"]*?)"[^>]*?id="ProductPhotoImg">',
        page, flags=flags)
    # The price for the product changes dynamically at the last minute,
    # so this value may be off.
    price_match = re.search(
        r'<meta property="og:price:amount"\s*content="([^"]*?)">',
        page, flags=flags)

    # Any failed match used to raise AttributeError on `.group`; the page
    # still produces no item, but no longer raises.
    if name_match is None or image_match is None or price_match is None:
        return

    productname = name_match.group(1)
    product_url = response.xpath(
        '//meta[@itemprop="url"]/@content').extract_first()
    image = 'https://rominafurniture.com' + image_match.group(1)

    # The price is a string that may contain commas and '$'; strip both
    # and keep the float in a one-element list.
    final_price = [
        float(price_match.group(1).replace(',', '').replace('$', ''))
    ]

    # The title is assembled from adjacent literals so that source
    # indentation cannot leak into the text.
    certifications = [{
        "certification": "GREENGUARD Certified",
        "title": (
            "GREENGUARD Certified products contain materials and finishes "
            "that have been verified to have low chemical emissions. "
            "As a result, this product improves overall indoor air quality "
            "by reducing the presence of harmful pollutants and airborne "
            "chemicals."
        ),
        "url": "http://greenguard.org/en/CertificationPrograms/CertificationPrograms_indoorAirQuality.aspx",
    }]

    load_item = productSpiderItem()
    load_item['sitename'] = 'Romina Furniture'
    load_item['productname'] = productname
    load_item['producturl'] = product_url
    load_item['image'] = image
    load_item['price'] = final_price
    load_item['certifications'] = certifications
    # Store the number itself, not the one-element list.
    load_item['lowestprice'] = final_price[0]

    yield load_item
def parse_product(self, response):
    """Build and yield the item for a Smart Furniture product page.

    Image and pricing are pulled from the raw markup with regular
    expressions; certifications are inferred from whether any text node
    on the page mentions "GREENGUARD" and/or "FSC".

    Args:
        response: Response for one product page; ``response.text`` and
            ``response.xpath`` are used.

    Yields:
        productSpiderItem: The populated item for this product.
    """
    search_flags = re.I | re.S

    title = response.xpath('//title/text()').extract_first()
    page_url = 'https://smartfurniture.com' + response.xpath(
        '//body/div/@data-url').extract_first()

    # First anchor that wraps a "square-image" div; group 1 is the href
    # without its leading "//".
    image_match = re.search(
        r'<a\s*href="//([^"]*?)">\s*<div\s*class="square-image',
        response.text,
        flags=search_flags)
    summary = response.xpath(
        '//meta[@name="description"]/@content').extract_first()

    # Group 1 is the sale price, group 2 the original price; both are kept
    # as the strings found in the markup.
    price_match = re.search(
        r'<div class="product-price-details\s*"[^>]*?'
        r'data-sale-price="([^"]*?)"\s*data-original-price="([^"]*?)"',
        response.text,
        flags=search_flags)
    sale_price = price_match.group(1)
    list_price = price_match.group(2)

    mentions_greenguard = bool(
        response.xpath('//*[text()[contains(.,"GREENGUARD")]]'))
    mentions_fsc = bool(response.xpath('//*[text()[contains(.,"FSC")]]'))

    # GREENGUARD is always listed before FSC when both are present.
    found_certifications = []
    if mentions_greenguard:
        found_certifications.append("GREENGUARD Certified")
    if mentions_fsc:
        found_certifications.append(
            "Forest Stewardship Council (FSC) Certified")

    item = productSpiderItem()
    item['sitename'] = 'Smart Furniture'
    item['productname'] = title
    item['producturl'] = page_url
    item['image'] = image_match.group(1)
    item['price'] = list_price
    item['certifications'] = found_certifications
    item['description'] = summary
    item['lowestprice'] = sale_price

    yield item
def parse_product(self, response):
    """Yield one item for a Smart Furniture product page.

    Certifications are inferred from whether any text node on the page
    mentions "GREENGUARD" and/or "FSC". Nothing is yielded when the page
    has no ``data-url`` attribute or fewer than nine links inside the main
    content block (the image is taken from the ninth).

    Args:
        response: Response for one product page (queried via
            ``response.xpath``).

    Yields:
        productSpiderItem: The populated item for this product.
    """
    base_url = 'https://smartfurniture.com'

    productname = response.xpath('//title/text()').extract_first()

    data_url = response.xpath('//body/div/@data-url').extract_first()
    if data_url is None:
        return
    product_url = base_url + data_url

    # The image link is the one at zero-based index 8 among the main
    # content hrefs. The hrefs must be extracted to strings first: the
    # previous code concatenated the domain with the selector list itself,
    # which raised TypeError.
    content_links = response.xpath(
        '//div[@class="mainContent"]//a/@href').extract()
    if len(content_links) <= 8:
        return
    image = base_url + content_links[8]

    description = response.xpath(
        '//meta[@name="description"]/@content').extract_first()

    # The same meta tag supplies both price fields, so query it once.
    price = response.xpath(
        '//meta[@itemprop="price"]/@content').extract_first()
    lowest_price = price

    # Truthiness instead of `[0]`: indexing an empty result raised
    # IndexError whenever a certification was not mentioned.
    has_greenguard = bool(
        response.xpath('//*[text()[contains(.,"GREENGUARD")]]'))
    has_fsc = bool(response.xpath('//*[text()[contains(.,"FSC")]]'))

    certifications = []
    if has_greenguard:
        certifications.append('GREENGUARD Certified')
    if has_fsc:
        certifications.append('Forest Stewardship Council (FSC) Certified')

    load_item = productSpiderItem()
    load_item['sitename'] = 'Smart Furniture'
    load_item['productname'] = productname
    load_item['producturl'] = product_url
    load_item['image'] = image
    load_item['price'] = price
    load_item['certifications'] = certifications
    load_item['description'] = description
    load_item['lowestprice'] = lowest_price

    yield load_item
def parse_product(self, response):
    """Yield one item for a GREENGUARD-certified Davinci Baby product page.

    Nothing is yielded when the description block has no list, when that
    list does not mention "GREENGUARD", or when no price is found.

    Args:
        response: Response for one product page (queried via
            ``response.xpath``).

    Yields:
        productSpiderItem: The populated item for this product.
    """
    # extract_first() returns None when the description has no <ul>; the
    # membership test on None used to raise TypeError.
    description_list = response.xpath(
        '//div[@itemprop="description"]//ul').extract_first()
    if description_list is None or 'GREENGUARD' not in description_list:
        return

    product_name = response.xpath(
        '//meta[@property="og:title"]/@content').extract_first()
    product_url = response.xpath(
        '//meta[@property="og:url"]/@content').extract_first()
    image = response.xpath(
        '//meta[@property="og:image"]/@content').extract_first()
    description = response.xpath(
        '//meta[@property="og:description"]/@content').extract_first()

    # A page without a price used to fail inside float(None); it still
    # produces no item, but no longer raises.
    original_price = response.xpath(
        '//meta[@property="og:price:amount"]/@content').extract_first()
    if original_price is None:
        return

    # The price arrives as a string; keep the float in a one-element list.
    final_price = [float(original_price)]

    # The title is assembled from adjacent literals so that source
    # indentation cannot leak into the text.
    certifications = [{
        "certification": "GREENGUARD Certified",
        "title": (
            "GREENGUARD Certified products contain materials and finishes "
            "that have been verified to have low chemical emissions. "
            "As a result, this product improves overall indoor air quality "
            "by reducing the presence of harmful pollutants and airborne "
            "chemicals."
        ),
        "url": "http://greenguard.org/en/CertificationPrograms/CertificationPrograms_indoorAirQuality.aspx",
    }]

    load_item = productSpiderItem()
    load_item['sitename'] = 'Davinci Baby'
    load_item['productname'] = product_name
    load_item['producturl'] = product_url
    load_item['image'] = image
    load_item['price'] = final_price
    load_item['certifications'] = certifications
    load_item['description'] = description
    load_item['lowestprice'] = final_price[0]

    yield load_item
def parse_product(self, response):
    """Yield one item for a Pottery Barn Kids product page.

    Prices come from the "simple-subset" section when present; otherwise
    the low/high prices are read from the page's JSON-LD script. Nothing
    is yielded when neither source is available.

    Args:
        response: Response for one product page (queried via
            ``response.xpath``).

    Yields:
        productSpiderItem: The populated item for this product.

    Raises:
        KeyError: If the JSON-LD data lacks 'offers', 'lowPrice' or
            'highPrice'.
    """
    # Imported here so the callback does not rely on a module-level import.
    import json

    productname = response.xpath(
        '//div[@class="pip-summary"]/h1/text()').extract_first()
    product_url = response.xpath(
        '//head/meta[@property="og:url"]/@content').extract_first()
    image = response.xpath(
        '//head/meta[@property="og:image"]/@content').extract_first()
    # Description will show unicode characters in scrapy shell but they
    # are parsed into utf-8 when sent via the ElasticSearch pipeline.
    description = response.xpath(
        '//head/meta[@property="twitter:description"]/@content'
    ).extract_first()

    # Price(s) as strings, possibly containing commas.
    listed_prices = response.xpath(
        '//section[@class="simple-subset"]//span[@class="price-amount"]/text()'
    ).extract()

    if listed_prices:
        # Four entries mean an original range followed by the marked-down
        # range; only the marked-down pair is kept.
        if len(listed_prices) == 4:
            listed_prices = listed_prices[2:]
        final_price_span = [
            float(price.replace(',', '')) for price in listed_prices
        ]
    else:
        # No listed price: some products list a price range for different
        # sizes, which is only available in the JSON-LD script.
        ld_json = response.xpath(
            '//script[@type="application/ld+json"]/text()'
        ).extract_first()
        if ld_json is None:
            return
        # JSON-LD is JSON, so parse it as such (literal_eval rejects
        # true/false/null). The old parser is kept as a fallback for any
        # payload that is not strict JSON.
        try:
            product_data = json.loads(ld_json)
        except ValueError:
            product_data = ast.literal_eval(ld_json)
        offers = product_data['offers']
        final_price_span = [
            float(offers['lowPrice']),
            float(offers['highPrice']),
        ]

    # Every item emitted by this callback is tagged as GREENGUARD
    # certified. The title is assembled from adjacent literals so that
    # source indentation cannot leak into the text.
    certifications = [{
        "certification": "GREENGUARD Certified",
        "title": (
            "GREENGUARD Certified products contain materials and finishes "
            "that have been verified to have low chemical emissions. "
            "As a result, this product improves overall indoor air quality "
            "by reducing the presence of harmful pollutants and airborne "
            "chemicals."
        ),
        "url": "http://greenguard.org/en/CertificationPrograms/CertificationPrograms_indoorAirQuality.aspx",
    }]

    # In order for elasticsearch to sort the products by price amount it
    # needs the first one in the array when the product has a price range.
    lowest_price = final_price_span[0]

    load_item = productSpiderItem()
    load_item['sitename'] = 'Pottery Barn Kids'
    load_item['productname'] = productname
    load_item['producturl'] = product_url
    load_item['image'] = image
    load_item['price'] = final_price_span
    load_item['certifications'] = certifications
    load_item['description'] = description
    load_item['lowestprice'] = lowest_price

    yield load_item
def parse_product(self, response):
    """Yield one item for a Pottery Barn product page.

    The product name is the og:title prefixed with "Outdoor: " and with
    any registered-trademark sign removed. Prices come from the
    "simple-subset" section when present; otherwise the low/high prices
    are read from the page's JSON-LD script. Nothing is yielded when the
    title or both price sources are missing.

    Args:
        response: Response for one product page (queried via
            ``response.xpath``).

    Yields:
        productSpiderItem: The populated item for this product.

    Raises:
        KeyError: If the JSON-LD data lacks 'offers', 'lowPrice' or
            'highPrice'.
    """
    # Imported here so the callback does not rely on a module-level import.
    import json

    # A missing og:title used to raise TypeError on the concatenation.
    title = response.xpath(
        '//head/meta[@property="og:title"]/@content').extract_first()
    if title is None:
        return
    # Remove the registered trademark sign from the name.
    product_name = ("Outdoor: " + title).replace("®", "")

    product_url = response.xpath(
        '//head/meta[@property="og:url"]/@content').extract_first()
    image = response.xpath(
        '//head/meta[@property="og:image"]/@content').extract_first()
    # Description will show unicode characters in scrapy shell but they
    # are parsed into utf-8 when sent via the ElasticSearch pipeline.
    description = response.xpath(
        '//head/meta[@property="twitter:description"]/@content'
    ).extract_first()

    # Price(s) as strings, possibly containing commas.
    listed_prices = response.xpath(
        '//section[@class="simple-subset"]//span[@class="price-amount"]/text()'
    ).extract()

    if listed_prices:
        if len(listed_prices) == 2:
            # Two entries: the first is not the current marked-down price.
            listed_prices = listed_prices[1:]
        elif len(listed_prices) == 4:
            # Four entries: original range followed by the marked-down
            # range; only the marked-down pair is kept.
            listed_prices = listed_prices[2:]
        final_price_span = [
            float(price.replace(',', '')) for price in listed_prices
        ]
    else:
        # No listed price indicates a price range, which is only
        # available in the JSON-LD script.
        ld_json = response.xpath(
            '//script[@type="application/ld+json"]/text()'
        ).extract_first()
        if ld_json is None:
            return
        # JSON-LD is JSON, so parse it as such (literal_eval rejects
        # true/false/null). The old parser is kept as a fallback for any
        # payload that is not strict JSON.
        try:
            product_data = json.loads(ld_json)
        except ValueError:
            product_data = ast.literal_eval(ld_json)
        offers = product_data['offers']
        final_price_span = [
            float(offers['lowPrice']),
            float(offers['highPrice']),
        ]

    # Every item emitted by this callback is tagged as FSC certified.
    certifications = [{
        "certification": "Forest Stewardship (FSC) Certified",
        "title": (
            "FSC Certified products help reduce deforestation by "
            "ensuring products have been manufactured with recycled wood "
            "materials or have originated from sustainably managed forests."
        ),
        "url": "https://us.fsc.org/en-us/what-we-do/mission-and-vision",
    }]

    # In order for elasticsearch to sort the products by price amount it
    # needs the first one in the array when the product has a price range.
    lowest_price = final_price_span[0]

    load_item = productSpiderItem()
    load_item['sitename'] = 'Pottery Barn'
    load_item['productname'] = product_name
    load_item['producturl'] = product_url
    load_item['image'] = image
    load_item['price'] = final_price_span
    load_item['certifications'] = certifications
    load_item['description'] = description
    load_item['lowestprice'] = lowest_price

    yield load_item
def parse_product(self, response):
    """Yield one item for an FSC-certified Williams Sonoma product page.

    Nothing is yielded when no accordion bullet mentions "FSC". Prices
    come from the "pip-summary" block when present; otherwise the low/high
    prices are read from the page's JSON-LD script, and nothing is yielded
    when that script is missing as well.

    Args:
        response: Response for one product page (queried via
            ``response.xpath``).

    Yields:
        productSpiderItem: The populated item for this product.

    Raises:
        KeyError: If the JSON-LD data lacks 'offers', 'lowPrice' or
            'highPrice'.
    """
    # Imported here so the callback does not rely on a module-level import.
    import json

    # Only pages with an accordion bullet mentioning FSC are treated as
    # certified.
    fsc_bullets = response.xpath(
        '//div[@class="accordion-tab-copy"]//ul/li[text()[contains(.,"FSC")]]'
    ).extract()
    if not fsc_bullets:
        return

    product_name = response.xpath(
        '//meta[@property="og:title"]/@content').extract_first()
    product_url = response.xpath(
        '//meta[@property="og:url"]/@content').extract_first()
    image = response.xpath(
        '//meta[@property="og:image"]/@content').extract_first()
    description = response.xpath(
        '//meta[@name="description"]/@content').extract_first()

    # Price(s) as strings, possibly containing commas.
    listed_prices = response.xpath(
        '//div[@class="pip-summary"]//span[@class="price-amount"]/text()'
    ).extract()

    if listed_prices:
        if len(listed_prices) == 2:
            # Two entries: the first is not the current marked-down price.
            listed_prices = listed_prices[1:]
        elif len(listed_prices) == 4:
            # Four entries: original range followed by the marked-down
            # range; only the marked-down pair is kept.
            listed_prices = listed_prices[2:]
        final_price = [
            float(price.replace(',', '')) for price in listed_prices
        ]
    else:
        # No listed price indicates a price range, which is only
        # available in the JSON-LD script.
        ld_json = response.xpath(
            '//script[@type="application/ld+json"]/text()'
        ).extract_first()
        if ld_json is None:
            return
        # JSON-LD is JSON, so parse it as such (literal_eval rejects
        # true/false/null). The old parser is kept as a fallback for any
        # payload that is not strict JSON.
        try:
            product_data = json.loads(ld_json)
        except ValueError:
            product_data = ast.literal_eval(ld_json)
        offers = product_data['offers']
        final_price = [
            float(offers['lowPrice']),
            float(offers['highPrice']),
        ]

    # Every item emitted by this callback is tagged as FSC certified.
    certifications = [{
        "certification": "Forest Stewardship (FSC) Certified",
        "title": (
            "FSC Certified products help reduce deforestation by "
            "ensuring products have been manufactured with recycled wood "
            "materials or have originated from sustainably managed forests. "
            "For more information click here"
        ),
        "url": "https://us.fsc.org/en-us/what-we-do/mission-and-vision",
    }]

    load_item = productSpiderItem()
    load_item['sitename'] = 'Williams Sonoma'
    load_item['productname'] = product_name
    load_item['producturl'] = product_url
    load_item['image'] = image
    load_item['price'] = final_price
    load_item['certifications'] = certifications
    load_item['description'] = description
    load_item['lowestprice'] = final_price[0]

    yield load_item