# Imports reconstructed from the calls used below. NOTE: the
# supermarket_scraper module paths are assumptions based on the docstrings;
# adjust them to the actual project layout.
import datetime
import os
import time
import traceback

from scrapy import log, signals
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.xlib.pydispatch import dispatcher

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from pyvirtualdisplay import Display

# Project-local imports (module paths assumed):
from supermarket_scraper.items import ProductItem
from supermarket_scraper.search_settings import (SearchSettingsFactory,
                                                 TescoSearchSettings,
                                                 SainsburySearchSettings,
                                                 WaitroseSearchSettings)
from supermarket_scraper.search_tree import SearchTreeFactory
from supermarket_scraper.exceptions import (TescoSpiderError,
                                            SainsburySpiderError,
                                            WaitroseSpiderError)
# Selenium-driven variant of WaitroseSpider: uses a headless browser to work
# around the site's infinitely-scrolling product list.
class WaitroseSpider(CrawlSpider):
    """WaitroseSpider
    ===========
    Main spider for crawling the Waitrose website and searching for products.
    Settings for XPaths etc. are supplied from SearchSettingsFactory below.
    Search parameters for products are supplied from SearchTreeFactory.
    Spider yields a ProductItem for each product line.
    Pipelines exist to post-process data and write it to CSV or MongoDB.
    """
    name = 'waitrose'
    store = "WAITROSE"
    output_dir = None
    settings = WaitroseSearchSettings()

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide the name of the input CSV file at runtime, e.g.:
        scrapy crawl waitrose -a csv_file=waitrose_input.csv
        The input CSV file should be in the data directory.
        If no CSV file is specified, defaults to {name}_input.csv,
        e.g. waitrose_input.csv.
        Output files are written to supermarket_scraper/output/[spider name].
        The output directory MUST EXIST!
        """
        super(WaitroseSpider, self).__init__(*args, **kwargs)
        # ---------- Fix for infinite scrolling ----------
        # Run Firefox in a virtual display so the spider can scroll the page.
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.driver = webdriver.Firefox()
        self.driver.wait = WebDriverWait(self.driver, 5)
        self.driver.set_window_size(1920, 1080)
        time.sleep(3)
        self.tb = 'tb none'
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # ---------- End fix for infinite scrolling ----------
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"
        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as a tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not os.path.isdir(self.output_dir):
            raise WaitroseSpiderError("Invalid output directory: " +
                                      self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches.
        We don't need to nest searches here because the Waitrose website
        allows us to construct URLs directly, instead of having to navigate
        through several layers of menus."""
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            # Could use some other source for target URLs here, e.g. a database.
            raise WaitroseSpiderError("Cannot find input file " +
                                      self.csv_file)

    def start_requests(self):
        """Generates a crawler request for the base URL and parses results."""
        product1_url = "http://www.waitrose.com/shop/Browse/Groceries/"
        log.msg("Spider: start_requests() yielding URL: " + product1_url,
                level=log.DEBUG)
        yield Request(url=product1_url)

    def parse_start_url(self, response):
        """Default function to parse responses from the base URL:
        Waitrose serves products in a single infinitely-scrolling list with
        no 'Next page' link, so we drive a Selenium browser to click the
        'Load more' button until no more products appear, then extract the
        product items from the fully-loaded page."""
        search_list = self.get_searches()
        for s in search_list:
            metadata = s.get_meta_map()
            product_url = '/'.join([
                self.settings.base_url, s.store_sub1, s.store_sub2,
                s.store_sub3
            ]) + '/'
            self.driver.maximize_window()
            time.sleep(1)
            self.driver.get(product_url)
            time.sleep(2)
            log.msg("Spider: parse_start_url :: " + product_url,
                    level=log.DEBUG)
            sel = Selector(text=self.driver.page_source)
            while True:
                try:
                    next_element = self.driver.find_element_by_xpath(
                        self.settings.next_page_xpath)
                    debug_text_class = next_element.get_attribute('href')
                    self.driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight);")
                    try:
                        button = self.driver.wait.until(
                            EC.element_to_be_clickable(
                                (By.XPATH, self.settings.next_page_xpath)))
                        button.click()
                    except Exception:
                        # 'Load more' button no longer clickable: stop scrolling.
                        self.tb = traceback.format_exc()
                        break
                    time.sleep(2)
                except NoSuchElementException:
                    # No 'Load more' button left: end of infinite scrolling.
                    self.tb = traceback.format_exc()
                    break
                except Exception:
                    self.tb = traceback.format_exc()
                    break
            sel = Selector(text=self.driver.page_source)
            products = sel.xpath(self.settings.products_xpath)
            log.msg("Spider: parsing response for URL: " + response.url +
                    " for ONS item " + metadata['ons_item_name'],
                    level=log.DEBUG)
            product_counter = len(products)
            log.msg("Spider: parse_start_url :: total no. of products:: " +
                    str(product_counter), level=log.DEBUG)
            for product in products:
                # Create an item for each entry
                item = ProductItem()
                # UPPER-case product name for storage to make searching easier
                try:
                    item['product_name'] = (product.xpath(
                        self.settings.product_name_xpath).extract()[0]).upper()
                except Exception:
                    continue
                log.msg("Spider: Response for URL: " + response.url +
                        " found " + item['product_name'].encode('utf-8'),
                        level=log.DEBUG)
                try:
                    item['store'] = self.store
                    item['ons_item_no'] = metadata['ons_item_no']
                    item['ons_item_name'] = metadata['ons_item_name']
                    item['product_type'] = metadata['store_sub3']
                    item['search_string'] = metadata['search_terms']
                except Exception:
                    continue
                # Default matches to 1.0 and modify later
                try:
                    item['search_matches'] = 1.0
                    # Save the price string and convert it to a number later
                    item['item_price_str'] = product.xpath(
                        self.settings.raw_price_xpath).extract()[0].strip()
                    # Strip a leading "Now" marker from sale prices
                    if item['item_price_str'][0] == 'N':
                        item['item_price_str'] = item['item_price_str'][3:].strip()
                    else:
                        item['item_price_str'] = item['item_price_str'].strip()
                    # Try getting the volume and appending it to the product name
                    volume = product.xpath(
                        self.settings.volume_xpath).extract()
                    if volume:
                        item['product_name'] = (item['product_name'] + " " +
                                                volume[0].strip().upper())
                except Exception:
                    continue
                # Waitrose volume price is not always provided; if it is
                # missing, we try using volume and item price instead.
                try:
                    item['volume_price'] = ''
                    vol_price = product.xpath(
                        self.settings.vol_price_xpath).extract()
                    if vol_price:
                        # Allow for e.g. "1.25 per litre" instead of "1.25/litre"
                        item['volume_price'] = (vol_price[0].strip()).replace(
                            "per", "/")
                    else:
                        item['volume_price'] = (item['item_price_str'] + "/" +
                                                volume[0].strip())
                    # Add timestamp
                    item['timestamp'] = datetime.datetime.now()
                    # Get promotion text (if any) NOT YET IMPLEMENTED
                    item['promo'] = ''
                    if self.settings.promo_xpath:
                        promo = product.xpath(
                            self.settings.promo_xpath).extract()
                        if promo:
                            item['promo'] = promo[0]
                    # Get short-term offer (if any) NOT YET IMPLEMENTED
                    item['offer'] = ''
                    if self.settings.offer_xpath:
                        offer = product.xpath(
                            self.settings.offer_xpath).extract()
                        if offer:
                            item['offer'] = offer[0]
                except Exception:
                    continue
                # Pass the item back
                product_counter = product_counter - 1
                yield item

    def spider_closed(self, spider):
        self.display.stop()
        self.driver.quit()
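
# --- Illustrative helper (not part of the original spiders) ----------------
# The spiders above save item_price_str as raw text (e.g. "£1.50" or
# "Now £1.50") and leave numeric conversion to the pipelines. A minimal
# sketch of that later conversion step might look like this; the function
# name is our invention, and pence-only prices such as "85p" would need an
# extra rule that is out of scope here.
import re


def price_str_to_float(price_str):
    """Convert a raw price string such as '£1.50' or 'Now £1.50' to a float.

    Returns None if no numeric price can be found.
    """
    match = re.search(r'(\d+(?:\.\d+)?)', price_str)
    if match:
        return float(match.group(1))
    return None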
# Selenium-driven variant of SainsburySpider: uses a headless browser to
# click through the site's paged product lists.
class SainsburySpider(CrawlSpider):
    """SainsburySpider
    ===========
    Main spider for crawling the Sainsbury's store website and searching
    for products.
    Settings for XPaths etc. are supplied from SearchSettingsFactory below.
    Search parameters for products are supplied from SearchTreeFactory.
    Spider yields a ProductItem for each product line.
    Pipelines exist to post-process data and write it to CSV or MongoDB.
    """
    name = 'sainsbury'
    store = "SAINSBURY"
    settings = SainsburySearchSettings()
    output_dir = None

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide the name of the input CSV file at runtime, e.g.:
        scrapy crawl sainsbury -a csv_file=sainsbury_input.csv
        The input CSV file should be in the supermarket_scraper/input
        directory.
        If no CSV file is specified, defaults to {name}_input.csv,
        e.g. sainsbury_input.csv.
        Output files are written to supermarket_scraper/output/[spider name].
        The output directory MUST EXIST!
        """
        super(SainsburySpider, self).__init__(*args, **kwargs)
        # Selenium set-up: headless Firefox in a virtual display.
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.driver = webdriver.Firefox()
        self.driver.wait = WebDriverWait(self.driver, 5)
        self.driver.set_window_size(1920, 1080)
        time.sleep(20)
        self.tb = 'tb none'
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"
        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as a tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not os.path.isdir(self.output_dir):
            raise SainsburySpiderError("Invalid output directory: " +
                                       self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches.
        We don't need to nest searches here because the Sainsbury's website
        allows us to identify URLs directly, instead of having to navigate
        through several layers of menus."""
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            # Could use some other source for target URLs here, e.g. a database.
            raise SainsburySpiderError("Cannot find input file " +
                                       self.csv_file)

    def start_requests(self):
        """Generates a crawler request for the base URL and parses results."""
        sb_cookies = self.settings.cookies
        product_url = "http://www.sainsburys.co.uk"
        log.msg("Spider: start_requests() yielding URL: " + product_url,
                level=log.DEBUG)
        yield Request(url=product_url, cookies=sb_cookies,
                      callback=self.parse_base)

    def parse_base(self, response):
        """Parses responses from the base URL:
        Sainsbury's serves products in a paged list, so we drive a Selenium
        browser through each search URL, clicking the 'Next page' link until
        no more pages remain, and yield the product items from each page."""
        search_list = self.get_searches()
        for s in search_list:
            search_meta = s.get_meta_map()
            product_url = s.store_sub3
            self.driver.get(product_url)
            sel = Selector(text=self.driver.page_source)
            first_page_parse_finished = False
            log.msg("Spider: parse_base :: fetching URL: " + product_url,
                    level=log.DEBUG)
            while True:
                try:
                    if first_page_parse_finished:
                        # Find any "next" links for paging
                        next_element = self.driver.find_element_by_xpath(
                            self.settings.next_page_xpath)
                        debug_text_class = next_element.get_attribute('href')
                        button = self.driver.wait.until(
                            EC.element_to_be_clickable(
                                (By.XPATH, self.settings.next_page_xpath)))
                        button.click()
                        time.sleep(3)
                    first_page_parse_finished = True
                    sel = Selector(text=self.driver.page_source)
                    products = sel.xpath(self.settings.products_xpath)
                    for product in products:
                        # Create an item for each entry
                        item = ProductItem()
                        item['store'] = self.store
                        item['ons_item_no'] = search_meta['ons_item_no']
                        item['ons_item_name'] = search_meta['ons_item_name']
                        item['product_type'] = search_meta['store_sub3']
                        item['search_string'] = search_meta['search_terms']
                        # UPPER-case product name for storage to make
                        # searching easier
                        prodname = product.xpath(
                            self.settings.product_name_xpath).extract()
                        if len(prodname) > 0:
                            item['product_name'] = prodname[0].upper().strip()
                        # WARNING: The price format is much more complicated
                        # on Sainsbury's pages, so we have to do multiple
                        # layers of extraction here to get the prices while
                        # we still have access to the XPaths etc.
                        price_block = product.xpath(
                            self.settings.raw_price_xpath)
                        raw_price_block = price_block[0]
                        vol_price_block = price_block[1]
                        # Extract a raw price
                        ppu_price = raw_price_block.xpath('text()')[0]
                        ppu_unit = raw_price_block.xpath(
                            '*/span[@class="pricePerUnitUnit"]/text()')[0]
                        item['item_price_str'] = (
                            ppu_price.extract().strip() + '/' +
                            ppu_unit.extract().strip())
                        # Extract the components of the volume price,
                        # e.g. "1.50 per 100g".
                        # THIS WILL BREAK IF THE PRICE FORMAT ON THE PAGE
                        # CHANGES!
                        vol_abbr = vol_price_block.xpath('text()').extract()
                        vol_price = ''
                        if vol_abbr[0].strip():
                            vol_price = vol_abbr[0].strip()
                            if vol_abbr[1].strip():
                                vol_price = vol_price + ' / ' + vol_abbr[1]
                            else:
                                # Default standard quantity to 1
                                vol_price = vol_price + ' / 1 '
                        # Get the volume units as well.
                        # Exception handling added because the last two
                        # vol_units were not being collected; this inserts
                        # "NA" in that case and moves on to the next product.
                        try:
                            vol_unit = product.xpath(
                                self.settings.vol_unit)[2]
                            vol_price = vol_price + vol_unit.extract().strip()
                        except Exception:
                            vol_unit = "NA"
                            vol_price = vol_price + vol_unit
                        # Construct the volume price in a known format and
                        # save it to the item
                        item['volume_price'] = vol_price
                        # Add timestamp
                        item['timestamp'] = datetime.datetime.now()
                        # Ignore promos/offers
                        item['promo'] = product.xpath(
                            self.settings.promo_xpath).extract()
                        item['offer'] = product.xpath(
                            self.settings.offer_xpath).extract()
                        # Pass the item back
                        yield item
                except NoSuchElementException:
                    # No 'Next page' link left: end of paging.
                    break
                except Exception:
                    self.tb = traceback.format_exc()
                    log.msg("Spider: parse request: inside exception "
                            "handling: " + self.tb, level=log.DEBUG)
                    break

    def spider_closed(self, spider):
        self.display.stop()
        self.driver.quit()
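
# --- Illustrative helper (not part of the original spiders) ----------------
# Both Selenium-driven spiders above repeat the same "click next/load-more
# until it disappears" loop. A hedged sketch of that pattern as a reusable
# helper; it assumes driver.wait is the WebDriverWait the spiders attach in
# __init__, and the function name is our invention.
def click_through_pages(driver, next_xpath, pause=2.0):
    """Keep clicking the element at next_xpath until it is gone or stale."""
    while True:
        try:
            driver.find_element_by_xpath(next_xpath)  # raises when finished
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            button = driver.wait.until(
                EC.element_to_be_clickable((By.XPATH, next_xpath)))
            button.click()
            time.sleep(pause)
        except NoSuchElementException:
            break  # no pager element left: all pages loaded
        except Exception:
            break  # not clickable / timed out: treat as end of paging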
class TescoSpider(CrawlSpider):
    """TescoSpider
    ===========
    Main spider for crawling the Tesco store website and searching for
    products.
    Settings for XPaths etc. are supplied from SearchSettingsFactory below.
    Search parameters for products are supplied from SearchTreeFactory.
    Spider yields a ProductItem for each product line.
    Pipelines exist to post-process data and write it to CSV or MongoDB.
    """
    name = 'tesco'
    store = "TESCO"
    settings = TescoSearchSettings()
    output_dir = None

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide the name of the input CSV file at runtime, e.g.:
        scrapy crawl tesco -a csv_file=tesco_input.csv
        The input CSV file should be in the supermarket_scraper/input
        directory.
        If no CSV file is specified, defaults to {name}_input.csv,
        e.g. tesco_input.csv.
        Output files are written to supermarket_scraper/output/[spider name].
        The output directory MUST EXIST!
        """
        super(TescoSpider, self).__init__(*args, **kwargs)
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"
        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as a tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not os.path.isdir(self.output_dir):
            raise TescoSpiderError("Invalid output directory: " +
                                   self.output_dir)

    def get_searches(self):
        """Returns a tree of searches."""
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_search_tree(
                self.settings.base_url)
        else:
            # Could use some other source for target URLs here, e.g. a database.
            raise TescoSpiderError("Cannot find input file " + self.csv_file)

    def start_requests(self):
        """Generates a crawler request for the base URL and parses results."""
        yield Request(url=self.settings.base_url, callback=self.parse_base)

    def parse_base(self, response):
        """Parse responses from the base URL:
        Overrides the Scrapy parser to parse each crawled response.
        Extracts search details from the response.
        Looks for the next layer of search data (sub 1).
        Yields a new Request to fetch the required sub-set of data."""
        sel = Selector(response)
        # Get the list of searches as a NESTED TREE
        searches = self.get_searches()
        # Find the first layer of subordinate data (via nav links) and
        # process each navigation item to find the required sub-category
        sub_items = sel.xpath(self.settings.sub1_path)
        for item in sub_items:
            # Check each nav link for the required sub-category.
            # Text is returned as a list of strings, so join it into a
            # single string.
            link_text = ' '.join(item.xpath('text()').extract())
            # Check the search tree: children of the top node are sub1 entries
            for s in searches.children:
                if link_text == s.name:
                    search_meta = s.as_dict()
                    link_ref = item.xpath('@href').extract()[0]
                    url = link_ref
                    yield Request(url, meta=search_meta,
                                  callback=self.parse_sub1)

    def parse_sub1(self, response):
        """Parse responses from the SUB1 URL:
        Overrides the Scrapy parser to parse each crawled response.
        Extracts search details from the response.
        Looks for the next layer of search data (sub 2).
        Yields a new Request to fetch the required sub-set of data."""
        sel = Selector(response)
        # Find required subordinate data (nav links)
        sub_items = sel.xpath(self.settings.sub2_path)
        for item in sub_items:
            # Check each nav link for the required sub-category
            link_text = ' '.join(item.xpath('text()').extract())
            # Check the search tree: children of this node are sub2 entries
            for s in response.meta['children']:
                if link_text.encode('utf-16') == s['name'].encode('utf-16'):
                    search_meta = s
                    link_ref = item.xpath('@href').extract()[0]
                    url = link_ref
                    yield Request(url, meta=search_meta,
                                  callback=self.parse_sub2)

    def parse_sub2(self, response):
        """Parse responses from the SUB2 URL:
        Overrides the Scrapy parser to parse each crawled response.
        Extracts search details from the response.
        Looks for the next layer of search data (sub 3).
        Yields a new Request to fetch the required sub-set of data."""
        sel = Selector(response)
        # Find required subordinate data (nav links)
        sub_items = sel.xpath(self.settings.sub3_path)
        for item in sub_items:
            # Check each nav link for the required sub-category
            link_text = ' '.join(item.xpath('text()').extract())
            # Check the search tree: children of this node are sub3 entries
            for s in response.meta['children']:
                if link_text.encode('utf-16') == s['name'].encode('utf-16'):
                    search_meta = s
                    link_ref = item.xpath('@href').extract()[0]
                    url = link_ref
                    yield Request(url, meta=search_meta,
                                  callback=self.parse_sub3)

    def parse_sub3(self, response):
        """Parse responses from the SUB3 URL:
        Overrides the Scrapy parser to parse each crawled response.
        Extracts search details from the response.
        Searches for the required product within results for this
        sub-category.
        Yields a ProductItem for each product item extracted.
        Yields another request for any 'next' page links."""
        sel = Selector(response)
        # Find any "next" links for paging and yield a Request for each page
        next_page = sel.xpath(self.settings.next_page_xpath)
        for page in next_page:
            next_link_ref = page.xpath('@href').extract()[0]
            yield Request(next_link_ref, meta=response.meta,
                          callback=self.parse_sub3)
        # Find product lines
        products = sel.xpath(self.settings.products_xpath)
        # Get details of the current search (passed in via response meta data)
        metadata = response.meta['data']
        # Process each product line
        for product in products:
            # Create an item for each entry
            item = ProductItem()
            item['store'] = self.store
            item['ons_item_no'] = metadata['ons_item_no']
            item['ons_item_name'] = metadata['ons_item_name']
            item['product_type'] = metadata['store_sub3']
            item['search_string'] = metadata['search_terms']
            # UPPER-case product name for storage to make searching easier
            item['product_name'] = (product.xpath(
                self.settings.product_name_xpath).extract()[0]).upper()
            # Save the price string and convert it to a number later;
            # need to account for parsing errors
            try:
                item['item_price_str'] = product.xpath(
                    self.settings.raw_price_xpath).extract()[0]
            except Exception:
                continue
            # Extract raw price by weight or volume
            try:
                item['volume_price'] = product.xpath(
                    self.settings.vol_price_xpath).extract()[0]
            except Exception:
                continue
            # Add timestamp
            item['timestamp'] = datetime.datetime.now()
            # Get promotion text (if any)
            promo = product.xpath(self.settings.promo_xpath).extract()
            if promo:
                item['promo'] = promo[0]
            else:
                item['promo'] = ''
            # Get short-term offer (if any)
            offer = product.xpath(self.settings.offer_xpath).extract()
            if offer:
                item['offer'] = offer[0]
            else:
                item['offer'] = ''
            # Pass the item back
            yield item
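
# --- Illustrative sketch (not from the project source) ---------------------
# TescoSpider's parse_base/parse_sub1/parse_sub2 walk a search tree whose
# nodes expose .name, .children and .as_dict(), with as_dict() carrying the
# node's metadata and children forward in request meta. A minimal node shape
# consistent with that usage; the real SearchTreeFactory may differ.
class SearchNode(object):
    def __init__(self, name, data=None):
        self.name = name          # nav-link text to match on the page
        self.data = data or {}    # e.g. ons_item_no, search_terms, ...
        self.children = []        # SearchNode instances for the next level

    def as_dict(self):
        # parse_sub1/parse_sub2 read meta['children']; parse_sub3 reads
        # meta['data'], so both keys must survive the round trip.
        return {'name': self.name,
                'data': self.data,
                'children': [c.as_dict() for c in self.children]}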
# Non-Selenium variant of SainsburySpider: pages are fetched with plain
# Scrapy requests and 'next' links are followed as further requests.
class SainsburySpider(CrawlSpider):
    """SainsburySpider
    ===========
    Main spider for crawling the Sainsbury's store website and searching
    for products.
    Settings for XPaths etc. are supplied from SearchSettingsFactory below.
    Search parameters for products are supplied from SearchTreeFactory.
    Spider yields a ProductItem for each product line.
    Pipelines exist to post-process data and write it to CSV or MongoDB.
    """
    name = 'sainsbury'
    store = "SAINSBURY"
    settings = SainsburySearchSettings()
    output_dir = None

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide the name of the input CSV file at runtime, e.g.:
        scrapy crawl sainsbury -a csv_file=sainsbury_input.csv
        The input CSV file should be in the supermarket_scraper/input
        directory.
        If no CSV file is specified, defaults to {name}_input.csv,
        e.g. sainsbury_input.csv.
        Output files are written to supermarket_scraper/output/[spider name].
        The output directory MUST EXIST!
        """
        super(SainsburySpider, self).__init__(*args, **kwargs)
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"
        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as a tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not os.path.isdir(self.output_dir):
            raise SainsburySpiderError("Invalid output directory: " +
                                       self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches.
        We don't need to nest searches here because the Sainsbury's website
        allows us to identify URLs directly, instead of having to navigate
        through several layers of menus."""
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            # Could use some other source for target URLs here, e.g. a database.
            raise SainsburySpiderError("Cannot find input file " +
                                       self.csv_file)

    def start_requests(self):
        """Generates crawler requests for the search URLs and parses results."""
        search_list = self.get_searches()
        sb_cookies = self.settings.cookies
        # Build URLs based on base URL + sub-categories
        for s in search_list:
            search_meta = s.get_meta_map()
            search_meta['cookiejar'] = 1
            product_url = s.store_sub3
            log.msg("Spider: start_requests() yielding URL: " + product_url,
                    level=log.DEBUG)
            yield Request(url=product_url, cookies=sb_cookies,
                          meta=search_meta, callback=self.parse_base)

    def parse_base(self, response):
        """Parses responses from each search URL:
        Sainsbury's serves products in a paged list, so we follow any
        'Next page' links with further requests and extract the product
        items from each page."""
        # Get details of the current search (passed in via response meta data)
        metadata = response.meta
        # Find product lines
        sel = Selector(response)
        sb_cookies = self.settings.cookies
        # Find any "next" links for paging and yield a Request for each page
        next_page = sel.xpath(self.settings.next_page_xpath)
        for page in next_page:
            next_link_ref = page.xpath('@href').extract()[0]
            log.msg("Spider: found NEXT page link: " + next_link_ref,
                    level=log.DEBUG)
            yield Request(next_link_ref, cookies=sb_cookies,
                          meta=response.meta, callback=self.parse_base)
        # Process each product line
        log.msg("Spider: parsing response for URL: " + response.url +
                " for ONS item " + metadata['ons_item_name'],
                level=log.DEBUG)
        products = sel.xpath(self.settings.products_xpath)
        for product in products:
            # Create an item for each entry
            item = ProductItem()
            item['store'] = self.store
            item['ons_item_no'] = metadata['ons_item_no']
            item['ons_item_name'] = metadata['ons_item_name']
            item['product_type'] = metadata['store_sub3']
            item['search_string'] = metadata['search_terms']
            # UPPER-case product name for storage to make searching easier
            prodname = product.xpath(
                self.settings.product_name_xpath).extract()
            if len(prodname) > 0:
                item['product_name'] = prodname[0].upper().strip()
            # WARNING: The price format is much more complicated on
            # Sainsbury's pages, so we have to do multiple layers of
            # extraction here to get the prices while we still have access
            # to the XPaths etc.
            price_block = product.xpath(self.settings.raw_price_xpath)
            raw_price_block = price_block[0]
            vol_price_block = price_block[1]
            # Extract a raw price
            ppu_price = raw_price_block.xpath('text()')[0]
            ppu_unit = raw_price_block.xpath(
                '*/span[@class="pricePerUnitUnit"]/text()')[0]
            item['item_price_str'] = (ppu_price.extract().strip() + '/' +
                                      ppu_unit.extract().strip())
            # Extract the components of the volume price, e.g. "1.50 per 100g".
            # THIS WILL BREAK IF THE PRICE FORMAT ON THE PAGE CHANGES!
            vol_abbr = vol_price_block.xpath('text()').extract()
            vol_price = ''
            if vol_abbr[0].strip():
                vol_price = vol_abbr[0].strip()
                if vol_abbr[1].strip():
                    vol_price = vol_price + ' / ' + vol_abbr[1]
                else:
                    # Default standard quantity to 1
                    vol_price = vol_price + ' / 1 '
            # Get the volume units as well.
            # Exception handling added because the last two vol_units were
            # not being collected; this inserts "NA" in that case and moves
            # on to the next product.
            try:
                vol_unit = product.xpath(self.settings.vol_unit)[2]
                vol_price = vol_price + vol_unit.extract().strip()
            except Exception:
                vol_unit = "NA"
                vol_price = vol_price + vol_unit
            # Construct the volume price in a known format and save it
            item['volume_price'] = vol_price
            # Add timestamp
            item['timestamp'] = datetime.datetime.now()
            # Ignore promos/offers
            item['promo'] = product.xpath(
                self.settings.promo_xpath).extract()
            item['offer'] = product.xpath(
                self.settings.offer_xpath).extract()
            # Pass the item back
            yield item
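
# --- Illustrative helper (not part of the original spiders) ----------------
# The Sainsbury's spiders assemble volume_price as "<price> / <qty><unit>",
# falling back to "NA" for the unit when it cannot be scraped. A hedged
# sketch of how a pipeline might split that string back apart; the function
# name and return shape are our invention.
def split_volume_price(volume_price):
    """Split e.g. '1.50 / 100g' into ('1.50', '100g').

    The quantity/unit part may end in 'NA' when the spider could not
    scrape a unit.
    """
    parts = [p.strip() for p in volume_price.split('/', 1)]
    price = parts[0] if parts else ''
    per = parts[1] if len(parts) > 1 else 'NA'
    return price, per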
# Non-Selenium variant of WaitroseSpider: fetches each category URL with a
# plain Scrapy request and takes only the products served on the first page.
class WaitroseSpider(CrawlSpider):
    """WaitroseSpider
    ===========
    Main spider for crawling the Waitrose website and searching for products.
    Settings for XPaths etc. are supplied from SearchSettingsFactory below.
    Search parameters for products are supplied from SearchTreeFactory.
    Spider yields a ProductItem for each product line.
    Pipelines exist to post-process data and write it to CSV or MongoDB.
    """
    name = 'waitrose'
    store = "WAITROSE"
    output_dir = None
    settings = WaitroseSearchSettings()

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide the name of the input CSV file at runtime, e.g.:
        scrapy crawl waitrose -a csv_file=waitrose_input.csv
        The input CSV file should be in the data directory.
        If no CSV file is specified, defaults to {name}_input.csv,
        e.g. waitrose_input.csv.
        Output files are written to supermarket_scraper/output/[spider name].
        The output directory MUST EXIST!
        """
        super(WaitroseSpider, self).__init__(*args, **kwargs)
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"
        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as a tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not os.path.isdir(self.output_dir):
            raise WaitroseSpiderError("Invalid output directory: " +
                                      self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches.
        We don't need to nest searches here because the Waitrose website
        allows us to construct URLs directly, instead of having to navigate
        through several layers of menus."""
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            # Could use some other source for target URLs here, e.g. a database.
            raise WaitroseSpiderError("Cannot find input file " +
                                      self.csv_file)

    def start_requests(self):
        """Generates crawler requests for the base URL and parses results."""
        search_list = self.get_searches()
        # Build URLs based on base URL + sub-categories
        for s in search_list:
            search_meta = s.get_meta_map()
            product_url = '/'.join([
                self.settings.base_url, s.store_sub1, s.store_sub2,
                s.store_sub3
            ]) + '/'
            log.msg("Spider: start_requests() yielding URL: " + product_url,
                    level=log.DEBUG)
            yield Request(url=product_url, meta=search_meta)

    def parse_start_url(self, response):
        """Default function to parse responses from the base URL:
        Waitrose serves products in a single list, but we cannot scroll
        through them and there is no 'Next page' link, so we just extract
        the first set of up to 24 product items and yield them for
        processing."""
        # Get details of the current search (passed in via response meta data)
        metadata = response.meta
        # Find product lines
        sel = Selector(response)
        products = sel.xpath(self.settings.products_xpath)
        # Process each product line
        log.msg("Spider: parsing response for URL: " + response.url +
                " for ONS item " + metadata['ons_item_name'],
                level=log.DEBUG)
        for product in products:
            # Create an item for each entry
            item = ProductItem()
            # UPPER-case product name for storage to make searching easier
            try:
                item['product_name'] = (product.xpath(
                    self.settings.product_name_xpath).extract()[0]).upper()
            except Exception:
                continue
            log.msg("Spider: Response for URL: " + response.url +
                    " found " + item['product_name'].encode('utf-16'),
                    level=log.DEBUG)
            try:
                item['store'] = self.store
                item['ons_item_no'] = metadata['ons_item_no']
                item['ons_item_name'] = metadata['ons_item_name']
                item['product_type'] = metadata['store_sub3']
                item['search_string'] = metadata['search_terms']
            except Exception:
                continue
            # Default matches to 1.0 and modify later
            try:
                item['search_matches'] = 1.0
                # Save the price string and convert it to a number later
                item['item_price_str'] = product.xpath(
                    self.settings.raw_price_xpath).extract()[0].strip()
                # Strip a leading "Now" marker from sale prices
                if item['item_price_str'][0] == 'N':
                    item['item_price_str'] = item['item_price_str'][3:].strip()
                else:
                    item['item_price_str'] = item['item_price_str'].strip()
                # Try getting the volume and appending it to the product name
                volume = product.xpath(self.settings.volume_xpath).extract()
                if volume:
                    item['product_name'] = (item['product_name'] + " " +
                                            volume[0].strip().upper())
            except Exception:
                continue
            # Waitrose volume price is not always provided; if it is missing,
            # we try using volume and item price instead.
            try:
                item['volume_price'] = ''
                vol_price = product.xpath(
                    self.settings.vol_price_xpath).extract()
                if vol_price:
                    # Allow for e.g. "1.25 per litre" instead of "1.25/litre"
                    item['volume_price'] = (vol_price[0].strip()).replace(
                        "per", "/")
                else:
                    item['volume_price'] = (item['item_price_str'] + "/" +
                                            volume[0].strip())
                # Add timestamp
                item['timestamp'] = datetime.datetime.now()
                # Get promotion text (if any) NOT YET IMPLEMENTED
                item['promo'] = ''
                if self.settings.promo_xpath:
                    promo = product.xpath(
                        self.settings.promo_xpath).extract()
                    if promo:
                        item['promo'] = promo[0]
                # Get short-term offer (if any) NOT YET IMPLEMENTED
                item['offer'] = ''
                if self.settings.offer_xpath:
                    offer = product.xpath(
                        self.settings.offer_xpath).extract()
                    if offer:
                        item['offer'] = offer[0]
            except Exception:
                continue
            # Pass the item back
            yield item