Example #1
    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
        
           scrapy crawl tesco -a csv_file=tesco_input.csv
           
           Input CSV file should be in supermarket_scraper/input directory. 
           If CSV file not specified, defaults to {name}_input.csv 
           e.g. tesco_input.csv.
           
           Output files are written to:
           
           supermarket_scraper/output/[spider name]
           
           Output directory MUST EXIST!
        """
        super(TescoSpider, self).__init__(*args, **kwargs)

        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"

        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not (os.path.isdir(self.output_dir)):
            raise TescoSpiderError("Invalid output directory: " +
                                   self.output_dir)
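The constructor only verifies that output/<spider name> exists and raises otherwise; it never creates the directory. A minimal sketch of preparing that directory before the crawl (assuming the crawl is launched from the project root, so the relative 'output' path resolves the same way as in __init__):

import errno
import os

def ensure_output_dir(spider_name):
    """Create output/<spider name> if it does not already exist."""
    output_dir = os.path.join('output', spider_name)
    try:
        os.makedirs(output_dir)  # Python 2 has no exist_ok keyword
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    return output_dir

# e.g. before running: scrapy crawl tesco -a csv_file=tesco_input.csv
ensure_output_dir('tesco')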
Example #2
    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
        
           scrapy crawl waitrose -a csv_file=waitrose_input.csv
           
           Input CSV file should be in data directory. 
           If CSV file not specified, defaults to {name}_input.csv 
           e.g. waitrose_input.csv.
           
           Output files are written to:
           
           supermarket_scraper/output/[spider name]
           
           Output directory MUST EXIST!
        """
        super(WaitroseSpider, self).__init__(*args, **kwargs)
        ########## Fix for infinite scrolling #############
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.driver = webdriver.Firefox()
        self.driver.wait = WebDriverWait(self.driver, 5)
        #self.driver.maximize_window()
        self.driver.set_window_size(1920, 1080)
        time.sleep(3)
        self.tb = 'tb none'
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        ########## Fix for infinite scrolling #############
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"

        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not (os.path.isdir(self.output_dir)):
            raise WaitroseSpiderError("Invalid output directory: " +
                                      self.output_dir)
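The Selenium block in this constructor presumably relies on pyvirtualdisplay for the headless display; its imports are not shown in the snippet. A standalone sketch of the same setup-and-teardown lifecycle, with the assumed imports made explicit (the teardown mirrors the spider_closed handler connected via dispatcher.connect):

from pyvirtualdisplay import Display          # assumed source of Display
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

display = Display(visible=0, size=(1920, 1080))  # virtual framebuffer instead of a real screen
display.start()

driver = webdriver.Firefox()
driver.wait = WebDriverWait(driver, 5)           # reusable 5-second explicit wait
driver.set_window_size(1920, 1080)

try:
    driver.get("http://www.waitrose.com/shop/Browse/Groceries/")
finally:
    # Mirrors spider_closed(): always release the browser and the display.
    driver.quit()
    display.stop()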
Example #3
class WaitroseSpider(CrawlSpider):
    """WaitroseSpider
       ===========
       Main spider for crawling Waitrose website and searching for products.
       Settings for XPaths etc are supplied from SearchSettingsFactory below.
       Search parameters for products are supplied from SearchTreeFactory.
       Spider yields ProductItem for each product line.
       Pipelines exist to post-process data and write it to CSV or MongoDB.
       """
    name = 'waitrose'
    store = "WAITROSE"
    output_dir = None
    settings = WaitroseSearchSettings()

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
        
           scrapy crawl waitrose -a csv_file=waitrose_input.csv
           
           Input CSV file should be in data directory. 
           If CSV file not specified, defaults to {name}_input.csv 
           e.g. waitrose_input.csv.
           
           Output files are written to:
           
           supermarket_scraper/output/[spider name]
           
           Output directory MUST EXIST!
        """
        super(WaitroseSpider, self).__init__(*args, **kwargs)
        ########## Fix for infinite scrolling #############
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.driver = webdriver.Firefox()
        self.driver.wait = WebDriverWait(self.driver, 5)
        #self.driver.maximize_window()
        self.driver.set_window_size(1920, 1080)
        time.sleep(3)
        self.tb = 'tb none'
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        ########## Fix for infinite scrolling #############
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"

        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not (os.path.isdir(self.output_dir)):
            raise WaitroseSpiderError("Invalid output directory: " +
                                      self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches. We don't need to nest searches here
           because Waitrose website allows us to construct URLs directly,
           instead of having to navigate through several layers of menus."""
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            #Use some other source for target URLs - database?
            raise WaitroseSpiderError("Cannot find input file " +
                                      self.csv_file)

    def start_requests(self):
        """Generates crawler requests for given base URL and parses results."""
        #search_list = self.get_searches()
        # Build URLs based on base URL + sub-categories
        #for s in search_list:
        #    search_meta = {}
        #    product_url = ''
        #    search_meta = s.get_meta_map()
        product1_url = "http://www.waitrose.com/shop/Browse/Groceries/"
        log.msg("Spider: start_requests() yielding URL: " + product1_url,
                level=log.DEBUG)
        yield Request(url=product1_url)

    def parse_start_url(self, response):
        """Default function to parse responses from base URL:
           Waitrose serves products in a single, infinitely-scrolling list, so
           we use Selenium to scroll to the bottom and click the 'Load more'
           control until no further products appear, then extract every
           product item from the fully loaded page and yield it for processing."""
        ########## Fix for infinite scrolling  #############
        search_list = self.get_searches()
        for s in search_list:
            search_meta = {}
            product_url = ''
            metadata = s.get_meta_map()
            product_url = '/'.join([
                self.settings.base_url, s.store_sub1, s.store_sub2,
                s.store_sub3
            ]) + '/'
            self.driver.maximize_window()
            time.sleep(1)
            self.driver.get(product_url)
            time.sleep(2)
            log.msg("Spider: parse_start_url :: " + product_url,
                    level=log.DEBUG)
            sel = Selector(text=self.driver.page_source)
            #i=0
            while True:
                try:
                    #i = i + 1
                    next_element = self.driver.find_element_by_xpath(
                        self.settings.next_page_xpath)
                    debug_text_class = next_element.get_attribute('href')
                    #log.msg("Spider: parse_start_url :: Inside while :: next element"+str(debug_text_class), level=log.DEBUG)
                    self.driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight);")
                    try:
                        button = self.driver.wait.until(
                            EC.element_to_be_clickable(
                                (By.XPATH, self.settings.next_page_xpath)))
                        button.click()
                    except:
                        self.tb = traceback.format_exc()
                        #print '------------------inside button click exception i count--------------' ,i
                        #print 'ERROR TRACE ::: ',self.tb
                        #log.msg("Spider: parse_start_url :: Inside Exception handling :: Load more button Click "+str(self.tb), level=log.DEBUG)
                        break

                    time.sleep(2)
                except NoSuchElementException:
                    self.tb = traceback.format_exc()
                    #print '------------------ End of infinite scrolling/NoSuchElementException :: i count --------------' ,i
                    #print 'ERROR TRACE ::: ',self.tb
                    #log.msg("Spider: parse_start_url :: End of infinite scrolling/NoSuchElementException "+str(self.tb), level=log.DEBUG)
                    break
                except:
                    self.tb = traceback.format_exc()
                    #print '------------------inside Exception handling:: i count --------------' ,i
                    #print 'ERROR TRACE ::: ',self.tb
                    #log.msg("Spider: parse_start_url :: inside infinite scrolling exception handling "+ str(self.tb), level=log.DEBUG)
                    break
            sel = Selector(text=self.driver.page_source)
            products = sel.xpath(self.settings.products_xpath)
            log.msg("Spider: parsing response for URL: " + response.url +
                    " for ONS item " + metadata['ons_item_name'],
                    level=log.DEBUG)
            product_counter = len(products)
            #print 'Spider: parsing response for URL: total no. of products:: ',product_counter
            log.msg("Spider: parse_start_url :: total no. of products:: " +
                    str(product_counter),
                    level=log.DEBUG)
            for product in products:
                # Create an item for each entry

                item = ProductItem()
                #UPPER case product name for storage to make searching easier
                try:
                    item['product_name'] = (product.xpath(
                        self.settings.product_name_xpath).extract()[0]
                                            ).upper()
                except:
                    continue

                log.msg("Spider: Response for URL: " + response.url +
                        " found " + item['product_name'].encode('utf-8'),
                        level=log.DEBUG)

                try:
                    item['store'] = self.store
                    item['ons_item_no'] = metadata['ons_item_no']
                    item['ons_item_name'] = metadata['ons_item_name']
                    item['product_type'] = metadata['store_sub3']
                    item['search_string'] = metadata['search_terms']

                except:
                    continue
                #Default matches to 1.0 and modify later

                try:
                    item['search_matches'] = 1.0
                    # Save price string and convert it to number later
                    item['item_price_str'] = product.xpath(
                        self.settings.raw_price_xpath).extract()[0].strip()
                    x = item['item_price_str'][0]
                    #print('test', x)
                    #pos = item['item_price_str'].index('\xc2')
                    #item['item_price_str'] = item['item_price_str'][:].strip()
                    #print(item['item_price_str'][4])
                    if item['item_price_str'][0] == 'N':
                        item['item_price_str'] = item['item_price_str'][
                            3:].strip()
                    else:
                        item['item_price_str'] = item[
                            'item_price_str'][:].strip()

                    # Try getting the volume and putting it on the end of the product name
                    volume = product.xpath(
                        self.settings.volume_xpath).extract()
                    if volume:
                        item['product_name'] = item[
                            'product_name'] + " " + volume[0].strip().upper()
                except:
                    continue

                # Waitrose volume price not always provided, so if it is not there,
                # we try using volume and item price instead.
                try:
                    item['volume_price'] = ''
                    vol_price = product.xpath(
                        self.settings.vol_price_xpath).extract()
                    if vol_price:
                        #Allow for e.g. "1.25 per litre" instead of "1.25/litre"
                        item['volume_price'] = (vol_price[0].strip()).replace(
                            "per", "/")
                    else:
                        item['volume_price'] = item[
                            'item_price_str'] + "/" + volume[0].strip()

                    # Add timestamp
                    item['timestamp'] = datetime.datetime.now()
                    # Get promotion text (if any) NOT YET IMPLEMENTED
                    item['promo'] = ''
                    if self.settings.promo_xpath:
                        promo = product.xpath(
                            self.settings.promo_xpath).extract()  #TODO
                        if promo:
                            item['promo'] = promo[0]
                        # Get short term offer (if any) NOT YET IMPLEMENTED
                        item['offer'] = ''
                        if self.settings.offer_xpath:
                            offer = product.xpath(
                                self.settings.offer_xpath).extract()  #TODO
                            if offer:
                                item['offer'] = offer[0]
                except:
                    continue
                #Pass the item back
                product_counter = product_counter - 1
                yield item

    def spider_closed(self, spider):
        self.display.stop()
        self.driver.quit()
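The while-loop inside parse_start_url is the heart of the infinite-scrolling workaround: scroll to the bottom, wait for the 'Load more' control to become clickable, click it, and stop once it can no longer be found or clicked. A condensed sketch of that loop as a standalone helper (next_page_xpath stands in for self.settings.next_page_xpath, and driver.wait is assumed to be the WebDriverWait attached in __init__):

import time

from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


def load_all_products(driver, next_page_xpath, pause=2):
    """Keep clicking the 'Load more' control until it disappears, then
    return the fully loaded page source."""
    while True:
        try:
            # Raises NoSuchElementException once every product is loaded.
            driver.find_element_by_xpath(next_page_xpath)
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            button = driver.wait.until(
                EC.element_to_be_clickable((By.XPATH, next_page_xpath)))
            button.click()
            time.sleep(pause)  # give the next batch of products time to render
        except (NoSuchElementException, TimeoutException):
            break
    return driver.page_source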
Example #4
class SainsburySpider(CrawlSpider):
    """SainsburySpider
       ===========
       Main spider for crawling the Sainsbury's store website and searching for products.
       Settings for XPaths etc are supplied from SearchSettingsFactory below.
       Search parameters for products are supplied from SearchTreeFactory.
       Spider yields a ProductItem for each product line.
       Pipelines exist to post-process data and write it to CSV or MongoDB.
       """
    name = 'sainsbury'
    store = "SAINSBURY"
    settings = SainsburySearchSettings()
    output_dir = None

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
        
           scrapy crawl sainsbury -a csv_file=sainsbury_input.csv
           
           Input CSV file should be in supermarket_scraper/input directory. 
           If CSV file not specified, defaults to {name}_input.csv 
           e.g. sainsbury_input.csv.
           
           Output files are written to:
           
           supermarket_scraper/output/[spider name]
           
           Output directory MUST EXIST!
        """
        super(SainsburySpider, self).__init__(*args, **kwargs)
        ## selenium
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.driver = webdriver.Firefox()
        self.driver.wait = WebDriverWait(self.driver, 5)
        #self.driver.maximize_window()
        self.driver.set_window_size(1920, 1080)
        time.sleep(20)
        self.tb = 'tb none'
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        #i=0
        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"

        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not (os.path.isdir(self.output_dir)):
            raise SainsburySpiderError("Invalid output directory: " +
                                       self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches. We don't need to nest searches here
           because Sainsbury website allows us to identify URLs directly,
           instead of having to navigate through several layers of menus."""
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            #Use some other source for target URLs - database?
            raise SainsburySpiderError("Cannot find input file " +
                                       self.csv_file)

    def start_requests(self):
        """Generates crawler requests for given base URL and parses results."""

        sb_cookies = self.settings.cookies
        product_url = "http://www.sainsburys.co.uk"
        log.msg("Spider: start_requests() yielding URL: " + product_url,
                level=log.DEBUG)
        yield Request(url=product_url,
                      cookies=sb_cookies,
                      callback=self.parse_base)

    def parse_base(self, response):
        """Default function to parse responses from base URL:
           Sainsbury's serves products in a paged list, so we use Selenium to
           click the 'Next page' link repeatedly, extracting the product items
           from each page and yielding them for processing."""
        search_list = self.get_searches()
        for s in search_list:
            search_meta = {}
            product_url = ''
            search_meta = s.get_meta_map()
            product_url = s.store_sub3

            self.driver.get(product_url)
            sel = Selector(text=self.driver.page_source)
            first_page_parse_finished = None
            log.msg("Spider: start_requests() yielding URL:" + product_url,
                    level=log.DEBUG)
            while True:
                try:

                    if first_page_parse_finished:
                        #Find any "next" links for paging
                        next_element = self.driver.find_element_by_xpath(
                            self.settings.next_page_xpath)
                        debug_text_class = next_element.get_attribute('href')
                        button = self.driver.wait.until(
                            EC.element_to_be_clickable(
                                (By.XPATH, self.settings.next_page_xpath)))
                        button.click()
                        time.sleep(3)

                    first_page_parse_finished = True
                    sel = Selector(text=self.driver.page_source)
                    products = sel.xpath(self.settings.products_xpath)
                    for product in products:
                        # Create an item for each entry
                        item = ProductItem()
                        item['store'] = self.store
                        #print('store field of item object', item['store'])
                        item['ons_item_no'] = search_meta['ons_item_no']
                        item['ons_item_name'] = search_meta['ons_item_name']
                        item['product_type'] = search_meta['store_sub3']
                        item['search_string'] = search_meta['search_terms']

                        #Default matches to 1.0 and modify later
                        #item['search_matches'] = 1.0
                        #UPPER case product name for storage to make searching easier
                        prodname = product.xpath(
                            self.settings.product_name_xpath).extract()
                        if len(prodname) > 0:
                            item['product_name'] = prodname[0].upper().strip()
                            #print 'SPIDER :: sainsbury :: product_name',format(item['product_name'].encode('utf-8'))
                            # WARNING:  Prices format is much more complicated on Sainsburys
                            # pages, so we have to do multiple layers of extraction here to
                            # get the prices while we still have access to the XPaths etc.

                            price_block = product.xpath(
                                self.settings.raw_price_xpath)
                            raw_price_block = price_block[0]
                            vol_price_block = price_block[1]
                            #price_block[0]
                            #price_block[1]
                            #print('individual item prices ', raw_price_block)
                            #print('individual volume item prices ', vol_price_block)
                            #Extract a raw price
                            ppu_price = raw_price_block.xpath('text()')[0]
                            ppu_unit = raw_price_block.xpath(
                                '*/span[@class="pricePerUnitUnit"]/text()')[0]
                            item['item_price_str'] = ppu_price.extract().strip(
                            ) + '/' + ppu_unit.extract().strip()
                            #print('individual item prices processed', item['item_price_str'])
                            #Extract the components of the volume price e.g. 1.50 per 100g
                            #THIS WILL BREAK IF PRICE FORMAT ON PAGE CHANGES!
                            vol_abbr = vol_price_block.xpath(
                                'text()').extract()
                            #print('volume_unit_raw', vol_abbr )
                            if vol_abbr[0].strip():
                                vol_price = vol_abbr[0].strip()
                            if vol_abbr[1].strip():
                                vol_price = vol_price + ' / ' + vol_abbr[1]
                            else:
                                #default std quantity to 1
                                vol_price = vol_price + ' / 1 '
                            #Get the volume units as well

                            #exception added as the last two unit_vol's were not collecting, this adds an NA in when this is the case and parses to the next product
                            try:
                                vol_unit = product.xpath(
                                    self.settings.vol_unit)[2]
                                vol_price = vol_price + vol_unit.extract(
                                ).strip()
                            except:
                                #default std quantity to 1
                                vol_unit = "NA"
                                vol_price = vol_price + vol_unit
                            #Get the volume units as well
                            #print('vol_unit', vol_unit)
                            #print('vol _nunit', vol_unit)
                            #vol_price_block.xpath("*/span[@class='pricePerMeasureMeasure']/text()")
                            #Construct the vol price in known format and save it to the item
                            item['volume_price'] = vol_price
                            #print('vol _nunit',  item['volume_price'])
                            # Add timestamp
                            item['timestamp'] = datetime.datetime.now()

                            #Ignore promos/offers
                            item['promo'] = product.xpath(
                                self.settings.promo_xpath).extract()
                            item['offer'] = product.xpath(
                                self.settings.offer_xpath).extract()

                            #Pass the item back
                            yield item
                except NoSuchElementException:
                    #print 'Inside NoSuchElementException handling::: '
                    break
                except:
                    self.tb = traceback.format_exc()
                    log.msg(
                        "Spider: parse request :Inside Exception handling:::" +
                        self.tb,
                        level=log.DEBUG)
                    #print 'Inside Exception handling::: ',self.tb
                    break

    def spider_closed(self, spider):
        #print "--- %s seconds ---" % (time.time() - start_time))
        self.display.stop()
        self.driver.quit()
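The spiders repeatedly note "Save price string and convert it to number later", but the conversion step belongs to a pipeline that is not shown in these examples. A hypothetical sketch of such a conversion, assuming scraped price strings of the form '£1.75/ea' or '1.50 / 100g':

import re

_PRICE_RE = re.compile(r'(\d+(?:\.\d+)?)')

def price_to_float(price_str):
    """Return the first decimal number found in a scraped price string,
    e.g. u'\xa31.75/ea' -> 1.75, or None if no number is present."""
    match = _PRICE_RE.search(price_str)
    return float(match.group(1)) if match else None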
Example #5
class TescoSpider(CrawlSpider):
    """TescoSpider
       ===========
       Main spider for crawling the Tesco store website and searching for products.
       Settings for XPaths etc are supplied from SearchSettingsFactory below.
       Search parameters for products are supplied from SearchTreeFactory.
       Spider yields a ProductItem for each product line.
       Pipelines exist to post-process data and write it to CSV or MongoDB.
       """
    name = 'tesco'
    store = "TESCO"
    settings = TescoSearchSettings()
    output_dir = None

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
        
           scrapy crawl tesco -a csv_file=tesco_input.csv
           
           Input CSV file should be in supermarket_scraper/input directory. 
           If CSV file not specified, defaults to {name}_input.csv 
           e.g. tesco_input.csv.
           
           Output files are written to:
           
           supermarket_scraper/output/[spider name]
           
           Output directory MUST EXIST!
        """
        super(TescoSpider, self).__init__(*args, **kwargs)

        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"

        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not (os.path.isdir(self.output_dir)):
            raise TescoSpiderError("Invalid output directory: " +
                                   self.output_dir)

    def get_searches(self):
        """Returns a tree of searches."""
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_search_tree(
                self.settings.base_url)
        else:
            #Use some other source for target URLs - database?
            raise TescoSpiderError("Cannot find input file " + self.csv_file)

    def start_requests(self):
        """Generates crawler request for given base URL and parse results."""
        yield Request(url=self.settings.base_url, callback=self.parse_base)

    def parse_base(self, response):
        """Parse responses from base URL:
           Overrides Scrapy parser to parse each crawled response.
           Extracts search details from response.
           Looks for next layer of search data (sub 1).
           Yield new Request to fetch required sub-set of data."""
        sel = Selector(response)
        #Get list of searches as a NESTED TREE
        searches = self.get_searches()
        #Find first layer of subordinate data (via nav links)
        #Process each navigation item to find required sub-category
        sub_items = sel.xpath(self.settings.sub1_path)
        for item in sub_items:
            # Check each nav link for the required sub-category
            # Text is returned as a list of strings so join it into a single string
            link_text = ' '.join(item.xpath('text()').extract())
            # Check search tree i.e. children of top node will be sub1 entries
            for s in searches.children:
                if (link_text == s.name):
                    search_meta = s.as_dict()
                    link_ref = item.xpath('@href').extract()[0]
                    url = link_ref
                    #print("parse_base: Text matches so use URL:",url)
                    yield Request(url,
                                  meta=search_meta,
                                  callback=self.parse_sub1)

    def parse_sub1(self, response):
        """Parse responses from SUB1 URL:
           Overrides Scrapy parser to parse each crawled response.
           Extracts search details from response.
           Looks for next layer of search data (sub 2).
           Yield new Request to fetch required sub-set of data."""
        sel = Selector(response)
        #Find required subordinate data (nav links)
        sub_items = sel.xpath(self.settings.sub2_path)
        for item in sub_items:
            #Check each nav link for the required sub-category
            link_text = ' '.join(item.xpath('text()').extract())
            # Check search tree i.e. children of this node will be sub2 entries
            for s in response.meta['children']:
                #print("Sub 2: Checking link text:", link_text, "against", s['name'])
                if (link_text.encode('utf-16') == s['name'].encode('utf-16')):
                    search_meta = s
                    link_ref = item.xpath('@href').extract()[0]
                    url = link_ref
                    #print "parse_sub1: Found nav link link: ", url
                    yield Request(url,
                                  meta=search_meta,
                                  callback=self.parse_sub2)

    def parse_sub2(self, response):
        """Parse responses from SUB2 URL:
           Overrides Scrapy parser to parse each crawled response.
           Extracts search details from response.
           Looks for next layer of search data (sub 3).
           Yield new Request to fetch required sub-set of data."""
        sel = Selector(response)
        #Find required subordinate data (nav links)
        sub_items = sel.xpath(self.settings.sub3_path)
        for item in sub_items:
            #Check each nav link for the required sub-category
            link_text = ' '.join(item.xpath('text()').extract())
            # Check search tree i.e. children of this node will be sub3 entries
            for s in response.meta['children']:
                #print("Sub 3: Checking link text:", link_text, "against", s['name'])
                if (link_text.encode('utf-16') == s['name'].encode('utf-16')):
                    search_meta = s
                    link_ref = item.xpath('@href').extract()[0]
                    url = link_ref
                    #print "parse_sub2: Found nav link link: ", url
                    yield Request(url,
                                  meta=search_meta,
                                  callback=self.parse_sub3)

    def parse_sub3(self, response):
        """Parse responses from SUB3 URL:
           Overrides Scrapy parser to parse each crawled response.
           Extracts search details from response.
           Searches for required product within results for this sub-category.
           Yield a ProductItem for each product item extracted.
           Yield another request for any "next" page links."""
        sel = Selector(response)

        #Find any "next" links for paging and yield Request to next page

        next_page = sel.xpath(self.settings.next_page_xpath)
        for page in next_page:
            #Check each nav link for the required sub-category
            next_link_ref = page.xpath('@href').extract()[0]
            #print "Found nav link link: ", url
            yield Request(next_link_ref,
                          meta=response.meta,
                          callback=self.parse_sub3)

        #Finds product lines
        products = sel.xpath(self.settings.products_xpath)
        #Process each product line
        # Get details of current search (passed in via response meta data)
        metadata = response.meta['data']
        for product in products:
            #print('**in the item loop**')
            #print(product.xpath(self.settings.raw_price_xpath).extract()[0])
            # Create an item for each entry
            item = ProductItem()
            item['store'] = self.store
            item['ons_item_no'] = metadata['ons_item_no']
            item['ons_item_name'] = metadata['ons_item_name']
            item['product_type'] = metadata['store_sub3']
            item['search_string'] = metadata['search_terms']
            #Default matches to 1.0 and modify later
            #item['search_matches'] = 1.0
            #UPPER case product name for storage to make searching easier
            item['product_name'] = (product.xpath(
                self.settings.product_name_xpath).extract()[0]).upper()
            # Save price string and convert it to number later
            # need to account for parsing error
            try:
                item['item_price_str'] = product.xpath(
                    self.settings.raw_price_xpath).extract()[0]
            except:
                continue
            # Extract raw price by weight or volume
            try:
                item['volume_price'] = product.xpath(
                    self.settings.vol_price_xpath).extract()[0]
            except:
                continue
            # Add timestamp
            item['timestamp'] = datetime.datetime.now()
            # Get promotion text (if any)
            promo = product.xpath(self.settings.promo_xpath).extract()
            if promo:
                item['promo'] = promo[0]
            else:
                item['promo'] = ''
            # Get short term offer (if any)
            offer = product.xpath(self.settings.offer_xpath).extract()
            if offer:
                item['offer'] = offer[0]
            else:
                item['offer'] = ''
            #Pass the item back
            yield item
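TescoSpider drills down through three navigation levels, and the only state carried from one callback to the next is the search-tree node passed in Request.meta and read back from response.meta. A minimal two-callback sketch of that hand-off, using the current scrapy.Spider/scrapy.Request names (the URLs and meta values are invented placeholders):

import scrapy


class MetaDemoSpider(scrapy.Spider):
    name = 'meta_demo'
    start_urls = ['http://example.com/']  # placeholder

    def parse(self, response):
        # Attach the current search-tree node to the request; Scrapy copies
        # this dict onto the response handed to the callback.
        search_meta = {'data': {'ons_item_no': 1, 'ons_item_name': 'EXAMPLE'},
                       'children': []}
        yield scrapy.Request('http://example.com/sub-category',
                             meta=search_meta,
                             callback=self.parse_sub)

    def parse_sub(self, response):
        # The same dict comes back here, exactly as parse_sub3 reads
        # response.meta['data'] above.
        metadata = response.meta['data']
        yield {'ons_item_no': metadata['ons_item_no']}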
Example #6
class SainsburySpider(CrawlSpider):
    """SainsburySpider
       ===========
       Main spider for crawling the Sainsbury's store website and searching for products.
       Settings for XPaths etc are supplied from SearchSettingsFactory below.
       Search parameters for products are supplied from SearchTreeFactory.
       Spider yields a ProductItem for each product line.
       Pipelines exist to post-process data and write it to CSV or MongoDB.
       """
    name = 'sainsbury'
    store = "SAINSBURY"
    settings = SainsburySearchSettings()
    output_dir = None

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
        
           scrapy crawl sainsbury -a csv_file=sainsbury_input.csv
           
           Input CSV file should be in supermarket_scraper/input directory. 
           If CSV file not specified, defaults to {name}_input.csv 
           e.g. sainsbury_input.csv.
           
           Output files are written to:
           
           supermarket_scraper/output/[spider name]
           
           Output directory MUST EXIST!
        """
        super(SainsburySpider, self).__init__(*args, **kwargs)

        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"

        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not (os.path.isdir(self.output_dir)):
            raise SainsburySpiderError("Invalid output directory: " +
                                       self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches. We don't need to nest searches here
           because Sainsbury website allows us to identify URLs directly,
           instead of having to navigate through several layers of menus."""
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            #Use some other source for target URLs - database?
            raise SainsburySpiderError("Cannot find input file " +
                                       self.csv_file)

    def start_requests(self):
        """Generates crawler requests for given base URL and parses results."""
        search_list = self.get_searches()
        a = str(search_list)
        #print('gets urls and parses the response objects to form a list: ', a)
        sb_cookies = self.settings.cookies
        # Build URLs based on base URL + sub-categories
        for s in search_list:
            search_meta = {}
            product_url = ''
            search_meta = s.get_meta_map()
            search_meta['cookiejar'] = 1
            product_url = s.store_sub3
            # print urls
            #print('product:' ,product_url)
            log.msg("Spider: start_requests() yielding URL: " + product_url,
                    level=log.DEBUG)
            yield Request(url=product_url,
                          cookies=sb_cookies,
                          meta=search_meta,
                          callback=self.parse_base)

    def parse_base(self, response):
        """Default function to parse responses from base URL:
           Sainsbury's serves products in a paged list, so we follow any
           'Next page' links with further Requests, extracting the product
           items from each page and yielding them for processing."""

        # Get details of current search (passed in via response meta data)
        metadata = response.meta
        #Find product lines
        sel = Selector(response)
        # rb test, only collecting three item (object) responses
        #print('test reponse objects from spider: ',sel)
        sb_cookies = self.settings.cookies
        #Find any "next" links for paging and yield Request to next page
        next_page = sel.xpath(self.settings.next_page_xpath)
        for page in next_page:
            #Check each nav link for the required sub-category
            next_link_ref = page.xpath('@href').extract()[0]
            log.msg("Spider: found NEXT page link: " + next_link_ref,
                    level=log.DEBUG)
            yield Request(next_link_ref,
                          cookies=sb_cookies,
                          meta=response.meta,
                          callback=self.parse_base)

        #Process each product line
        log.msg("Spider: parsing response for URL: " + response.url +
                " for ONS item " + metadata['ons_item_name'],
                level=log.DEBUG)
        products = sel.xpath(self.settings.products_xpath)

        for product in products:
            # Create an item for each entry
            item = ProductItem()
            item['store'] = self.store
            #print('store field of item object', item['store'])
            item['ons_item_no'] = metadata['ons_item_no']
            item['ons_item_name'] = metadata['ons_item_name']
            item['product_type'] = metadata['store_sub3']
            item['search_string'] = metadata['search_terms']

            #Default matches to 1.0 and modify later
            #item['search_matches'] = 1.0
            #UPPER case product name for storage to make searching easier
            prodname = product.xpath(
                self.settings.product_name_xpath).extract()
            if len(prodname) > 0:
                item['product_name'] = prodname[0].upper().strip()
                #print('individual item product names: ', item['product_name'])
                # WARNING:  Prices format is much more complicated on Sainsburys
                # pages, so we have to do multiple layers of extraction here to
                # get the prices while we still have access to the XPaths etc.

                price_block = product.xpath(self.settings.raw_price_xpath)
                raw_price_block = price_block[0]
                vol_price_block = price_block[1]
                #price_block[0]
                #price_block[1]
                #print('individual item prices ', raw_price_block)
                #print('individual volume item prices ', vol_price_block)
                #Extract a raw price
                ppu_price = raw_price_block.xpath('text()')[0]
                ppu_unit = raw_price_block.xpath(
                    '*/span[@class="pricePerUnitUnit"]/text()')[0]
                item['item_price_str'] = ppu_price.extract().strip(
                ) + '/' + ppu_unit.extract().strip()
                #print('individual item prices processed', item['item_price_str'])
                #Extract the components of the volume price e.g. 1.50 per 100g
                #THIS WILL BREAK IF PRICE FORMAT ON PAGE CHANGES!
                vol_abbr = vol_price_block.xpath('text()').extract()
                #print('volume_unit_raw', vol_abbr )
                if vol_abbr[0].strip():
                    vol_price = vol_abbr[0].strip()
                if vol_abbr[1].strip():
                    vol_price = vol_price + ' / ' + vol_abbr[1]
                else:
                    #default std quantity to 1
                    vol_price = vol_price + ' / 1 '
                #Get the volume units as well

                #exception added as the last two unit_vol's were not collecting, this adds an NA in when this is the case and parses to the next product
                try:
                    vol_unit = product.xpath(self.settings.vol_unit)[2]
                    vol_price = vol_price + vol_unit.extract().strip()
                except:
                    #default std quantity to 1
                    vol_unit = "NA"
                    vol_price = vol_price + vol_unit
                #Get the volume units as well
                #print('vol_unit', vol_unit)
                #print('vol _nunit', vol_unit)
                #vol_price_block.xpath("*/span[@class='pricePerMeasureMeasure']/text()")
                #Construct the vol price in known format and save it to the item
                item['volume_price'] = vol_price
                #print('vol _nunit',  item['volume_price'])
                # Add timestamp
                item['timestamp'] = datetime.datetime.now()

                #Ignore promos/offers
                item['promo'] = product.xpath(
                    self.settings.promo_xpath).extract()
                item['offer'] = product.xpath(
                    self.settings.offer_xpath).extract()

                #Pass the item back
                yield item
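The volume-price assembly above (two text fragments from the price block plus an optional unit, with defaults when parts are missing) is easier to follow, and to test, as a pure function over the already-extracted strings. A hypothetical refactor of that branching, not code from the project:

def build_volume_price(vol_abbr, vol_unit=None):
    """Rebuild the '<price> / <quantity><unit>' string from the scraped parts.

    vol_abbr: the two text fragments taken from the volume-price block,
              e.g. [u'1.50', u'100'] with vol_unit u'g' -> u'1.50 / 100g'.
    vol_unit: unit string if one was scraped, otherwise None -> 'NA'.
    """
    vol_price = vol_abbr[0].strip()
    if vol_abbr[1].strip():
        vol_price += ' / ' + vol_abbr[1]
    else:
        vol_price += ' / 1 '              # default the standard quantity to 1
    vol_price += vol_unit.strip() if vol_unit else 'NA'
    return vol_price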
Example #7
class WaitroseSpider(CrawlSpider):
    """WaitroseSpider
       ===========
       Main spider for crawling Waitrose website and searching for products.
       Settings for XPaths etc are supplied from SearchSettingsFactory below.
       Search parameters for products are supplied from SearchTreeFactory.
       Spider yields ProductItem for each product line.
       Pipelines exist to post-process data and write it to CSV or MongoDB.
       """
    name = 'waitrose'
    store = "WAITROSE"
    output_dir = None
    settings = WaitroseSearchSettings()

    def __init__(self, csv_file=None, *args, **kwargs):
        """Can provide name of input CSV file at runtime e.g.:
        
           scrapy crawl waitrose -a csv_file=waitrose_input.csv
           
           Input CSV file should be in data directory. 
           If CSV file not specified, defaults to {name}_input.csv 
           e.g. waitrose_input.csv.
           
           Output files are written to:
           
           supermarket_scraper/output/[spider name]
           
           Output directory MUST EXIST!
        """
        super(WaitroseSpider, self).__init__(*args, **kwargs)

        if csv_file:
            self.csv_file = csv_file
        else:
            self.csv_file = self.name + "_input.csv"

        # Get URL and XPath settings
        self.settings = SearchSettingsFactory.get_settings(self.store)
        # Get search parameters as tree
        self.search_factory = SearchTreeFactory(self.store, self.csv_file)
        # Set and check output directory
        self.output_dir = os.path.join('output', self.name)
        if not (os.path.isdir(self.output_dir)):
            raise WaitroseSpiderError("Invalid output directory: " +
                                      self.output_dir)

    def get_searches(self):
        """Returns a LIST of searches. We don't need to nest searches here
           because Waitrose website allows us to construct URLs directly,
           instead of having to navigate through several layers of menus."""
        if self.csv_file:
            log.msg("Spider: Fetching searches from " + self.csv_file,
                    level=log.DEBUG)
            return self.search_factory.get_csv_searches()
        else:
            #Use some other source for target URLs - database?
            raise WaitroseSpiderError("Cannot find input file " +
                                      self.csv_file)

    def start_requests(self):
        """Generates crawler requests for given base URL and parses results."""
        search_list = self.get_searches()
        # Build URLs based on base URL + sub-categories
        for s in search_list:
            search_meta = {}
            product_url = ''
            search_meta = s.get_meta_map()
            product_url = '/'.join([
                self.settings.base_url, s.store_sub1, s.store_sub2,
                s.store_sub3
            ]) + '/'
            log.msg("Spider: start_requests() yielding URL: " + product_url,
                    level=log.DEBUG)
            yield Request(url=product_url, meta=search_meta)

    def parse_start_url(self, response):
        """Default function to parse responses from base URL:
           Waitrose serves products in a single list, but we cannot scroll
           through them and there is no 'Next page' link, so we just extract
           the first set of up to 24 product items and yield them for processing."""

        # Get details of current search (passed in via response meta data)
        metadata = response.meta
        #Find product lines
        sel = Selector(response)
        products = sel.xpath(self.settings.products_xpath)
        #Process each product line
        log.msg("Spider: parsing response for URL: " + response.url +
                " for ONS item " + metadata['ons_item_name'],
                level=log.DEBUG)
        for product in products:
            # Create an item for each entry
            item = ProductItem()
            #UPPER case product name for storage to make searching easier
            try:
                item['product_name'] = (product.xpath(
                    self.settings.product_name_xpath).extract()[0]).upper()
            except:
                continue

            log.msg("Spider: Response for URL: " + response.url + " found " +
                    item['product_name'].encode('utf-8'),
                    level=log.DEBUG)
            try:
                item['store'] = self.store
                item['ons_item_no'] = metadata['ons_item_no']
                item['ons_item_name'] = metadata['ons_item_name']
                item['product_type'] = metadata['store_sub3']
                item['search_string'] = metadata['search_terms']

            except:
                continue
            #Default matches to 1.0 and modify later

            try:
                item['search_matches'] = 1.0
                # Save price string and convert it to number later
                item['item_price_str'] = product.xpath(
                    self.settings.raw_price_xpath).extract()[0].strip()
                x = item['item_price_str'][0]
                #print('test', x)
                #pos = item['item_price_str'].index('\xc2')
                #item['item_price_str'] = item['item_price_str'][:].strip()
                #print(item['item_price_str'][4])
                if item['item_price_str'][0] == 'N':
                    item['item_price_str'] = item['item_price_str'][3:].strip()
                else:
                    item['item_price_str'] = item['item_price_str'][:].strip()

                # Try getting the volume and putting it on the end of the product name
                volume = product.xpath(self.settings.volume_xpath).extract()
                if volume:
                    item['product_name'] = item['product_name'] + " " + volume[
                        0].strip().upper()
            except:
                continue

            # Waitrose volume price not always provided, so if it is not there,
            # we try using volume and item price instead.
            try:
                item['volume_price'] = ''
                vol_price = product.xpath(
                    self.settings.vol_price_xpath).extract()
                if vol_price:
                    #Allow for e.g. "1.25 per litre" instead of "1.25/litre"
                    item['volume_price'] = (vol_price[0].strip()).replace(
                        "per", "/")
                else:
                    item['volume_price'] = item[
                        'item_price_str'] + "/" + volume[0].strip()

                # Add timestamp
                item['timestamp'] = datetime.datetime.now()
                # Get promotion text (if any) NOT YET IMPLEMENTED
                item['promo'] = ''
                if self.settings.promo_xpath:
                    promo = product.xpath(
                        self.settings.promo_xpath).extract()  #TODO
                    if promo:
                        item['promo'] = promo[0]
                # Get short term offer (if any) NOT YET IMPLEMENTED
                item['offer'] = ''
                if self.settings.offer_xpath:
                    offer = product.xpath(
                        self.settings.offer_xpath).extract()  #TODO
                    if offer:
                        item['offer'] = offer[0]
            except:
                continue
            #Pass the item back
            yield item
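Every class docstring above mentions pipelines that post-process the items and write them to CSV or MongoDB, but those pipelines are not included in these examples. A minimal sketch of what a CSV-writing pipeline could look like, reusing the spiders' output_dir convention; the field list is taken from the ProductItem fields populated above, and the class name is an assumption:

import csv
import os


class CsvWriterPipeline(object):
    """Hypothetical pipeline: append every item to <output_dir>/<spider name>.csv."""

    FIELDS = ['store', 'ons_item_no', 'ons_item_name', 'product_type',
              'search_string', 'product_name', 'item_price_str',
              'volume_price', 'promo', 'offer', 'timestamp']

    def open_spider(self, spider):
        path = os.path.join(spider.output_dir, spider.name + '.csv')
        self.csv_file = open(path, 'wb')  # Python 2: binary mode for the csv module
        self.writer = csv.DictWriter(self.csv_file, fieldnames=self.FIELDS,
                                     extrasaction='ignore')
        self.writer.writeheader()

    def process_item(self, item, spider):
        # Scrapy items are dict-like, so missing fields default to ''.
        self.writer.writerow(dict((f, item.get(f, '')) for f in self.FIELDS))
        return item

    def close_spider(self, spider):
        self.csv_file.close()

A pipeline like this would still have to be enabled through the project's ITEM_PIPELINES setting before Scrapy would call it.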