def setup_crawler(self, supermarket, reactor_control):
    """Configure and start a Scrapy crawler for one supermarket.

    Any cachefile already present for the supermarket is deleted, the
    project settings are pointed at that same filename as the feed URI,
    and a crawler running MySupermarketSpider is started. See
    http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script.

    Keyword arguments:
    supermarket -- the supermarket whose crawler should be set up
    reactor_control -- object whose remove_crawler is connected to the
                       spider_closed signal and whose add_crawler() is
                       called once the crawler has been started
    """
    old_feed = supermarket_filename(supermarket)
    if isfile(old_feed):
        # Drop the feed file left behind by an earlier run.
        remove(old_feed)

    project_settings = get_project_settings()
    start_url = supermarket_url(supermarket)
    project_settings.set('FEED_URI', supermarket_filename(supermarket))

    spider = MySupermarketSpider(start_url)
    crawler = Crawler(project_settings)
    # Let reactor_control know when this spider has closed.
    crawler.signals.connect(
        reactor_control.remove_crawler,
        signal=signals.spider_closed,
    )
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

    # Register the crawler only after it has actually been started.
    reactor_control.add_crawler()
def cache_exists(self, supermarket):
    """Check whether an up-to-date cachefile exists for a supermarket.

    A cachefile counts as up to date when it exists and was last modified
    on today's calendar date (local time).

    Note that 'modified today' is not the same as 'age < 24 hours'. Prices
    are assumed to change overnight so a cachefile written at 9pm yesterday
    is considered out of date at 9am today (but a cachefile written at 9am
    is not out of date at 9pm).

    Keyword arguments:
    supermarket -- the supermarket whose cachefile should be checked

    Returns True if a cachefile exists and was last modified today,
    False otherwise.
    """
    cachefile = supermarket_filename(supermarket)
    if not isfile(cachefile):
        return False

    modified_on = datetime.fromtimestamp(getmtime(cachefile)).date()
    today = datetime.fromtimestamp(time()).date()
    # Compare whole calendar dates. Comparing only the day-of-month would
    # wrongly accept a file last modified on the same day number of an
    # earlier month or year.
    return modified_on == today
def run():
    """Entry point: scrape the requested supermarkets, then search the data.

    Parses the command line to decide which supermarkets to process (all of
    them, or the single one named in the options), has a CachingScraper
    fetch their data, reads the search phrases from the input given by the
    remaining arguments, and searches each supermarket's data file for them.
    """
    options, args = parse_args()
    supermarkets = supermarket_names() if options.all else [options.supermarket]

    scraper = CachingScraper(supermarkets, options.force_refresh)
    log.start()
    scraper.get_data()

    # Each input line becomes one phrase, stored as a list of its words.
    search_phrases = [line.split() for line in fileinput.input(args)]

    for supermarket in supermarkets:
        heading = "*** Savvy buys in %s ***" % supermarket.upper()
        log.msg(heading)
        search_file(search_phrases, supermarket_filename(supermarket))