def main():
    logging.info('Initializing Scrapers')
    initialized_scrapers = []
    for name, ad in settings.ads.items():
        try:
            scraper_class = scrapers.get_scraper(ad["url"])
        except ValueError as e:
            # The url is not compatible with any scraper; skip this ad
            logging.warning(e)
            continue
        scraper = scraper_class(ad["url"], **ad["filters"])
        initialized_scrapers.append(scraper)

    logging.info('Starting Loop')
    while True:
        ads = []
        for scraper in initialized_scrapers:
            ads += scraper.scrape()
        if mailer.mail_ads(ads):
            # Only dump seen ids if the mail is actually sent
            for scraper in initialized_scrapers:
                scraper.dump_ids()
        logging.info('Hibernating for {} seconds'.format(settings.SLEEP_SECONDS))
        sleep(settings.SLEEP_SECONDS)
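# A minimal sketch of the url-based dispatch that main() relies on; the real
# scrapers.get_scraper may look different. The ExampleScraper class and the
# example.com registry entry below are assumptions for illustration only.
class ExampleScraper:
    def __init__(self, url, **filters):
        self.url = url
        self.filters = filters

    def scrape(self):
        return []  # a real scraper would return newly found ads here

    def dump_ids(self):
        pass  # a real scraper would persist the ids of ads already seen


SCRAPER_REGISTRY = {
    "example.com": ExampleScraper,  # hypothetical domain -> scraper mapping
}


def get_scraper(url):
    for domain, scraper_class in SCRAPER_REGISTRY.items():
        if domain in url:
            return scraper_class
    # main() catches this ValueError and skips ads with an unsupported url
    raise ValueError("No scraper available for url: {}".format(url))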
def fetch_resource(resource_type):
    args = Args(resource_type)
    db_manager = DbManager(args)
    scraper = get_scraper(args)
    # Drop any cached copy if a refetch was requested
    if args.refetch:
        db_manager.delete_resource(args.db_key)
    # Scrape and cache the resource only if it is not already stored
    if not db_manager.resource_exists(args.db_key):
        resource_data = scraper.get_resource(args.query_params)
        # Close the browser driver if the scraper opened one
        if scraper.driver:
            scraper.driver.quit()
        db_manager.save_resource(args.db_key, resource_data)
    # Always return the cached copy
    return db_manager.fetch_resource(args.db_key)
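# A minimal in-memory sketch of the cache interface fetch_resource() assumes
# (resource_exists / save_resource / fetch_resource / delete_resource, keyed
# by args.db_key). The real DbManager is presumably backed by a database;
# this stand-in only illustrates the cache-aside flow used above.
class InMemoryDbManager:
    def __init__(self):
        self._store = {}

    def resource_exists(self, key):
        return key in self._store

    def save_resource(self, key, data):
        self._store[key] = data

    def fetch_resource(self, key):
        return self._store.get(key)

    def delete_resource(self, key):
        self._store.pop(key, None)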
def get(self, provider):
    scraper_cls = get_scraper(provider)
    if not scraper_cls:
        self.set_status(404)
        self.write({
            "error": "Unknown provider",
        })
        return
    scraper = scraper_cls()
    results = yield scraper.run()
    self.write({
        "results": [r.serialize() for r in results],
    })
def get(self, provider):
    scraper_cls = get_scraper(provider)
    if not scraper_cls:
        self.set_status(400)
        self.write({
            "error": "Unknown provider",
        })
        return
    scraper = scraper_cls()
    results = yield scraper.run()
    self.write({
        "results": results,
    })
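# A hedged sketch of how a yield-based handler like the two above might be
# mounted in a Tornado application; the route pattern, port, and the empty
# SCRAPERS registry are assumptions for illustration, not the project's code.
import tornado.ioloop
import tornado.web
from tornado import gen

SCRAPERS = {}  # hypothetical provider-name -> scraper-class registry


class ProviderHandler(tornado.web.RequestHandler):
    @gen.coroutine
    def get(self, provider):
        scraper_cls = SCRAPERS.get(provider)
        if not scraper_cls:
            self.set_status(404)
            self.write({"error": "Unknown provider"})
            return
        scraper = scraper_cls()
        results = yield scraper.run()
        self.write({"results": [r.serialize() for r in results]})


if __name__ == "__main__":
    app = tornado.web.Application([
        (r"/scrape/([A-Za-z0-9_-]+)", ProviderHandler),  # provider from path
    ])
    app.listen(8888)
    tornado.ioloop.IOLoop.current().start()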
def check_availability(item, logger=None):
    stock = None
    price = None
    r = requests.get(item["url"], headers={'User-Agent': 'Mozilla/5.0'})
    if r.status_code != 200:
        if logger:
            logger.warning("Got {} status code in {}".format(
                r.status_code, item["url"]))
        return stock, price
    webpage = r.text
    soup = BeautifulSoup(webpage, 'html.parser')
    # Dispatch to the store-specific scraper, which returns (stock, price)
    scraper = get_scraper(item["store"])
    stock, price = scraper(soup)
    return stock, price
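# A hedged sketch of what get_scraper(item["store"]) is assumed to return for
# check_availability(): a callable that takes the parsed page and gives back a
# (stock, price) pair. The CSS selectors and the "example_store" key below are
# hypothetical; real per-store scrapers would target each store's own markup.
def example_store_scraper(soup):
    stock = None
    price = None
    availability = soup.select_one(".availability")  # hypothetical selector
    if availability is not None:
        stock = "in stock" in availability.get_text(strip=True).lower()
    price_tag = soup.select_one(".price")  # hypothetical selector
    if price_tag is not None:
        price = price_tag.get_text(strip=True)
    return stock, price


STORE_SCRAPERS = {
    "example_store": example_store_scraper,  # hypothetical store key
}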