def __init__(self):
    """
    Construct a new instance of the scraper task. Start it by
    calling `start`.
    """
    cache = util.get_env_boolean("cache")
    initial_data = cache_file_read() if cache else Unset
    if util.get_env_boolean("s3_read") or util.get_env_boolean("s3_write"):
        s3 = boto3.resource("s3")
    else:
        s3 = Unset
    if initial_data is Unset and util.get_env_boolean("s3_read"):
        initial_data = s3_read(s3)
    webhook = Webhook(WEBHOOK_URL, WEBHOOK_TIMEOUT)
    util.log(
        "Starting worker (on-disk cache {}, S3 {})".format(
            "enabled" if cache else "disabled",
            "enabled" if s3 is not Unset else "disabled",
        )
    )
    super().__init__(
        lambda old_data: compute_data(s3, webhook, old_data),
        SCRAPER_REPEAT_DELAY,
        initial_data=initial_data,
    )

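# A minimal sketch, assuming a repeating-task base class like the one
# the constructor above targets (the real superclass is not shown
# here): call `fn` on a fixed delay, threading each result back in as
# the next `old_data`. All names in this sketch are illustrative.
import threading


class _RepeatingTaskSketch:
    def __init__(self, fn, delay, initial_data=Unset):
        self._fn = fn
        self._delay = delay
        self._data = initial_data

    def start(self):
        # Run one iteration immediately, then reschedule forever.
        self._tick()

    def _tick(self):
        self._data = self._fn(self._data)
        timer = threading.Timer(self._delay, self._tick)
        timer.daemon = True
        timer.start()
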
def try_compute_data(s3, webhook, old_data):
    """
    Try to run the scraper and return course data. If something goes
    wrong, raise `ScrapeError`. Otherwise, invoke the provided
    `Webhook`.

    `old_data` is the previous course data or `util.Unset`.
    """
    scraper_timeout = util.get_env("scraper_timeout")
    try:
        scraper_timeout = int(scraper_timeout)
        if scraper_timeout <= 0:
            raise ValueError
    except ValueError:
        util.warn("Illegal scraper timeout: {}".format(repr(scraper_timeout)))
        util.log("Resetting timeout to 60 seconds")
        os.environ["HYPERSCHEDULE_SCRAPER_TIMEOUT"] = "60"
        scraper_timeout = 60
    if old_data is util.Unset:
        # Unlike `Unset`, None serializes cleanly to JSON (as null).
        old_data = None
    try:
        util.log("Running scraper")
        process = subprocess.Popen(
            ["python", "-m", "hyperschedule.scrapers.claremont"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        output, _ = process.communicate(
            input=json.dumps(old_data).encode(), timeout=scraper_timeout
        )
        if process.returncode != 0:
            raise ScrapeError("scraper failed")
        try:
            output = output.decode()
        except UnicodeDecodeError as e:
            raise ScrapeError(
                "scraper emitted malformed output: {}".format(e)
            ) from None
        if "$delete" in output:
            raise ScrapeError("scraper output contains '$delete'")
        data = json.loads(output)
        if util.get_env_boolean("cache"):
            cache_file_write(data)
        if util.get_env_boolean("s3_write"):
            s3_write(s3, data)
        if util.get_env_boolean("snitch"):
            # Report success only after the new data has been
            # persisted, so a webhook failure cannot skip the cache
            # and S3 writes.
            webhook.get()
    except OSError as e:
        raise ScrapeError(
            "unexpected error while running scraper: {}".format(e)
        ) from None
    except subprocess.TimeoutExpired:
        process.kill()
        process.communicate()
        raise ScrapeError(
            "scraper timed out after {} seconds".format(scraper_timeout)
        ) from None
    except json.decoder.JSONDecodeError:
        raise ScrapeError("scraper did not return valid JSON") from None
    except requests.exceptions.RequestException as e:
        util.warn("failed to reach success webhook: {}".format(e))
    return data

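# Hedged sketch of the stdin/stdout protocol that try_compute_data
# relies on: the scraper subprocess reads the previous data as JSON
# (null on the first run) from stdin and writes the new data as JSON
# to stdout. The real entry point is hyperschedule.scrapers.claremont;
# `scrape_courses` below is a hypothetical stand-in for its logic.
def _scraper_protocol_sketch():
    import json
    import sys

    old_data = json.load(sys.stdin)  # None on the first run
    new_data = scrape_courses(old_data)  # hypothetical scraping helper
    json.dump(new_data, sys.stdout)
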
def get_course_descriptions():
    """
    Return a dictionary mapping course codes (as used on the
    frontend) to course descriptions. If the 'lingk' config var is
    enabled, authenticate against the Lingk API using the key and
    secret from the environment; otherwise, fall back to the Lingk
    CSV export.

    Throw ScrapeError if the API is not available or returns bad
    data.
    """
    if util.get_env_boolean("lingk"):
        key = os.environ.get("HYPERSCHEDULE_LINGK_KEY")
        secret = os.environ.get("HYPERSCHEDULE_LINGK_SECRET")
        if not key or not secret:
            util.log("Skipping Lingk as key and secret are not set")
            return {}
        util.log_verbose("Scraping Lingk API")
        data = get_lingk_api_data(key, secret)
        desc_index = lingk_api_data_to_course_descriptions(data)
    else:
        util.log_verbose("Scraping Lingk CSV")
        data = get_lingk_csv_data()
        desc_index = lingk_csv_data_to_course_descriptions(data)
    if len(desc_index) < 100:
        raise ScrapeError(
            "Not enough course descriptions: {}".format(len(desc_index))
        )
    return desc_index

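# Hedged usage sketch: how a caller might merge these descriptions
# into scraped course objects. The key names "courseCode" and
# "courseDescription" are assumptions for illustration, not taken
# from this file.
def _merge_descriptions_sketch(courses):
    desc_index = get_course_descriptions()
    for course in courses:
        description = desc_index.get(course["courseCode"])
        if description is not None:
            course["courseDescription"] = description
    return courses
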
def get_browser():
    """
    Return a Selenium browser object. Whether it is headless is
    controlled by the 'headless' config var.
    """
    if util.get_env_boolean("headless"):
        options = selenium.webdriver.chrome.options.Options()
        options.headless = True
        # Disabling scroll bars is important; see
        # <https://bugs.chromium.org/p/chromedriver/issues/detail?id=2487>.
        options.add_argument("--hide-scrollbars")
        # The Chrome binary is at a nonstandard location on Heroku;
        # see <https://github.com/heroku/heroku-buildpack-google-chrome>.
        binary = os.environ.get("GOOGLE_CHROME_SHIM")
        if binary:
            options.binary_location = binary
        return selenium.webdriver.Chrome(options=options)
    return selenium.webdriver.Chrome()

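# Hedged usage sketch (not from this codebase): quit the browser, and
# optionally reap orphaned Chrome processes via kill_google_chrome
# (defined below), even when scraping fails partway through.
def _scrape_page_sketch(url):
    browser = get_browser()
    try:
        browser.get(url)
        return browser.page_source
    finally:
        browser.quit()
        kill_google_chrome()
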
def kill_google_chrome():
    """
    Kill all currently running Google Chrome processes. This is
    important to save memory, and also to avoid a memory leak if
    Selenium does not shut down cleanly. If config var 'kill_orphans'
    is not enabled, do nothing.
    """
    if not util.get_env_boolean("kill_orphans"):
        return
    for proc in psutil.process_iter():
        # A process may exit between being listed and being inspected.
        try:
            name = proc.name()
        except psutil.NoSuchProcess:
            continue
        # We have to kill the helpers, too -- on Heroku we are using
        # Docker without baseimage-docker, and thus zombie children
        # don't get reaped correctly; see
        # <https://blog.phusion.nl/2015/01/20/docker-and-the-pid-1-zombie-reaping-problem/>.
        if re.match(r"chrome", name, re.IGNORECASE):
            util.log("Killing {} process {}".format(repr(name), proc.pid))
            proc.kill()

parser.add_argument(
    "config",
    metavar="key=val",
    nargs="*",
    help="config var settings (see README)",
)
config_args = parser.parse_args().config
config = {}
for config_arg in config_args:
    if "=" not in config_arg:
        util.die("malformed key=val argument: {}".format(repr(config_arg)))
    var, val = config_arg.split("=", maxsplit=1)
    if var not in util.ENV_DEFAULTS:
        util.die("unknown config var: {}".format(repr(var)))
    config[var] = val
for var, val in util.ENV_DEFAULTS.items():
    if var not in config:
        config[var] = val
    # Use the command-line override if one was given, else the default.
    val = config[var]
    env_var = "HYPERSCHEDULE_" + var.upper()
    os.environ[env_var] = val
app = "hyperschedule.app:app"
port = util.get_env("port")
host = "0.0.0.0" if util.get_env_boolean("expose") else "127.0.0.1"
if util.get_env_boolean("debug"):
    os.environ["FLASK_ENV"] = "development"
    os.environ["FLASK_APP"] = app
    os.environ["FLASK_SKIP_DOTENV"] = "1"
    exec_cmd(["flask", "run", "--host", host, "--port", port, "--no-reload"])
else:
    exec_cmd(["gunicorn", "-w", "1", "-b", "{}:{}".format(host, port), app])
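
# Hedged usage sketch: config vars are passed on the command line as
# key=val pairs. For example, assuming the package is invoked as a
# module ("debug" and "port" are config vars referenced above; the
# exact boolean spelling accepted by util.get_env_boolean is an
# assumption):
#
#     python -m hyperschedule debug=yes port=5000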