def get_course_descriptions():
    """
    Return a dictionary mapping course codes (as can be used on the
    frontend) to course descriptions, authenticating against the
    Lingk API with the key and secret from the environment when the
    'lingk' config var is enabled.

    Throw ScrapeError if the API is not available or returns bad
    data.
    """
    if util.get_env_boolean("lingk"):
        key = os.environ.get("HYPERSCHEDULE_LINGK_KEY")
        secret = os.environ.get("HYPERSCHEDULE_LINGK_SECRET")
        if not key or not secret:
            util.log("Skipping Lingk as key and secret are not set")
            return {}
        util.log_verbose("Scraping Lingk API")
        data = get_lingk_api_data(key, secret)
        desc_index = lingk_api_data_to_course_descriptions(data)
    else:
        util.log_verbose("Scraping Lingk CSV")
        data = get_lingk_csv_data()
        desc_index = lingk_csv_data_to_course_descriptions(data)
    if len(desc_index) < 100:
        raise ScrapeError(
            "Not enough course descriptions: {}".format(len(desc_index))
        )
    return desc_index
def __init__(self):
    """
    Construct new instance of the scraper task. Start it by calling
    `start`.
    """
    cache = util.get_env_boolean("cache")
    initial_data = cache_file_read() if cache else Unset
    if util.get_env_boolean("s3_read") or util.get_env_boolean("s3_write"):
        s3 = boto3.resource("s3")
    else:
        s3 = Unset
    if initial_data is Unset and util.get_env_boolean("s3_read"):
        initial_data = s3_read(s3)
    webhook = Webhook(WEBHOOK_URL, WEBHOOK_TIMEOUT)
    util.log(
        "Starting worker (on-disk cache {}, S3 {})".format(
            "enabled" if cache else "disabled",
            "enabled" if s3 is not Unset else "disabled",
        )
    )
    super().__init__(
        lambda old_data: compute_data(s3, webhook, old_data),
        SCRAPER_REPEAT_DELAY,
        initial_data=initial_data,
    )
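# A minimal usage sketch, assuming this `__init__` belongs to the worker
# task class (here called ScraperTask, a hypothetical name) and that the
# repeating-task base class exposes the `start` method referenced in the
# docstring:
#
#     task = ScraperTask()
#     task.start()  # begins the periodic scrape loop, repeating every
#                   # SCRAPER_REPEAT_DELAY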
def try_compute_data(s3, webhook, old_data):
    """
    Try to run the scraper and return course data. If something goes
    wrong, raise `ScrapeError`. If the scrape succeeds and the
    'snitch' config var is enabled, invoke the provided `Webhook`.
    `old_data` is the previous course data or `util.Unset`.
    """
    scraper_timeout = util.get_env("scraper_timeout")
    try:
        scraper_timeout = int(scraper_timeout)
        if scraper_timeout <= 0:
            raise ValueError
    except ValueError:
        util.warn("Illegal scraper timeout: {}".format(repr(scraper_timeout)))
        util.log("Resetting timeout to 60 seconds")
        os.environ["HYPERSCHEDULE_SCRAPER_TIMEOUT"] = "60"
        scraper_timeout = 60
    if old_data is util.Unset:
        # Replace Unset with None so the value can be JSON-serialized.
        old_data = None
    try:
        util.log("Running scraper")
        process = subprocess.Popen(
            ["python", "-m", "hyperschedule.scrapers.claremont"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        output, _ = process.communicate(
            input=json.dumps(old_data).encode(), timeout=scraper_timeout
        )
        if process.returncode != 0:
            raise ScrapeError("scraper failed")
        try:
            output = output.decode()
        except UnicodeDecodeError as e:
            raise ScrapeError(
                "scraper emitted malformed output: {}".format(e)
            ) from None
        if "$delete" in output:
            raise ScrapeError("scraper output contains '$delete'")
        data = json.loads(output)
        if util.get_env_boolean("snitch"):
            webhook.get()
        if util.get_env_boolean("cache"):
            cache_file_write(data)
        if util.get_env_boolean("s3_write"):
            s3_write(s3, data)
    except OSError as e:
        raise ScrapeError(
            "unexpected error while running scraper: {}".format(e)
        ) from None
    except subprocess.TimeoutExpired:
        process.kill()
        process.communicate()
        raise ScrapeError(
            "scraper timed out after {} seconds".format(scraper_timeout)
        ) from None
    except json.decoder.JSONDecodeError:
        raise ScrapeError("scraper did not return valid JSON") from None
    except requests.exceptions.RequestException as e:
        util.warn("failed to reach success webhook: {}".format(e))
    return data
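# To illustrate the contract enforced above: the child process invoked as
# `python -m hyperschedule.scrapers.claremont` receives the previous course
# data (or null) as JSON on stdin and must print updated course data as JSON
# on stdout, exiting with status 0. A minimal sketch of such an entry point
# is shown below; it is hypothetical and not the actual scraper module, and
# `scrape_courses` is a placeholder for the real scraping work.
#
#     import json
#     import sys
#
#     def main():
#         old_data = json.load(sys.stdin)  # None on the first run
#         new_data = scrape_courses(old_data)  # hypothetical scraping step
#         json.dump(new_data, sys.stdout)
#
#     if __name__ == "__main__":
#         main()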
def kill_google_chrome():
    """
    Kill all currently running Google Chrome processes. This is
    important to save memory, and also to avoid a memory leak if
    Selenium does not shut down cleanly. If config var 'kill_orphans'
    is not enabled, do nothing.
    """
    if not util.get_env_boolean("kill_orphans"):
        return
    for proc in psutil.process_iter():
        # We have to kill the helpers, too -- on Heroku we are using
        # Docker without baseimage-docker and thus zombie children
        # don't get reaped correctly; see
        # <https://blog.phusion.nl/2015/01/20/docker-and-the-pid-1-zombie-reaping-problem/>.
        if re.match(r"chrome", proc.name(), re.IGNORECASE):
            util.log(
                "Killing {} process {}".format(repr(proc.name()), proc.pid)
            )
            proc.kill()
def get_course_data(old_courses):
    """
    Return data structure for the API given the previously scraped
    courses (a mapping from course code to course object, or None).
    """
    # Do this ahead of time (1) to save on memory, and (2) to avoid
    # messing up the connection pool when we kill Chrome later.
    kill_google_chrome()
    try:
        desc_index = lingk.get_course_descriptions()
    except ScrapeError:
        util.log("Got error while scraping Lingk:")
        traceback.print_exc()
        # Fall back to the descriptions from the previous scrape, if any.
        desc_index = {}
        if old_courses:
            util.log("Using previously scraped course descriptions")
            for course in old_courses.values():
                desc = course["courseDescription"]
                if not desc:
                    continue
                key = course_to_key(course)
                desc_index[key] = desc
    courses, term = portal.get_courses(desc_index)
    term_info = shared.parse_term_code(term)
    term_name = shared.term_info_as_display_name(term_info)
    term_sort_key = shared.term_info_as_list(term_info)
    return {
        "terms": {
            term: {
                "termCode": term,
                "termSortKey": term_sort_key,
                "termName": term_name,
            },
        },
        "courses": {course["courseCode"]: course for course in courses},
    }
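# For reference, the structure returned above has the following shape; the
# placeholder values are illustrative only, not real data:
#
#     {
#         "terms": {
#             "<termCode>": {
#                 "termCode": "<termCode>",
#                 "termSortKey": [...],
#                 "termName": "<display name>",
#             },
#         },
#         "courses": {
#             "<courseCode>": {...},
#         },
#     }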
def compute_data(s3, webhook, old_data):
    """
    Try to run the scraper and return course data (see
    `try_compute_data`). If something goes wrong, log the error and
    return `util.Unset`.
    """
    try:
        data = try_compute_data(s3, webhook, old_data)
        util.log("Scraper succeeded")
        return data
    except ScrapeError as e:
        util.log(str(e).capitalize())
        return Unset
    except Exception:
        util.log("Unexpected error:")
        traceback.print_exc()
        return Unset