def get_course_descriptions():
    """
    Given a Lingk API key and secret for authentication, return a
    dictionary mapping course codes (as can be used on the frontend)
    to course descriptions.

    Throw ScrapeError if the API is not available or returns bad data.
    """
    if util.get_env_boolean("lingk"):
        key = os.environ.get("HYPERSCHEDULE_LINGK_KEY")
        secret = os.environ.get("HYPERSCHEDULE_LINGK_SECRET")
        if not key or not secret:
            util.log("Skipping Lingk as key and secret are not set")
            return {}
        util.log_verbose("Scraping Lingk API")
        data = get_lingk_api_data(key, secret)
        desc_index = lingk_api_data_to_course_descriptions(data)
    else:
        util.log_verbose("Scraping Lingk CSV")
        data = get_lingk_csv_data()
        desc_index = lingk_csv_data_to_course_descriptions(data)
    if len(desc_index) < 100:
        raise ScrapeError("Not enough course descriptions: {}".format(
            len(desc_index)))
    return desc_index
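
# Hypothetical usage sketch (not part of the original example). It assumes
# the HYPERSCHEDULE_LINGK_KEY and HYPERSCHEDULE_LINGK_SECRET environment
# variables are set and that the "lingk" config var is enabled.
try:
    descriptions = get_course_descriptions()
except ScrapeError as e:
    util.warn("could not fetch course descriptions: {}".format(e))
    descriptions = {}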
Example #2
def __init__(self):
    """
    Construct a new instance of the scraper task. Start it by calling
    `start`.
    """
    cache = util.get_env_boolean("cache")
    initial_data = cache_file_read() if cache else Unset
    if util.get_env_boolean("s3_read") or util.get_env_boolean("s3_write"):
        s3 = boto3.resource("s3")
    else:
        s3 = Unset
    if initial_data is Unset and util.get_env_boolean("s3_read"):
        initial_data = s3_read(s3)
    webhook = Webhook(WEBHOOK_URL, WEBHOOK_TIMEOUT)
    util.log(
        "Starting worker (on-disk cache {}, S3 {})".format(
            "enabled" if cache else "disabled",
            "enabled" if s3 is not Unset else "disabled",
        )
    )
    super().__init__(
        lambda old_data: compute_data(s3, webhook, old_data),
        SCRAPER_REPEAT_DELAY,
        initial_data=initial_data,
    )
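
# Hypothetical usage sketch (not part of the original example). Only the
# constructor of the worker class is shown above, so the class name below is
# a placeholder; per the docstring, nothing runs until `start` is called.
#
#     worker = ScraperTask()  # placeholder name for the enclosing class
#     worker.start()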
Example #3
def try_compute_data(s3, webhook, old_data):
    """
    Try to run the scraper and return course data. If something goes
    wrong, raise `ScrapeError`. Otherwise, invoke the provided
    `Webhook`. `old_data` is the previous course data or `util.Unset`.
    """
    scraper_timeout = util.get_env("scraper_timeout")
    try:
        scraper_timeout = int(scraper_timeout)
        if scraper_timeout <= 0:
            raise ValueError
    except ValueError:
        util.warn("Illegal scraper timeout: {}".format(repr(scraper_timeout)))
        util.log("Resetting timeout to 60 seconds")
        os.environ["HYPERSCHEDULE_SCRAPER_TIMEOUT"] = "60"
        scraper_timeout = 60
    if old_data is util.Unset:
        # Use None instead of Unset so the value can be serialized as JSON.
        old_data = None
    try:
        util.log("Running scraper")
        process = subprocess.Popen(
            ["python", "-m", "hyperschedule.scrapers.claremont"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        output, _ = process.communicate(input=json.dumps(old_data).encode(),
                                        timeout=scraper_timeout)
        if process.returncode != 0:
            raise ScrapeError("scraper failed")
        try:
            output = output.decode()
        except UnicodeDecodeError as e:
            raise ScrapeError(
                "scraper emitted malformed output: {}".format(e)) from None
        if "$delete" in output:
            raise ScrapeError("scraper output contains '$delete'")
        data = json.loads(output)
        if util.get_env_boolean("snitch"):
            webhook.get()
        if util.get_env_boolean("cache"):
            cache_file_write(data)
        if util.get_env_boolean("s3_write"):
            s3_write(s3, data)
    except OSError as e:
        raise ScrapeError(
            "unexpected error while running scraper: {}".format(e)) from None
    except subprocess.TimeoutExpired:
        process.kill()
        process.communicate()
        raise ScrapeError("scraper timed out after {} seconds".format(
            scraper_timeout)) from None
    except json.decoder.JSONDecodeError:
        raise ScrapeError("scraper did not return valid JSON") from None
    except requests.exceptions.RequestException as e:
        util.warn("failed to reach success webhook: {}".format(e))
    return data
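
# Hypothetical usage sketch (not part of the original example). It assumes
# AWS credentials are configured for boto3 and that WEBHOOK_URL and
# WEBHOOK_TIMEOUT are defined as in Example #2; util.Unset stands in for
# "no previous course data".
s3 = boto3.resource("s3")
webhook = Webhook(WEBHOOK_URL, WEBHOOK_TIMEOUT)
try:
    data = try_compute_data(s3, webhook, util.Unset)
except ScrapeError as e:
    util.warn("scrape failed: {}".format(e))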
Example #4
def kill_google_chrome():
    """
    Kill all currently running Google Chrome processes. This is
    important to save memory, and also to avoid a memory leak if
    Selenium does not shut down cleanly.

    If config var 'kill_orphans' is not enabled, do nothing.
    """
    if not util.get_env_boolean("kill_orphans"):
        return
    for proc in psutil.process_iter():
        # We have to kill the helpers, too -- on Heroku we are using
        # Docker without baseimage-docker and thus zombie children
        # don't get reaped correctly; see
        # <https://blog.phusion.nl/2015/01/20/docker-and-the-pid-1-zombie-reaping-problem/>.
        if re.match(r"chrome", proc.name(), re.IGNORECASE):
            util.log("Killing {} process {}".format(repr(proc.name()),
                                                    proc.pid))
            proc.kill()
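
# Hypothetical usage sketch (not part of the original example). The exact
# environment-variable name and accepted boolean value are assumptions,
# inferred from the HYPERSCHEDULE_* pattern used in Example #3.
os.environ["HYPERSCHEDULE_KILL_ORPHANS"] = "1"
kill_google_chrome()  # no-op unless the "kill_orphans" config var is enabled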
Example #5
def get_course_data(old_courses):
    """
    Return the data structure for the API given the dictionary of old
    courses keyed by course code (or None).
    """
    # Do this ahead of time (1) to save on memory, and (2) to avoid
    # messing up the connection pool when we kill Chrome later.
    kill_google_chrome()
    try:
        desc_index = lingk.get_course_descriptions()
    except ScrapeError:
        util.log("Got error while scraping Lingk:")
        traceback.print_exc()
        desc_index = {}
        if old_courses:
            util.log("Using previously scraped course descriptions")
            for course in old_courses.values():
                desc = course["courseDescription"]
                if not desc:
                    continue
                key = course_to_key(course)
                desc_index[key] = desc
    courses, term = portal.get_courses(desc_index)
    term_info = shared.parse_term_code(term)
    term_name = shared.term_info_as_display_name(term_info)
    term_sort_key = shared.term_info_as_list(term_info)
    return {
        "terms": {
            term: {
                "termCode": term,
                "termSortKey": term_sort_key,
                "termName": term_name,
            },
        },
        "courses": {course["courseCode"]: course
                    for course in courses}
    }
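
# Hypothetical usage sketch (not part of the original example). On the first
# run there is no previous data, so None is passed; on later runs the
# "courses" sub-dictionary of the previous result can be reused.
data = get_course_data(None)
data = get_course_data(data["courses"])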
Example #6
def compute_data(s3, webhook, old_data):
    """
    Try to run the scraper and return course data (see
    `try_compute_data`). If something goes wrong, log the error and
    return `util.Unset`.
    """
    try:
        data = try_compute_data(s3, webhook, old_data)
        util.log("Scraper succeeded")
        return data
    except ScrapeError as e:
        util.log(str(e).capitalize())
        return Unset
    except Exception:
        util.log("Unexpected error:")
        traceback.print_exc()
        return Unset
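
# Hypothetical usage sketch (not part of the original example). It assumes
# `s3` and `webhook` are set up as in the earlier examples, and shows the
# contract of compute_data: on failure it returns util.Unset instead of
# raising, so the previous data can be kept.
old_data = util.Unset
new_data = compute_data(s3, webhook, old_data)
if new_data is not util.Unset:
    old_data = new_data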