예제 #1
0
 def __init__(self):
     """
     Create a new instance of the scraper task. The task is inert
     until `start` is called on it.
     """
     use_cache = util.get_env_boolean("cache")
     # Seed data comes from the on-disk cache when enabled, else Unset.
     seed_data = cache_file_read() if use_cache else Unset
     # Only build an S3 resource when at least one S3 direction is on.
     s3_resource = Unset
     if util.get_env_boolean("s3_read") or util.get_env_boolean("s3_write"):
         s3_resource = boto3.resource("s3")
     # Fall back to S3 for seed data if the cache produced nothing.
     if seed_data is Unset and util.get_env_boolean("s3_read"):
         seed_data = s3_read(s3_resource)
     webhook = Webhook(WEBHOOK_URL, WEBHOOK_TIMEOUT)
     util.log(
         "Starting worker (on-disk cache {}, S3 {})".format(
             "enabled" if use_cache else "disabled",
             "disabled" if s3_resource is Unset else "enabled",
         )
     )
     super().__init__(
         lambda old_data: compute_data(s3_resource, webhook, old_data),
         SCRAPER_REPEAT_DELAY,
         initial_data=seed_data,
     )
예제 #2
0
def try_compute_data(s3, webhook, old_data):
    """
    Try to run the scraper and return course data. If something goes
    wrong, raise `ScrapeError`. Otherwise, invoke the provided
    `Webhook`. `old_data` is the previous course data or `util.Unset`.
    """
    scraper_timeout = util.get_env("scraper_timeout")
    try:
        # TypeError covers a missing (None) config value; ValueError
        # covers a non-numeric or non-positive one.
        scraper_timeout = int(scraper_timeout)
        if scraper_timeout <= 0:
            raise ValueError
    except (TypeError, ValueError):
        util.warn("Illegal scraper timeout: {}".format(repr(scraper_timeout)))
        util.log("Resetting timeout to 60 seconds")
        os.environ["HYPERSCHEDULE_SCRAPER_TIMEOUT"] = "60"
        scraper_timeout = 60
    if old_data is util.Unset:
        # For JSON.
        old_data = None
    try:
        util.log("Running scraper")
        process = subprocess.Popen(
            ["python", "-m", "hyperschedule.scrapers.claremont"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        output, _ = process.communicate(input=json.dumps(old_data).encode(),
                                        timeout=scraper_timeout)
        if process.returncode != 0:
            raise ScrapeError("scraper failed")
        try:
            output = output.decode()
        except UnicodeDecodeError as e:
            raise ScrapeError(
                "scraper emitted malformed output: {}".format(e)) from None
        if "$delete" in output:
            raise ScrapeError("scraper output contains '$delete'")
        data = json.loads(output)
        # The webhook ping is best-effort only. Handle its failure
        # inline so that a network hiccup cannot abort the cache and
        # S3 writes below (previously a RequestException here skipped
        # persisting the freshly scraped data entirely).
        if util.get_env_boolean("snitch"):
            try:
                webhook.get()
            except requests.exceptions.RequestException as e:
                util.warn("failed to reach success webhook: {}".format(e))
        if util.get_env_boolean("cache"):
            cache_file_write(data)
        if util.get_env_boolean("s3_write"):
            s3_write(s3, data)
    except OSError as e:
        raise ScrapeError(
            "unexpected error while running scraper: {}".format(e)) from None
    except subprocess.TimeoutExpired:
        # Reap the killed child to avoid leaving a zombie process.
        process.kill()
        process.communicate()
        raise ScrapeError("scraper timed out after {} seconds".format(
            scraper_timeout)) from None
    except json.decoder.JSONDecodeError:
        raise ScrapeError("scraper did not return valid JSON") from None
    return data
예제 #3
0
def get_course_descriptions():
    """
    Return a dictionary mapping course codes (as can be used on the
    frontend) to course descriptions. The data comes from the Lingk
    API when the 'lingk' config var is enabled (credentials taken
    from the environment), and from the Lingk CSV otherwise.

    Throw ScrapeError if the API is not available or returns bad data.
    """
    if not util.get_env_boolean("lingk"):
        util.log_verbose("Scraping Lingk CSV")
        desc_index = lingk_csv_data_to_course_descriptions(
            get_lingk_csv_data())
    else:
        key = os.environ.get("HYPERSCHEDULE_LINGK_KEY")
        secret = os.environ.get("HYPERSCHEDULE_LINGK_SECRET")
        # Without credentials the API cannot be queried; treat this
        # as "no descriptions" rather than an error.
        if not (key and secret):
            util.log("Skipping Lingk as key and secret are not set")
            return {}
        util.log_verbose("Scraping Lingk API")
        desc_index = lingk_api_data_to_course_descriptions(
            get_lingk_api_data(key, secret))
    # A suspiciously small index indicates the upstream data is bad.
    if len(desc_index) < 100:
        raise ScrapeError("Not enough course descriptions: {}".format(
            len(desc_index)))
    return desc_index
예제 #4
0
def get_browser():
    """
    Return a Selenium browser object. Whether it is headless is
    controlled by the 'headless' config var.
    """
    if not util.get_env_boolean("headless"):
        return selenium.webdriver.Chrome()
    opts = selenium.webdriver.chrome.options.Options()
    opts.headless = True
    # Scroll bars must be disabled, see
    # <https://bugs.chromium.org/p/chromedriver/issues/detail?id=2487>.
    opts.add_argument("--hide-scrollbars")
    # On Heroku the Chrome binary lives at a nonstandard location,
    # see <https://github.com/heroku/heroku-buildpack-google-chrome>.
    chrome_binary = os.environ.get("GOOGLE_CHROME_SHIM")
    if chrome_binary:
        opts.binary_location = chrome_binary
    return selenium.webdriver.Chrome(options=opts)
예제 #5
0
def kill_google_chrome():
    """
    Kill all currently running Google Chrome processes. This is
    important to save memory, and also to avoid a memory leak if
    Selenium does not shut down cleanly.

    If config var 'kill_orphans' is not enabled, do nothing.
    """
    if not util.get_env_boolean("kill_orphans"):
        return
    for proc in psutil.process_iter():
        # We have to kill the helpers, too -- on Heroku we are using
        # Docker without baseimage-docker and thus zombie children
        # don't get reaped correctly; see
        # <https://blog.phusion.nl/2015/01/20/docker-and-the-pid-1-zombie-reaping-problem/>.
        try:
            name = proc.name()
            if re.match(r"chrome", name, re.IGNORECASE):
                util.log("Killing {} process {}".format(repr(name),
                                                        proc.pid))
                proc.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # Processes can exit (or become inaccessible) between
            # enumeration and inspection/kill; skip those instead of
            # crashing the whole sweep.
            continue
예제 #6
0
                        metavar="key=val",
                        nargs="*",
                        help="config var settings (see README)")
    config_args = parser.parse_args().config
    config = {}
    for config_arg in config_args:
        if "=" not in config_arg:
            util.die("malformed key=val argument: {}".format(repr(config_arg)))
        var, val = config_arg.split("=", maxsplit=1)
        if var not in util.ENV_DEFAULTS:
            util.die("unknown config var: {}".format(repr(var)))
        config[var] = val
    for var, val in util.ENV_DEFAULTS.items():
        if var not in config:
            config[var] = val
        val = config[var]
        env_var = "HYPERSCHEDULE_" + var.upper()
        os.environ[env_var] = val
    app = "hyperschedule.app:app"
    port = util.get_env("port")
    host = "0.0.0.0" if util.get_env_boolean("expose") else "127.0.0.1"
    if util.get_env_boolean("debug"):
        os.environ["FLASK_ENV"] = "development"
        os.environ["FLASK_APP"] = app
        os.environ["FLASK_SKIP_DOTENV"] = "1"
        exec_cmd(
            ["flask", "run", "--host", host, "--port", port, "--no-reload"])
    else:
        exec_cmd(
            ["gunicorn", "-w", "1", "-b", "{}:{}".format(host, port), app])