def get_course_descriptions():
    """
    Return a dictionary mapping course codes (as can be used on the
    frontend) to course descriptions, sourced either from the Lingk
    API (when enabled via the environment) or from the Lingk CSV
    fallback.

    Throw ScrapeError if the data source is not available or returns
    bad data.
    """
    use_api = util.get_env_boolean("lingk")
    if use_api:
        key = os.environ.get("HYPERSCHEDULE_LINGK_KEY")
        secret = os.environ.get("HYPERSCHEDULE_LINGK_SECRET")
        # Missing credentials deliberately degrade to an empty index
        # instead of aborting the whole scrape.
        if not (key and secret):
            util.log("Skipping Lingk as key and secret are not set")
            return {}
        util.log_verbose("Scraping Lingk API")
        desc_index = lingk_api_data_to_course_descriptions(
            get_lingk_api_data(key, secret))
    else:
        util.log_verbose("Scraping Lingk CSV")
        desc_index = lingk_csv_data_to_course_descriptions(
            get_lingk_csv_data())
    # Sanity check: a successful scrape yields hundreds of entries.
    if len(desc_index) < 100:
        raise ScrapeError("Not enough course descriptions: {}".format(
            len(desc_index)))
    return desc_index
def get_portal_html(browser):
    """
    Given a Selenium browser object, perform a webscrape of Portal.
    Return a tuple (html, term) with the HTML of the course search
    results page as a string and the current term (for which courses
    were retrieved) also as a string. Raise ScrapeError if something
    goes wrong with the browser or Portal.
    """
    util.log_verbose(f"Current scraper IP is {get_ip()}")
    util.log_verbose("Scraping Portal")
    browser.set_window_size(1920, 1200)
    browser.get(
        "https://portal.hmc.edu/ICS/Portal_Homepage.jnz?"
        "portlet=Course_Schedules&screen=Advanced+Course+Search"
        "&screenType=next")
    dropdown = selenium.webdriver.support.ui.Select(
        browser.find_element_by_id("pg0_V_ddlTerm"))
    term_names = [option.text for option in dropdown.options]
    # Parse dropdown entries like "FA 2020" into sortable tuples:
    # (year, is_fall, original name). max() then picks the most
    # recent term, with fall sorting after spring in the same year.
    parsed = []
    for name in term_names:
        match = re.match(r"\s*(FA|SP)\s*([0-9]{4})\s*", name)
        if not match:
            continue
        season, year = match.groups()
        parsed.append((int(year), season == "FA", name))
    if not parsed:
        raise ScrapeError(
            f"couldn't parse any term names (from: {term_names!r})")
    _, _, term = max(parsed)
    dropdown.select_by_visible_text(term)
    # A lone "?" wildcard in the title field matches every course.
    title_box = browser.find_element_by_id("pg0_V_txtTitleRestrictor")
    title_box.clear()
    title_box.send_keys("?")
    browser.find_element_by_id("pg0_V_btnSearch").click()
    # Expand the paginated results into a single page.
    browser.find_element_by_id("pg0_V_lnkShowAll").click()
    return browser.page_source, " ".join(term.split())
def get_lingk_api_data(key, secret):
    """
    Return the decoded JSON response from the Lingk API, using the
    given key and secret for authentication. Throw ScrapeError if the
    API is not available or returns bad data.
    """
    # For some bizarre reason the Lingk API sometimes returns 401
    # Unauthorized even when you are authenticated correctly. Asking
    # again a few times fixes the issue. I don't even want to know
    # why.
    last_error = None
    fails = 0
    for _ in range(LINGK_RETRY_COUNT):
        now = datetime.datetime.utcnow()
        date = now.strftime("%a, %d %b %Y %H:%M:%S UTC")
        response = requests.get(
            LINGK_URL,
            headers={
                "Date": date,
                "Authorization": get_auth_header(key, secret, date),
            })
        try:
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            fails += 1
            util.log_verbose(
                "Got auth error from Lingk API ({} of {} allowed)".format(
                    fails, LINGK_RETRY_COUNT))
            time.sleep(1)
            last_error = e
        # json.decoder.JSONDecodeError is a subclass of ValueError, so
        # it must be caught FIRST -- with the previous ordering
        # (ValueError first) this branch was unreachable and every
        # decode failure reported "returned no data".
        except json.decoder.JSONDecodeError:
            raise ScrapeError("Lingk API did not return valid JSON")
        except ValueError:
            raise ScrapeError("Lingk API returned no data")
    # All retries exhausted without a successful response.
    raise ScrapeError(
        "Lingk API returned error response: {}".format(last_error))
def get_courses(desc_index):
    """
    Return a tuple containing the list of course objects and the
    current term. Takes `desc_index` as returned by
    `lingk.get_course_descriptions`.
    """
    browser = get_browser()
    html, term = get_portal_html(browser)
    # Save on memory.
    scraper.kill_google_chrome()
    # Count how many courses we add descriptions to, so we can fail if
    # there aren't enough.
    num_descs_added = 0
    # Count how many courses we fail to parse, so we can fail if there
    # are too many.
    num_failed = 0
    # Get the first round of raw courses from Portal.
    raw_courses_1 = parse_portal_html(html)
    # Add course descriptions to them, using the raw course codes.
    # Also collect the course codes into a dictionary so that we can
    # deduplicate them.
    raw_courses_2 = []
    course_info_map = collections.defaultdict(list)
    for raw_course in raw_courses_1:
        try:
            course_code = raw_course["course_code"].strip()
            course_info = shared.parse_course_code(course_code,
                                                  with_section=True)
            # Description lookup key deliberately drops the section so
            # all sections of a course share one description.
            desc_key = tuple(
                shared.course_info_as_list(course_info,
                                           with_section=False))
            desc = desc_index.get(desc_key)
            if desc:
                num_descs_added += 1
                raw_course["course_description"] = desc
            # Group by full (section-qualified) course info so exact
            # duplicates can be detected below.
            course_info_map[frozendict.frozendict(course_info)].append(
                raw_course)
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
            continue
        raw_courses_2.append(raw_course)
    if num_descs_added < 100:
        raise ScrapeError(
            f"not enough course descriptions added: {num_descs_added}")
    # Deduplicate course codes.
    raw_courses_3 = []
    for course_info, courses in course_info_map.items():
        if len(courses) > 1:
            # If the duplicated code already carries a suffix we have
            # no free letter to disambiguate with; drop all copies.
            if course_info["course_code_suffix"]:
                util.log_verbose(
                    f"Duplicate course with suffix ({len(courses)} copies): "
                    f"{format_raw_course(courses[0])!r}")
                num_failed += len(courses)
                continue
            # At most 26 copies can be disambiguated with A-Z.
            if len(courses) > len(string.ascii_uppercase):
                util.log_verbose(
                    f"Duplicate course with too many copies ({len(courses)}): "
                    f"{format_raw_course(courses[0])!r}")
                num_failed += len(courses)
                continue
            # Assign each duplicate a distinct letter suffix.
            for course, letter in zip(courses, string.ascii_uppercase):
                course["course_code_suffix"] = letter
        raw_courses_3.extend(courses)
    raw_courses = raw_courses_3
    courses = []
    for raw_course in raw_courses:
        try:
            courses.append(process_course(raw_course, term))
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
    if num_failed >= 10:
        raise ScrapeError(f"Too many malformed courses: {num_failed}")
    # NOTE(review): num_failed also counts failures from the earlier
    # phases, whose courses are already absent from raw_courses, so
    # this may undercount successes -- confirm intended.
    num_succeeded = len(raw_courses) - num_failed
    if num_succeeded < 500:
        raise ScrapeError(f"Not enough courses: {num_succeeded}")
    util.log_verbose(
        f"Added descriptions to {num_descs_added} out of {num_succeeded} courses"
    )
    return courses, term