def parse_portal_html(html):
    """
    Parse the Portal search results page.

    Given the search results HTML as a string, return a list of raw
    course data dictionaries, one per visible table row. Raise
    ScrapeError if the expected table structure is not present.
    """
    soup = bs4.BeautifulSoup(html, "lxml")
    course_table = soup.find(id="tableCourses")
    if not course_table:
        raise ScrapeError("could not find course list table in Portal HTML")
    body = course_table.find("tbody")
    if not body:
        raise ScrapeError(
            "could not find course list table body in Portal HTML")
    rows = body.find_all("tr", recursive=False)
    if not rows:
        raise ScrapeError(
            "could not extract course list table rows from Portal HTML")
    # Rows styled "display:none;" are hidden placeholders; skip them
    # but keep the original row indices for error reporting.
    return [
        parse_table_row(idx, tr)
        for idx, tr in enumerate(rows)
        if tr.attrs.get("style") != "display:none;"
    ]
def lingk_csv_data_to_course_descriptions(data):
    """
    Given the parsed Lingk CSV (a list of rows, the first row being
    the header), return a dictionary mapping tuples of course
    information (`with_section` false; see
    `shared.course_info_as_list`) to course descriptions.

    Raise ScrapeError if the CSV is empty or malformed, or if it
    yields suspiciously few descriptions.
    """
    # Guard the destructuring below: an empty export would otherwise
    # surface as a bare ValueError instead of a ScrapeError.
    if not data:
        raise ScrapeError("Lingk CSV is empty")
    header, *rows = data
    try:
        course_code_idx = header.index("courseNumber")
        desc_idx = header.index("description")
    except ValueError:
        raise ScrapeError(f"unexpected header: {header!r}") from None
    desc_map = {}
    for row in rows:
        # We have some rows that are completely empty and some that
        # are just whitespace.
        if not row or "".join(row).isspace():
            continue
        if len(row) != len(header):
            raise ScrapeError(f"malformed row: {row!r}")
        course_code = row[course_code_idx]
        try:
            course_info = shared.parse_course_code(course_code,
                                                   with_section=False)
        except ScrapeError:
            # Unparseable course codes are skipped, not fatal.
            continue
        index_key = tuple(
            shared.course_info_as_list(course_info, with_section=False))
        description = row[desc_idx]
        if not description:
            continue
        # Collapse runs of whitespace to single spaces.
        description = " ".join(description.split())
        # If two conflicting descriptions for the same course code
        # (yep, it happens), pick the one that comes later :/
        desc_map[index_key] = description
    if len(desc_map) < 100:
        # Sanity check: a healthy export has hundreds of entries.
        # (Previously this raise carried a meaningless "from None" --
        # there is no exception being chained here.)
        raise ScrapeError(f"Not enough course descriptions: {len(desc_map)}")
    return desc_map
def parse_table_row(row_idx, row):
    """
    Given a Selenium table row and the index, return a dictionary
    representing the raw course data for that row. Raise ScrapeError
    if the HTML does not have the desired data.
    """
    cells = row.find_all("td")
    try:
        # The row is expected to contain exactly ten <td> cells.
        (_add, course_code, name, faculty, seats, status, schedule,
         num_credits, begin, end) = cells
    except ValueError:
        raise ScrapeError("could not extract course list table row elements "
                          "from Portal HTML (for row {})".format(row_idx))
    meeting_times = [item.text for item in schedule.find_all("li")]
    return {
        "course_code": course_code.text,
        "course_name": name.text,
        "faculty": faculty.text,
        "seats": seats.text,
        "status": status.text,
        "schedule": meeting_times,
        "credits": num_credits.text,
        "begin_date": begin.text,
        "end_date": end.text,
    }
def get_course_descriptions():
    """
    Return a dictionary mapping course codes (as can be used on the
    frontend) to course descriptions.

    When the "lingk" env flag is set, scrape the Lingk API using the
    HYPERSCHEDULE_LINGK_KEY/SECRET environment variables (returning an
    empty dict if they are unset); otherwise fall back to the Lingk
    CSV. Throw ScrapeError if the source is unavailable or returns
    bad data.
    """
    if not util.get_env_boolean("lingk"):
        util.log_verbose("Scraping Lingk CSV")
        desc_index = lingk_csv_data_to_course_descriptions(
            get_lingk_csv_data())
    else:
        key = os.environ.get("HYPERSCHEDULE_LINGK_KEY")
        secret = os.environ.get("HYPERSCHEDULE_LINGK_SECRET")
        if not key or not secret:
            util.log("Skipping Lingk as key and secret are not set")
            return {}
        util.log_verbose("Scraping Lingk API")
        desc_index = lingk_api_data_to_course_descriptions(
            get_lingk_api_data(key, secret))
    # Sanity check: either source should produce hundreds of entries.
    if len(desc_index) < 100:
        raise ScrapeError("Not enough course descriptions: {}".format(
            len(desc_index)))
    return desc_index
def get_lingk_api_data(key, secret):
    """
    Return the decoded JSON response from the Lingk API, using the
    given key and secret for authentication. Throw ScrapeError if the
    API is not available or returns bad data.
    """
    # For some bizarre reason the Lingk API sometimes returns 401
    # Unauthorized even when you are authenticated correctly. Asking
    # again a few times fixes the issue. I don't even want to know
    # why.
    last_error = None
    fails = 0
    for _ in range(LINGK_RETRY_COUNT):
        now = datetime.datetime.utcnow()
        # RFC-1123-style date, also fed into the auth signature.
        date = now.strftime("%a, %d %b %Y %H:%M:%S UTC")
        response = requests.get(LINGK_URL,
                                headers={
                                    "Date": date,
                                    "Authorization": get_auth_header(
                                        key, secret, date)
                                })
        try:
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            fails += 1
            util.log_verbose(
                "Got auth error from Lingk API ({} of {} allowed)".format(
                    fails, LINGK_RETRY_COUNT))
            time.sleep(1)
            last_error = e
            continue
        # BUG FIX: json.decoder.JSONDecodeError subclasses ValueError,
        # so it must be caught *before* ValueError; previously the
        # ValueError handler shadowed it and malformed JSON was
        # misreported as "no data".
        except json.decoder.JSONDecodeError:
            raise ScrapeError("Lingk API did not return valid JSON")
        except ValueError:
            raise ScrapeError("Lingk API returned no data")
    raise ScrapeError(
        "Lingk API returned error response: {}".format(last_error))
def get_portal_html(browser):
    """
    Given a Selenium browser object, perform a webscrape of Portal.
    Return a tuple (html, term) with the HTML of the course search
    results page as a string and the current term (for which courses
    were retrieved) also as a string. Raise ScrapeError if something
    goes wrong with the browser or Portal.
    """
    util.log_verbose(f"Current scraper IP is {get_ip()}")
    util.log_verbose("Scraping Portal")
    browser.set_window_size(1920, 1200)
    url = ("https://portal.hmc.edu/ICS/Portal_Homepage.jnz?"
           "portlet=Course_Schedules&screen=Advanced+Course+Search"
           "&screenType=next")
    browser.get(url)
    term_dropdown = selenium.webdriver.support.ui.Select(
        browser.find_element_by_id("pg0_V_ddlTerm"))
    term_names = [option.text for option in term_dropdown.options]
    # Parse each dropdown entry into a sortable (year, is_fall, name)
    # tuple; max() then picks the most recent term, with fall beating
    # spring within a year.
    parsed_terms = []
    for name in term_names:
        m = re.match(r"\s*(FA|SP)\s*([0-9]{4})\s*", name)
        if m is None:
            continue
        season, year_str = m.groups()
        parsed_terms.append((int(year_str), season == "FA", name))
    if not parsed_terms:
        raise ScrapeError(
            f"couldn't parse any term names (from: {term_names!r})")
    term = max(parsed_terms)[2]
    term_dropdown.select_by_visible_text(term)
    # Searching for "?" in the title matches every course.
    title_input = browser.find_element_by_id("pg0_V_txtTitleRestrictor")
    title_input.clear()
    title_input.send_keys("?")
    browser.find_element_by_id("pg0_V_btnSearch").click()
    browser.find_element_by_id("pg0_V_lnkShowAll").click()
    return browser.page_source, " ".join(term.split())
def lingk_api_data_to_course_descriptions(data):
    """
    Given the decoded JSON from the Lingk API, return a dictionary
    mapping tuples of course information (`with_section` false; see
    `shared.course_info_as_list`) to course descriptions.

    Throw ScrapeError if the data is malformed.
    """
    if not isinstance(data, dict):
        raise ScrapeError("Lingk JSON is not map")
    if "data" not in data:
        raise ScrapeError("Lingk JSON is missing 'data' field")
    desc_index = {}
    for idx, course in enumerate(data["data"]):
        # Courses without a description are silently skipped.
        if "description" not in course:
            continue
        description = course["description"]
        if not isinstance(description, str):
            raise ScrapeError(
                "'description' at index {} is not string".format(idx))
        if "courseNumber" not in course:
            raise ScrapeError(
                "Lingk JSON at index {} is missing 'courseNumber' field".
                format(idx))
        raw_code = course["courseNumber"]
        # Special case that doesn't show up on Portal.
        if raw_code == "ABROAD HM":
            continue
        course_info = shared.parse_course_code(raw_code, with_section=False)
        course_key = tuple(
            shared.course_info_as_list(course_info, with_section=False))
        # Unlike the CSV path, a conflicting duplicate here is fatal.
        # (description is known to be a str, so None means "absent".)
        previous = desc_index.get(course_key)
        if previous is not None and previous != description:
            raise ScrapeError("Lingk JSON has duplicate course: {}".format(
                repr(course_key)))
        desc_index[course_key] = description
    return desc_index
def try_compute_data(s3, webhook, old_data):
    """
    Try to run the scraper and return course data. If something goes
    wrong, raise `ScrapeError`. Otherwise, invoke the provided
    `Webhook`. `old_data` is the previous course data or `util.Unset`.
    """
    # Validate the configured timeout; fall back to 60 seconds (and
    # write the fallback back into the environment) on anything that
    # is not a positive integer.
    scraper_timeout = util.get_env("scraper_timeout")
    try:
        scraper_timeout = int(scraper_timeout)
        if scraper_timeout <= 0:
            raise ValueError
    except ValueError:
        util.warn("Illegal scraper timeout: {}".format(repr(scraper_timeout)))
        util.log("Resetting timeout to 60 seconds")
        os.environ["HYPERSCHEDULE_SCRAPER_TIMEOUT"] = "60"
        scraper_timeout = 60
    if old_data is util.Unset:
        # For JSON.
        old_data = None
    try:
        util.log("Running scraper")
        # The scraper runs as a subprocess: previous data goes in on
        # stdin as JSON, new data comes out on stdout as JSON.
        process = subprocess.Popen(
            ["python", "-m", "hyperschedule.scrapers.claremont"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        output, _ = process.communicate(input=json.dumps(old_data).encode(),
                                        timeout=scraper_timeout)
        if process.returncode != 0:
            raise ScrapeError("scraper failed")
        try:
            output = output.decode()
        except UnicodeDecodeError as e:
            raise ScrapeError(
                "scraper emitted malformed output: {}".format(e)) from None
        # "$delete" is presumably a reserved marker in the diff
        # protocol downstream -- refuse output containing it.
        if "$delete" in output:
            raise ScrapeError("scraper output contains '$delete'")
        data = json.loads(output)
        # Optional side effects, each gated by its own env flag. Note
        # that a webhook failure (RequestException, caught below) will
        # skip the cache/S3 writes but still return `data`.
        if util.get_env_boolean("snitch"):
            webhook.get()
        if util.get_env_boolean("cache"):
            cache_file_write(data)
        if util.get_env_boolean("s3_write"):
            s3_write(s3, data)
    except OSError as e:
        # e.g. the python executable could not be spawned.
        raise ScrapeError(
            "unexpected error while running scraper: {}".format(e)) from None
    except subprocess.TimeoutExpired:
        # Kill the runaway scraper and reap it before reporting.
        process.kill()
        process.communicate()
        raise ScrapeError("scraper timed out after {} seconds".format(
            scraper_timeout)) from None
    except json.decoder.JSONDecodeError:
        raise ScrapeError("scraper did not return valid JSON") from None
    except requests.exceptions.RequestException as e:
        # Webhook failure is non-fatal: `data` is already bound by the
        # time webhook.get() can raise, so the return below is safe.
        util.warn("failed to reach success webhook: {}".format(e))
    return data
def parse_table_row(row_idx, row):
    """
    Given a Selenium table row and the index, return a dictionary
    representing the raw course data for that row. Raise ScrapeError
    if the HTML does not have the desired data.
    """
    cells = row.find_all("td")
    try:
        # Exactly twelve <td> cells are expected; underscore-prefixed
        # ones are ignored.
        (
            _tb,
            _add,
            course_code,
            name,
            _req,
            _note,
            seats,
            status,
            faculty_and_schedule,
            num_credits,
            begin,
            end,
        ) = cells
    except ValueError:
        raise ScrapeError("could not extract course list table row elements "
                          f"from Portal HTML (for row {row_idx})")
    # Each <li> is either "Faculty / Meeting" or just a meeting time
    # with no faculty attached.
    all_faculty = []
    schedule = []
    for item in faculty_and_schedule.find_all("li"):
        text = item.text
        parts = text.split(" / ")
        if len(parts) == 2:
            # This list gets uniquified later.
            all_faculty.append(parts[0])
            schedule.append(parts[1])
        else:
            # No single "/" separator, assumed to mean only schedule
            # (no faculty).
            schedule.append(text)
    return {
        "course_code": course_code.text,
        "course_name": name.text,
        "faculty": all_faculty,
        "seats": seats.text,
        "status": status.text,
        "schedule": schedule,
        "credits": num_credits.text,
        "begin_date": begin.text,
        "end_date": end.text,
    }
def parse_term_code(term):
    """
    Given a term code (e.g. "FA 2018"), return a dictionary with keys:

    * year (integer)
    * fall (boolean)
    * spring (boolean)

    Raise ScrapeError if the term code cannot be parsed.
    """
    match = re.match(r"(FA|SP)\s*(20[0-9]{2})", term)
    if match is None:
        raise ScrapeError("malformed term code: {}".format(repr(term)))
    season, year_str = match.groups()
    return {
        "year": int(year_str),
        "fall": season == "FA",
        "spring": season == "SP",
    }
def get_courses(desc_index):
    """
    Return a tuple containing the list of course objects and the
    current term. Takes `desc_index` as returned by
    `lingk.get_course_descriptions`.
    """
    browser = get_browser()
    html, term = get_portal_html(browser)
    # Save on memory.
    scraper.kill_google_chrome()
    # Count how many courses we add descriptions to, so we can fail if
    # there aren't enough.
    num_descs_added = 0
    # Count how many courses we fail to parse, so we can fail if there
    # are too many.
    num_failed = 0
    # Get the first round of raw courses from Portal.
    raw_courses_1 = parse_portal_html(html)
    # Add course descriptions to them, using the raw course codes.
    # Also collect the course codes into a dictionary so that we can
    # deduplicate them.
    raw_courses_2 = []
    course_info_map = collections.defaultdict(list)
    for raw_course in raw_courses_1:
        try:
            course_code = raw_course["course_code"].strip()
            course_info = shared.parse_course_code(course_code,
                                                   with_section=True)
            # Description lookup ignores the section number.
            desc_key = tuple(
                shared.course_info_as_list(course_info, with_section=False))
            desc = desc_index.get(desc_key)
            if desc:
                num_descs_added += 1
                raw_course["course_description"] = desc
            # frozendict makes the parsed course info hashable so it
            # can key the dedup map.
            course_info_map[frozendict.frozendict(course_info)].append(
                raw_course)
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
            continue
        raw_courses_2.append(raw_course)
    # NOTE(review): raw_courses_2 is populated but never read below --
    # deduplication iterates course_info_map instead. Confirm before
    # removing.
    if num_descs_added < 100:
        raise ScrapeError(
            f"not enough course descriptions added: {num_descs_added}")
    # Deduplicate course codes.
    raw_courses_3 = []
    for course_info, courses in course_info_map.items():
        if len(courses) > 1:
            # A duplicate that already carries a suffix cannot be
            # disambiguated further; drop all copies.
            if course_info["course_code_suffix"]:
                util.log_verbose(
                    f"Duplicate course with suffix ({len(courses)} copies): "
                    f"{format_raw_course(courses[0])!r}")
                num_failed += len(courses)
                continue
            # Only 26 distinct A-Z suffixes are available.
            if len(courses) > len(string.ascii_uppercase):
                util.log_verbose(
                    f"Duplicate course with too many copies ({len(courses)}): "
                    f"{format_raw_course(courses[0])!r}")
                num_failed += len(courses)
                continue
            # Disambiguate duplicates by assigning suffixes A, B, ...
            for course, letter in zip(courses, string.ascii_uppercase):
                course["course_code_suffix"] = letter
        raw_courses_3.extend(courses)
    raw_courses = raw_courses_3
    courses = []
    for raw_course in raw_courses:
        try:
            courses.append(process_course(raw_course, term))
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
    # Bail out rather than serve data that is mostly holes.
    if num_failed >= 10:
        raise ScrapeError(f"Too many malformed courses: {num_failed}")
    num_succeeded = len(raw_courses) - num_failed
    if num_succeeded < 500:
        raise ScrapeError(f"Not enough courses: {num_succeeded}")
    util.log_verbose(
        f"Added descriptions to {num_descs_added} out of {num_succeeded} courses"
    )
    return courses, term
def process_course(raw_course, term):
    """
    Turn a raw course object into something that the frontend can
    use. Return a dictionary. If the raw course object has invalid
    data, raise ScrapeError.
    """
    course_code = raw_course["course_code"].strip()
    course_info = shared.parse_course_code(course_code, with_section=True)
    course_code = shared.course_info_as_string(course_info)
    sort_key = shared.course_info_as_list(course_info, with_section=True)
    mutual_exclusion_key = shared.course_info_as_list(course_info,
                                                      with_section=False)
    course_name = raw_course["course_name"].strip()
    if not course_name:
        raise ScrapeError("empty string for course name")
    faculty = sorted(set(f.strip() for f in raw_course["faculty"]))
    if not faculty:
        raise ScrapeError("no faculty")
    for faculty_name in faculty:
        if not faculty_name:
            raise ScrapeError("faculty with empty name")
    try:
        # careful: "∕" (`chr(8725)`) != "/" (`chr(47)`)
        filled_seats, total_seats = map(int, raw_course["seats"].split("∕"))
    except ValueError as err:
        raise ScrapeError(
            f"malformed seat count: {raw_course['seats']!r} ({err})")
    if filled_seats < 0:
        raise ScrapeError(f"negative filled seat count: {filled_seats}")
    if total_seats < 0:
        raise ScrapeError(f"negative total seat count: {total_seats}")
    course_status = raw_course["status"].lower().strip()
    if course_status not in ("open", "closed", "reopened"):
        raise ScrapeError(f"unknown course status: {course_status!r}")
    begin_date = dateutil.parser.parse(raw_course["begin_date"]).date()
    end_date = dateutil.parser.parse(raw_course["end_date"]).date()
    # First half-semester courses start (spring) January 1 through
    # January 31 or (fall) July 15 through September 15. (For some
    # reason, MATH 30B in Fall 2017 is listed as starting August 8.)
    first_half = (
        datetime.date(begin_date.year, 1, 1) < begin_date <
        datetime.date(begin_date.year, 1, 31)
        or datetime.date(begin_date.year, 7, 15) < begin_date <
        datetime.date(begin_date.year, 9, 15))
    # Second half-semester courses for the spring end May 1 through
    # May 31, but there's also frosh chem pt.II which just *has* to be
    # different by ending 2/3 of the way through the semester. So we
    # also count that by allowing April 1 through April 30. Sigh. Fall
    # courses end December 1 through December 31.
    second_half = (
        datetime.date(end_date.year, 4, 1) < end_date <
        datetime.date(end_date.year, 5, 31)
        or datetime.date(end_date.year, 12, 1) < end_date <
        datetime.date(end_date.year, 12, 31))
    if first_half and second_half:
        term_count = 1
        terms = [0]
    elif first_half and not second_half:
        term_count = 2
        terms = [0]
    elif second_half and not first_half:
        term_count = 2
        terms = [1]
    else:
        # BUG FIX: this previously read `begin.date.strftime('%F')`,
        # a NameError (the variable is `begin_date`), and used the
        # non-portable "%F" shorthand. Spell out the format, matching
        # the sibling implementation.
        raise ScrapeError(
            f"weird date range "
            f"{begin_date.strftime('%Y-%m-%d')}-"
            f"{end_date.strftime('%Y-%m-%d')}")
    schedule = []
    for slot in raw_course["schedule"]:
        # Placeholder slots ("To Be Arranged" with a zero time range,
        # \xa0 is a non-breaking space) carry no information.
        if re.match(r"To Be Arranged\xa00?0:00 ?- ?0?0:00 ?AM", slot):
            continue
        match = re.match(SCHEDULE_REGEX, slot)
        if not match:
            raise ScrapeError(f"malformed schedule slot: {slot!r}")
        days, start, end, location = match.groups()
        for day in days:
            if day not in DAYS_OF_WEEK:
                raise ScrapeError(
                    f"unknown day of week {day!r} in schedule slot {slot!r}")
        days = "".join(sorted(set(days), key=DAYS_OF_WEEK.index))
        if not days:
            raise ScrapeError(f"no days in schedule slot {slot!r}")
        # A start time with no AM/PM inherits the meridiem of the end
        # time (e.g. "1:15 - 2:30 PM").
        if not (start.endswith("AM") or start.endswith("PM")):
            start += end[-2:]
        try:
            start = dateutil.parser.parse(start).time()
        except ValueError:
            raise ScrapeError(
                f"malformed start time {start!r} in schedule slot {slot!r}")
        try:
            end = dateutil.parser.parse(end).time()
        except ValueError:
            raise ScrapeError(
                f"malformed end time {end!r} in schedule slot {slot!r}")
        location = " ".join(location.strip().split())
        if not location:
            raise ScrapeError("empty string for location")
        # Start using camelCase here because we are constructing
        # objects that will be returned from the API as JSON -- no
        # longer just intermediate objects private to this module.
        schedule.append({
            "scheduleDays": days,
            "scheduleStartTime": start.strftime("%H:%M"),
            "scheduleEndTime": end.strftime("%H:%M"),
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": location,
        })
    # Courses with no real meeting times still get a placeholder slot
    # so the frontend always has schedule data to display.
    if not schedule:
        schedule.append({
            "scheduleDays": "",
            "scheduleStartTime": "00:00",
            "scheduleEndTime": "00:00",
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": "N/A",
        })
    schedule = unique_preserve_order(schedule)
    num_credits = raw_course["credits"]
    try:
        num_credits = float(num_credits)
    except ValueError:
        raise ScrapeError(f"malformed credit count: {num_credits!r}")
    if num_credits < 0.0:
        raise ScrapeError(f"negative credit count: {raw_course['credits']}")
    # Normalize credits to the HMC scale; presumably these special
    # cases encode known data-entry quirks in Portal -- TODO confirm.
    if "Colloquium" in course_name and num_credits == 0.0:
        num_credits = 0.5
    elif re.match("PE ", course_code) and num_credits == 0.0:
        num_credits = 1.0
    elif num_credits == 0.25:
        num_credits = 1.0
    elif not re.search(r"HM-", course_code):
        num_credits *= 3.0
        if num_credits == 9.0:
            num_credits = 3.0
    num_credits = str(num_credits)
    course_description = raw_course["course_description"]
    # just urls for now - we could add ratings or would take again
    # percentages later
    urls = []
    for prof in faculty:
        a = RateMyProfAPI(teacher=prof)
        # scrape the info from RateMyProfessors site
        a.fetch_info()
        urls.append(a.get_url())
    return {
        "courseCode": course_code,
        "courseName": course_name,
        "courseSortKey": sort_key,
        "courseMutualExclusionKey": mutual_exclusion_key,
        "courseDescription": course_description,
        "courseInstructors": faculty,
        "courseInstructorRMPpages": urls,
        "courseTerm": term,
        "courseSchedule": schedule,
        "courseCredits": num_credits,
        "courseSeatsTotal": total_seats,
        "courseSeatsFilled": filled_seats,
        "courseWaitlistLength": None,
        "courseEnrollmentStatus": course_status,
    }
def parse_course_code(course_code, with_section):
    """
    Given a course code in the format used by Portal and Lingk, with
    or without a section number ("PHIL 179A HM-01" or just "PHIL179A
    HM") as controlled by `with_section`, parse it and return a
    dictionary with keys:

    - department (string)
    - course_number (integer)
    - course_code_suffix (string)
    - school (string)
    - section (integer, or null if with_section is false)

    The given course code may also be in the format returned by
    `course_info_as_string`. Throw ScrapeError if parsing fails.
    """
    m = re.match(COURSE_REGEX, course_code)
    if m is None:
        raise ScrapeError("malformed course code: {}".format(
            repr(course_code)))
    dept, number_str, suffix, school, sect = m.groups()
    # Validate each captured piece in turn; slashes are forbidden
    # because these values end up in URL paths.
    if not dept:
        raise ScrapeError("empty string for department")
    if "/" in dept:
        raise ScrapeError("department contains slashes: {}".format(
            repr(dept)))
    try:
        number = int(number_str)
    except ValueError:
        raise ScrapeError("malformed course number: {}".format(
            repr(number_str)))
    if number <= 0:
        raise ScrapeError("non-positive course number: {}".format(number))
    if "/" in suffix:
        raise ScrapeError("course code suffix contains slashes: {}".format(
            repr(suffix)))
    if not school:
        raise ScrapeError("empty string for school")
    if "/" in school:
        raise ScrapeError("school contains slashes: {}".format(repr(school)))
    # The section must be present exactly when the caller asked for it.
    if bool(sect) != bool(with_section):
        if with_section:
            raise ScrapeError("section missing")
        raise ScrapeError("section unexpectedly present: {}".format(
            repr(sect)))
    if sect:
        try:
            sect = int(sect)
        except ValueError:
            raise ScrapeError("malformed section number: {}".format(
                repr(sect)))
        if sect <= 0:
            raise ScrapeError("non-positive section number: {}".format(sect))
    # If sect is None, just leave it as is.
    return {
        "department": dept,
        "courseNumber": number,
        "courseCodeSuffix": suffix,
        "school": school,
        "section": sect,
    }
def process_course(raw_course, term):
    """
    Turn a raw course object into something that the frontend can
    use. Return a dictionary. If the raw course object has invalid
    data, raise ScrapeError.
    """
    course_code = raw_course["course_code"].strip()
    course_info = shared.parse_course_code(course_code, with_section=True)
    course_code = shared.course_info_as_string(course_info)
    sort_key = shared.course_info_as_list(course_info, with_section=True)
    mutual_exclusion_key = shared.course_info_as_list(course_info,
                                                      with_section=False)
    course_name = raw_course["course_name"].strip()
    if not course_name:
        raise ScrapeError("empty string for course name")
    # Here "faculty" is a newline-separated string; split, trim, and
    # uniquify it.
    faculty = sorted(set(re.split(r"\s*\n\s*", raw_course["faculty"].strip())))
    if not faculty:
        raise ScrapeError("no faculty")
    for faculty_name in faculty:
        if not faculty_name:
            raise ScrapeError("faculty with empty name")
    # Seats come as "filled/total", e.g. "30/30".
    match = re.match(r"([0-9]+)/([0-9]+)", raw_course["seats"])
    if not match:
        raise ScrapeError("malformed seat count: {}".format(
            repr(raw_course["seats"])))
    filled_seats, total_seats = map(int, match.groups())
    if filled_seats < 0:
        raise ScrapeError(
            "negative filled seat count: {}".format(filled_seats))
    if total_seats < 0:
        raise ScrapeError("negative total seat count: {}".format(total_seats))
    course_status = raw_course["status"].lower()
    if course_status not in ("open", "closed", "reopened"):
        raise ScrapeError("unknown course status: {}".format(
            repr(course_status)))
    begin_date = dateutil.parser.parse(raw_course["begin_date"]).date()
    end_date = dateutil.parser.parse(raw_course["end_date"]).date()
    # First half-semester courses start (spring) January 1 through
    # January 31 or (fall) July 15 through September 15. (For some
    # reason, MATH 30B in Fall 2017 is listed as starting August 8.)
    first_half = (datetime.date(begin_date.year, 1, 1) < begin_date <
                  datetime.date(begin_date.year, 1, 31)
                  or datetime.date(begin_date.year, 7, 15) < begin_date <
                  datetime.date(begin_date.year, 9, 15))
    # Second half-semester courses for the spring end May 1 through
    # May 31, but there's also frosh chem pt.II which just *has* to be
    # different by ending 2/3 of the way through the semester. So we
    # also count that by allowing April 1 through April 30. Sigh. Fall
    # courses end December 1 through December 31.
    second_half = (datetime.date(
        end_date.year, 4, 1) < end_date < datetime.date(end_date.year, 5, 31)
                   or datetime.date(end_date.year, 12, 1) < end_date <
                   datetime.date(end_date.year, 12, 31))
    # A course spanning both halves is a single full-semester term;
    # otherwise it occupies term 0 or 1 of a two-term split.
    if first_half and second_half:
        term_count = 1
        terms = [0]
    elif first_half and not second_half:
        term_count = 2
        terms = [0]
    elif second_half and not first_half:
        term_count = 2
        terms = [1]
    else:
        raise ScrapeError("weird date range {}-{}".format(
            begin_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")))
    schedule = []
    for slot in raw_course["schedule"]:
        # Zero-length placeholder slots carry no information.
        if slot.startswith("0:00 - 0:00 AM"):
            continue
        match = re.match(SCHEDULE_REGEX, slot)
        if not match:
            raise ScrapeError("malformed schedule slot: {}".format(repr(slot)))
        days, start, end, location = match.groups()
        for day in days:
            if day not in DAYS_OF_WEEK:
                raise ScrapeError(
                    "unknown day of week {} in schedule slot {}".format(
                        repr(day), repr(slot)))
        days = "".join(sorted(set(days), key=DAYS_OF_WEEK.index))
        if not days:
            raise ScrapeError("no days in schedule slot {}".format(repr(slot)))
        # A start time lacking AM/PM inherits the meridiem of the end
        # time (e.g. "1:15 - 2:30 PM").
        if not (start.endswith("AM") or start.endswith("PM")):
            start += end[-2:]
        try:
            start = dateutil.parser.parse(start).time()
        except ValueError:
            raise ScrapeError(
                "malformed start time {} in schedule slot {}".format(
                    repr(start), repr(slot)))
        try:
            end = dateutil.parser.parse(end).time()
        except ValueError:
            raise ScrapeError(
                "malformed end time {} in schedule slot {}".format(
                    repr(end), repr(slot)))
        location = " ".join(location.strip().split())
        if not location:
            raise ScrapeError("empty string for location")
        # Start using camelCase here because we are constructing
        # objects that will be returned from the API as JSON -- no
        # longer just intermediate objects private to this module.
        schedule.append({
            "scheduleDays": days,
            "scheduleStartTime": start.strftime("%H:%M"),
            "scheduleEndTime": end.strftime("%H:%M"),
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": location,
        })
    # Courses with no real meeting times still get a placeholder slot
    # so the frontend always has schedule data to display.
    if not schedule:
        schedule.append({
            "scheduleDays": "",
            "scheduleStartTime": "00:00",
            "scheduleEndTime": "00:00",
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": "N/A",
        })
    schedule = unique_preserve_order(schedule)
    num_credits = raw_course["credits"]
    try:
        num_credits = float(num_credits)
    except ValueError:
        raise ScrapeError("malformed credit count: {}".format(
            repr(num_credits)))
    if num_credits < 0.0:
        raise ScrapeError("negative credit count: {}".format(
            raw_course["credits"]))
    # Normalize credits to the HMC scale; presumably these special
    # cases encode known data-entry quirks in Portal -- TODO confirm.
    if "Colloquium" in course_name and num_credits == 0.0:
        num_credits = 0.5
    elif re.match("PE ", course_code) and num_credits == 0.0:
        num_credits = 1.0
    elif num_credits == 0.25:
        num_credits = 1.0
    elif not re.search(r"HM-", course_code):
        num_credits *= 3.0
        if num_credits == 9.0:
            num_credits = 3.0
    num_credits = str(num_credits)
    course_description = raw_course["course_description"]
    return {
        "courseCode": course_code,
        "courseName": course_name,
        "courseSortKey": sort_key,
        "courseMutualExclusionKey": mutual_exclusion_key,
        "courseDescription": course_description,
        "courseInstructors": faculty,
        "courseTerm": term,
        "courseSchedule": schedule,
        "courseCredits": num_credits,
        "courseSeatsTotal": total_seats,
        "courseSeatsFilled": filled_seats,
        "courseWaitlistLength": None,
        "courseEnrollmentStatus": course_status,
    }