def lingk_csv_data_to_course_descriptions(data):
    """
    Given the rows of the Lingk CSV data (an iterable whose first item
    is the header row and whose remaining items are rows of strings),
    return a dictionary mapping tuples of course information
    (`with_section` false; see `shared.course_info_as_list`) to
    whitespace-normalized course descriptions.

    Blank rows, rows whose course code cannot be parsed, and rows with
    an empty description are skipped. Throw ScrapeError if the data is
    empty, if the header lacks the "courseNumber" or "description"
    column, if a non-blank row has a different length than the header,
    or if fewer than 100 descriptions are collected.
    """
    try:
        header, *rows = data
    except ValueError:
        # Unpacking fails only when there is no header row at all.
        raise ScrapeError("Lingk CSV data is empty") from None
    try:
        course_code_idx = header.index("courseNumber")
        desc_idx = header.index("description")
    except ValueError:
        raise ScrapeError(f"unexpected header: {header!r}") from None
    desc_map = {}
    for row in rows:
        # We have some rows that are completely empty and some that
        # are just whitespace.
        if not row or "".join(row).isspace():
            continue
        if len(row) != len(header):
            raise ScrapeError(f"malformed row: {row!r}")
        course_code = row[course_code_idx]
        try:
            course_info = shared.parse_course_code(course_code,
                                                   with_section=False)
        except ScrapeError:
            # Unparseable course codes are deliberately ignored.
            continue
        index_key = tuple(
            shared.course_info_as_list(course_info, with_section=False))
        description = row[desc_idx]
        if not description:
            continue
        # Collapse runs of whitespace (including newlines) to single
        # spaces.
        description = " ".join(description.split())
        # If two conflicting descriptions for the same course code
        # (yep, it happens), pick the one that comes later :/
        desc_map[index_key] = description
    if len(desc_map) < 100:
        raise ScrapeError(
            f"Not enough course descriptions: {len(desc_map)}")
    return desc_map
def course_to_key(course):
    """
    Build the lookup key for a course object: a tuple suitable for
    indexing into the course description dictionary returned by
    `lingk.get_course_descriptions`. The key is derived from the
    course's "courseCode" field, parsed without the section.
    """
    parsed_code = shared.parse_course_code(
        course["courseCode"],
        with_section=False,
    )
    key_parts = shared.course_info_as_list(parsed_code, with_section=False)
    return tuple(key_parts)
def lingk_api_data_to_course_descriptions(data):
    """
    Given the decoded JSON from the Lingk API, return a dictionary
    mapping tuples of course information (`with_section` false; see
    `shared.course_info_as_list`) to course descriptions.

    Entries without a "description" field and the special
    "ABROAD HM" course are skipped. Throw ScrapeError if the data is
    malformed: the top level is not a map, the "data" field is missing
    or not a list, an entry is not a map, a description is not a
    string, an entry with a description lacks "courseNumber", or the
    same course appears twice with different descriptions. A
    ScrapeError raised by `shared.parse_course_code` is not caught
    here.
    """
    if not isinstance(data, dict):
        raise ScrapeError("Lingk JSON is not map")
    if "data" not in data:
        raise ScrapeError("Lingk JSON is missing 'data' field")
    entries = data["data"]
    # Validate the container types up front so that malformed JSON
    # surfaces as ScrapeError rather than a TypeError further down.
    if not isinstance(entries, list):
        raise ScrapeError("Lingk JSON 'data' field is not list")
    desc_index = {}
    for idx, course in enumerate(entries):
        if not isinstance(course, dict):
            raise ScrapeError(f"Lingk JSON at index {idx} is not map")
        if "description" not in course:
            continue
        description = course["description"]
        if not isinstance(description, str):
            raise ScrapeError(f"'description' at index {idx} is not string")
        if "courseNumber" not in course:
            raise ScrapeError(
                f"Lingk JSON at index {idx} is missing 'courseNumber' field")
        course_code = course["courseNumber"]
        # Special case that doesn't show up on Portal.
        if course_code == "ABROAD HM":
            continue
        course_info = shared.parse_course_code(course_code,
                                               with_section=False)
        course_key = tuple(
            shared.course_info_as_list(course_info, with_section=False))
        # Repeated entries are tolerated only when they agree on the
        # description.
        if desc_index.get(course_key, description) != description:
            raise ScrapeError(
                f"Lingk JSON has duplicate course: {course_key!r}")
        desc_index[course_key] = description
    return desc_index
def get_courses(desc_index):
    """
    Return a tuple containing the list of course objects and the
    current term. Takes `desc_index` as returned by
    `lingk.get_course_descriptions`.

    Throw ScrapeError if fewer than 100 courses receive a description,
    if 10 or more courses are dropped (unparseable course code,
    unresolvable duplicate, or rejected by `process_course`), or if
    fewer than 500 courses are processed successfully.
    """
    browser = get_browser()
    html, term = get_portal_html(browser)
    # Save on memory.
    scraper.kill_google_chrome()
    # Count how many courses we add descriptions to, so we can fail if
    # there aren't enough.
    num_descs_added = 0
    # Count how many courses we fail to parse, so we can fail if there
    # are too many.
    num_failed = 0
    # Get the raw courses from Portal and add course descriptions to
    # them, using the raw course codes. Also group them by parsed
    # course info so that we can deduplicate them.
    course_info_map = collections.defaultdict(list)
    for raw_course in parse_portal_html(html):
        try:
            course_code = raw_course["course_code"].strip()
            course_info = shared.parse_course_code(course_code,
                                                   with_section=True)
            desc_key = tuple(
                shared.course_info_as_list(course_info, with_section=False))
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
            continue
        desc = desc_index.get(desc_key)
        if desc:
            num_descs_added += 1
        # Always set the key (possibly to None) so later stages can
        # index it unconditionally.
        raw_course["course_description"] = desc
        course_info_map[frozendict.frozendict(course_info)].append(raw_course)
    if num_descs_added < 100:
        raise ScrapeError(
            f"not enough course descriptions added: {num_descs_added}")
    # Deduplicate course codes.
    raw_courses = []
    for course_info, duplicates in course_info_map.items():
        if len(duplicates) > 1:
            # Courses that already carry a suffix cannot be given
            # another one, so the whole group is dropped.
            if course_info["course_code_suffix"]:
                util.log_verbose(
                    f"Duplicate course with suffix ({len(duplicates)} copies): "
                    f"{format_raw_course(duplicates[0])!r}")
                num_failed += len(duplicates)
                continue
            # Only 26 distinguishing letters are available.
            if len(duplicates) > len(string.ascii_uppercase):
                util.log_verbose(
                    f"Duplicate course with too many copies ({len(duplicates)}): "
                    f"{format_raw_course(duplicates[0])!r}")
                num_failed += len(duplicates)
                continue
            # NOTE(review): the suffix is stored on the raw course
            # dict; confirm that `process_course` (or a helper it
            # calls) actually reads "course_code_suffix".
            for course, letter in zip(duplicates, string.ascii_uppercase):
                course["course_code_suffix"] = letter
        raw_courses.extend(duplicates)
    courses = []
    for raw_course in raw_courses:
        try:
            courses.append(process_course(raw_course, term))
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
    if num_failed >= 10:
        raise ScrapeError(f"Too many malformed courses: {num_failed}")
    # Count the courses that actually made it through. Subtracting
    # `num_failed` from `len(raw_courses)` would subtract the courses
    # dropped in the earlier stages a second time, since those never
    # made it into `raw_courses`.
    num_succeeded = len(courses)
    if num_succeeded < 500:
        raise ScrapeError(f"Not enough courses: {num_succeeded}")
    util.log_verbose(
        f"Added descriptions to {num_descs_added} out of {num_succeeded} courses"
    )
    return courses, term
def process_course(raw_course, term):
    """
    Turn a raw course object into something that the frontend can use.
    Return a dictionary. If the raw course object has invalid data,
    raise ScrapeError.

    `raw_course` is indexed by the keys "course_code", "course_name",
    "faculty" (an iterable of names), "seats", "status", "begin_date",
    "end_date", "schedule" (an iterable of slot strings), "credits" and
    "course_description". `term` is passed through unchanged as
    "courseTerm". One RateMyProfAPI lookup is made per instructor.
    """
    course_code = raw_course["course_code"].strip()
    course_info = shared.parse_course_code(course_code, with_section=True)
    course_code = shared.course_info_as_string(course_info)
    sort_key = shared.course_info_as_list(course_info, with_section=True)
    mutual_exclusion_key = shared.course_info_as_list(course_info,
                                                      with_section=False)
    course_name = raw_course["course_name"].strip()
    if not course_name:
        raise ScrapeError("empty string for course name")
    # Deduplicate and sort the instructor names.
    faculty = sorted({f.strip() for f in raw_course["faculty"]})
    if not faculty:
        raise ScrapeError("no faculty")
    if any(not faculty_name for faculty_name in faculty):
        raise ScrapeError("faculty with empty name")
    try:
        # careful: "∕" (`chr(8725)`) != "/" (`chr(47)`)
        filled_seats, total_seats = map(int, raw_course["seats"].split("∕"))
    except ValueError as err:
        raise ScrapeError(
            f"malformed seat count: {raw_course['seats']!r} ({err})")
    if filled_seats < 0:
        raise ScrapeError(f"negative filled seat count: {filled_seats}")
    if total_seats < 0:
        raise ScrapeError(f"negative total seat count: {total_seats}")
    course_status = raw_course["status"].lower().strip()
    if course_status not in ("open", "closed", "reopened"):
        raise ScrapeError(f"unknown course status: {course_status!r}")
    # Unparseable dates must surface as ScrapeError like every other
    # kind of invalid data, so that callers catching ScrapeError can
    # skip just this course.
    try:
        begin_date = dateutil.parser.parse(raw_course["begin_date"]).date()
        end_date = dateutil.parser.parse(raw_course["end_date"]).date()
    except (ValueError, OverflowError) as err:
        raise ScrapeError(
            f"malformed date range: {raw_course['begin_date']!r}-"
            f"{raw_course['end_date']!r} ({err})")
    # First half-semester courses start (spring) January 1 through
    # January 31 or (fall) July 15 through September 15. (For some
    # reason, MATH 30B in Fall 2017 is listed as starting August 8.)
    # NOTE(review): the comparisons are strict, so the boundary dates
    # themselves are excluded even though the comment reads as
    # inclusive -- confirm which is intended.
    first_half = (datetime.date(begin_date.year, 1, 1) < begin_date <
                  datetime.date(begin_date.year, 1, 31)
                  or datetime.date(begin_date.year, 7, 15) < begin_date <
                  datetime.date(begin_date.year, 9, 15))
    # Second half-semester courses for the spring end May 1 through
    # May 31, but there's also frosh chem pt.II which just *has* to be
    # different by ending 2/3 of the way through the semester. So we
    # also count that by allowing April 1 through April 30. Sigh. Fall
    # courses end December 1 through December 31.
    second_half = (datetime.date(end_date.year, 4, 1) < end_date <
                   datetime.date(end_date.year, 5, 31)
                   or datetime.date(end_date.year, 12, 1) < end_date <
                   datetime.date(end_date.year, 12, 31))
    if first_half and second_half:
        # Full-semester course.
        term_count = 1
        terms = [0]
    elif first_half:
        term_count = 2
        terms = [0]
    elif second_half:
        term_count = 2
        terms = [1]
    else:
        # Use the portable "%Y-%m-%d" spelling; "%F" is not available
        # on every platform.
        raise ScrapeError(
            f"weird date range "
            f"{begin_date.strftime('%Y-%m-%d')}-{end_date.strftime('%Y-%m-%d')}"
        )
    schedule = []
    for slot in raw_course["schedule"]:
        # Placeholder slots for courses without a meeting time.
        if re.match(r"To Be Arranged\xa00?0:00 ?- ?0?0:00 ?AM", slot):
            continue
        match = re.match(SCHEDULE_REGEX, slot)
        if not match:
            raise ScrapeError(f"malformed schedule slot: {slot!r}")
        days, start, end, location = match.groups()
        for day in days:
            if day not in DAYS_OF_WEEK:
                raise ScrapeError(
                    f"unknown day of week {day!r} in schedule slot {slot!r}")
        # Deduplicate the days and put them in DAYS_OF_WEEK order.
        days = "".join(sorted(set(days), key=DAYS_OF_WEEK.index))
        if not days:
            raise ScrapeError(f"no days in schedule slot {slot!r}")
        # A start time without AM/PM borrows the marker from the end
        # time.
        if not start.endswith(("AM", "PM")):
            start += end[-2:]
        try:
            start = dateutil.parser.parse(start).time()
        except ValueError:
            raise ScrapeError(
                f"malformed start time {start!r} in schedule slot {slot!r}")
        try:
            end = dateutil.parser.parse(end).time()
        except ValueError:
            raise ScrapeError(
                f"malformed end time {end!r} in schedule slot {slot!r}")
        location = " ".join(location.strip().split())
        if not location:
            raise ScrapeError("empty string for location")
        # Start using camelCase here because we are constructing
        # objects that will be returned from the API as JSON -- no
        # longer just intermediate objects private to this module.
        schedule.append({
            "scheduleDays": days,
            "scheduleStartTime": start.strftime("%H:%M"),
            "scheduleEndTime": end.strftime("%H:%M"),
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": location,
        })
    if not schedule:
        # Courses with no usable slot get a single placeholder entry.
        schedule.append({
            "scheduleDays": "",
            "scheduleStartTime": "00:00",
            "scheduleEndTime": "00:00",
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": "N/A",
        })
    schedule = unique_preserve_order(schedule)
    num_credits = raw_course["credits"]
    try:
        num_credits = float(num_credits)
    except ValueError:
        raise ScrapeError(f"malformed credit count: {num_credits!r}")
    if num_credits < 0.0:
        raise ScrapeError(f"negative credit count: {raw_course['credits']}")
    if "Colloquium" in course_name and num_credits == 0.0:
        num_credits = 0.5
    elif re.match("PE ", course_code) and num_credits == 0.0:
        num_credits = 1.0
    elif num_credits == 0.25:
        num_credits = 1.0
    elif not re.search(r"HM-", course_code):
        num_credits *= 3.0
    # NOTE(review): the nesting of this check was reconstructed from
    # whitespace-collapsed source -- confirm it is meant to apply to
    # every course rather than only to the non-HM branch above.
    if num_credits == 9.0:
        num_credits = 3.0
    num_credits = str(num_credits)
    course_description = raw_course["course_description"]
    # just urls for now - we could add ratings or would take again
    # percentages later
    urls = []
    for prof in faculty:
        rmp = RateMyProfAPI(teacher=prof)
        # scrape the info from RateMyProfessors site
        rmp.fetch_info()
        urls.append(rmp.get_url())
    return {
        "courseCode": course_code,
        "courseName": course_name,
        "courseSortKey": sort_key,
        "courseMutualExclusionKey": mutual_exclusion_key,
        "courseDescription": course_description,
        "courseInstructors": faculty,
        "courseInstructorRMPpages": urls,
        "courseTerm": term,
        "courseSchedule": schedule,
        "courseCredits": num_credits,
        "courseSeatsTotal": total_seats,
        "courseSeatsFilled": filled_seats,
        "courseWaitlistLength": None,
        "courseEnrollmentStatus": course_status,
    }
def process_course(raw_course, term):
    """
    Turn a raw course object into something that the frontend can use.
    Return a dictionary. If the raw course object has invalid data,
    raise ScrapeError.

    `raw_course` is indexed by the keys "course_code", "course_name",
    "faculty" (a newline-separated string of names), "seats", "status",
    "begin_date", "end_date", "schedule" (an iterable of slot strings),
    "credits" and "course_description". `term` is passed through
    unchanged as "courseTerm".
    """
    course_code = raw_course["course_code"].strip()
    course_info = shared.parse_course_code(course_code, with_section=True)
    course_code = shared.course_info_as_string(course_info)
    sort_key = shared.course_info_as_list(course_info, with_section=True)
    mutual_exclusion_key = shared.course_info_as_list(course_info,
                                                      with_section=False)
    course_name = raw_course["course_name"].strip()
    if not course_name:
        raise ScrapeError("empty string for course name")
    # One instructor per line; deduplicate and sort the names.
    faculty = sorted(set(re.split(r"\s*\n\s*", raw_course["faculty"].strip())))
    if not faculty:
        raise ScrapeError("no faculty")
    if any(not faculty_name for faculty_name in faculty):
        raise ScrapeError("faculty with empty name")
    match = re.match(r"([0-9]+)/([0-9]+)", raw_course["seats"])
    if not match:
        raise ScrapeError(f"malformed seat count: {raw_course['seats']!r}")
    filled_seats, total_seats = map(int, match.groups())
    if filled_seats < 0:
        raise ScrapeError(f"negative filled seat count: {filled_seats}")
    if total_seats < 0:
        raise ScrapeError(f"negative total seat count: {total_seats}")
    course_status = raw_course["status"].lower()
    if course_status not in ("open", "closed", "reopened"):
        raise ScrapeError(f"unknown course status: {course_status!r}")
    # Unparseable dates must surface as ScrapeError like every other
    # kind of invalid data, so that callers catching ScrapeError can
    # skip just this course.
    try:
        begin_date = dateutil.parser.parse(raw_course["begin_date"]).date()
        end_date = dateutil.parser.parse(raw_course["end_date"]).date()
    except (ValueError, OverflowError) as err:
        raise ScrapeError(
            f"malformed date range: {raw_course['begin_date']!r}-"
            f"{raw_course['end_date']!r} ({err})")
    # First half-semester courses start (spring) January 1 through
    # January 31 or (fall) July 15 through September 15. (For some
    # reason, MATH 30B in Fall 2017 is listed as starting August 8.)
    # NOTE(review): the comparisons are strict, so the boundary dates
    # themselves are excluded even though the comment reads as
    # inclusive -- confirm which is intended.
    first_half = (datetime.date(begin_date.year, 1, 1) < begin_date <
                  datetime.date(begin_date.year, 1, 31)
                  or datetime.date(begin_date.year, 7, 15) < begin_date <
                  datetime.date(begin_date.year, 9, 15))
    # Second half-semester courses for the spring end May 1 through
    # May 31, but there's also frosh chem pt.II which just *has* to be
    # different by ending 2/3 of the way through the semester. So we
    # also count that by allowing April 1 through April 30. Sigh. Fall
    # courses end December 1 through December 31.
    second_half = (datetime.date(end_date.year, 4, 1) < end_date <
                   datetime.date(end_date.year, 5, 31)
                   or datetime.date(end_date.year, 12, 1) < end_date <
                   datetime.date(end_date.year, 12, 31))
    if first_half and second_half:
        # Full-semester course.
        term_count = 1
        terms = [0]
    elif first_half:
        term_count = 2
        terms = [0]
    elif second_half:
        term_count = 2
        terms = [1]
    else:
        raise ScrapeError(
            f"weird date range "
            f"{begin_date.strftime('%Y-%m-%d')}-{end_date.strftime('%Y-%m-%d')}"
        )
    schedule = []
    for slot in raw_course["schedule"]:
        # Placeholder slots for courses without a meeting time.
        if slot.startswith("0:00 - 0:00 AM"):
            continue
        match = re.match(SCHEDULE_REGEX, slot)
        if not match:
            raise ScrapeError(f"malformed schedule slot: {slot!r}")
        days, start, end, location = match.groups()
        for day in days:
            if day not in DAYS_OF_WEEK:
                raise ScrapeError(
                    f"unknown day of week {day!r} in schedule slot {slot!r}")
        # Deduplicate the days and put them in DAYS_OF_WEEK order.
        days = "".join(sorted(set(days), key=DAYS_OF_WEEK.index))
        if not days:
            raise ScrapeError(f"no days in schedule slot {slot!r}")
        # A start time without AM/PM borrows the marker from the end
        # time.
        if not start.endswith(("AM", "PM")):
            start += end[-2:]
        try:
            start = dateutil.parser.parse(start).time()
        except ValueError:
            raise ScrapeError(
                f"malformed start time {start!r} in schedule slot {slot!r}")
        try:
            end = dateutil.parser.parse(end).time()
        except ValueError:
            raise ScrapeError(
                f"malformed end time {end!r} in schedule slot {slot!r}")
        location = " ".join(location.strip().split())
        if not location:
            raise ScrapeError("empty string for location")
        # Start using camelCase here because we are constructing
        # objects that will be returned from the API as JSON -- no
        # longer just intermediate objects private to this module.
        schedule.append({
            "scheduleDays": days,
            "scheduleStartTime": start.strftime("%H:%M"),
            "scheduleEndTime": end.strftime("%H:%M"),
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": location,
        })
    if not schedule:
        # Courses with no usable slot get a single placeholder entry.
        schedule.append({
            "scheduleDays": "",
            "scheduleStartTime": "00:00",
            "scheduleEndTime": "00:00",
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": "N/A",
        })
    schedule = unique_preserve_order(schedule)
    num_credits = raw_course["credits"]
    try:
        num_credits = float(num_credits)
    except ValueError:
        raise ScrapeError(f"malformed credit count: {num_credits!r}")
    if num_credits < 0.0:
        raise ScrapeError(f"negative credit count: {raw_course['credits']}")
    if "Colloquium" in course_name and num_credits == 0.0:
        num_credits = 0.5
    elif re.match("PE ", course_code) and num_credits == 0.0:
        num_credits = 1.0
    elif num_credits == 0.25:
        num_credits = 1.0
    elif not re.search(r"HM-", course_code):
        num_credits *= 3.0
    # NOTE(review): the nesting of this check was reconstructed from
    # whitespace-collapsed source -- confirm it is meant to apply to
    # every course rather than only to the non-HM branch above.
    if num_credits == 9.0:
        num_credits = 3.0
    num_credits = str(num_credits)
    course_description = raw_course["course_description"]
    return {
        "courseCode": course_code,
        "courseName": course_name,
        "courseSortKey": sort_key,
        "courseMutualExclusionKey": mutual_exclusion_key,
        "courseDescription": course_description,
        "courseInstructors": faculty,
        "courseTerm": term,
        "courseSchedule": schedule,
        "courseCredits": num_credits,
        "courseSeatsTotal": total_seats,
        "courseSeatsFilled": filled_seats,
        "courseWaitlistLength": None,
        "courseEnrollmentStatus": course_status,
    }