def parse_portal_html(html):
    """
    Parse the Portal search results page.

    Given the search results HTML as a string, return a list of raw
    course data dictionaries, one per visible table row. Raise
    ScrapeError if the expected table structure is not present.
    """
    soup = bs4.BeautifulSoup(html, "lxml")
    course_table = soup.find(id="tableCourses")
    if not course_table:
        raise ScrapeError("could not find course list table in Portal HTML")
    body = course_table.find("tbody")
    if not body:
        raise ScrapeError(
            "could not find course list table body in Portal HTML")
    rows = body.find_all("tr", recursive=False)
    if not rows:
        raise ScrapeError(
            "could not extract course list table rows from Portal HTML")
    # Rows styled "display:none;" are hidden placeholders; skip them
    # but keep the original row indices for error reporting.
    return [
        parse_table_row(idx, tr)
        for idx, tr in enumerate(rows)
        if tr.attrs.get("style") != "display:none;"
    ]
def lingk_csv_data_to_course_descriptions(data):
    """
    Given the parsed Lingk CSV (a list of rows, the first row being
    the header), return a dictionary mapping tuples of course
    information (`with_section` false; see
    `shared.course_info_as_list`) to course descriptions.

    Raise ScrapeError if the CSV is empty or malformed, or if it
    yields suspiciously few descriptions.
    """
    # Guard the destructuring below: an empty export would otherwise
    # surface as a bare ValueError instead of a ScrapeError.
    if not data:
        raise ScrapeError("Lingk CSV is empty")
    header, *rows = data
    try:
        course_code_idx = header.index("courseNumber")
        desc_idx = header.index("description")
    except ValueError:
        raise ScrapeError(f"unexpected header: {header!r}") from None
    desc_map = {}
    for row in rows:
        # We have some rows that are completely empty and some that
        # are just whitespace.
        if not row or "".join(row).isspace():
            continue
        if len(row) != len(header):
            raise ScrapeError(f"malformed row: {row!r}")
        course_code = row[course_code_idx]
        try:
            course_info = shared.parse_course_code(course_code,
                                                   with_section=False)
        except ScrapeError:
            # Unparseable course codes are skipped, not fatal.
            continue
        index_key = tuple(
            shared.course_info_as_list(course_info, with_section=False))
        description = row[desc_idx]
        if not description:
            continue
        # Collapse runs of whitespace to single spaces.
        description = " ".join(description.split())
        # If two conflicting descriptions for the same course code
        # (yep, it happens), pick the one that comes later :/
        desc_map[index_key] = description
    if len(desc_map) < 100:
        # Sanity check: a healthy export has hundreds of entries.
        # (Previously this raise carried a meaningless "from None" --
        # there is no exception being chained here.)
        raise ScrapeError(f"Not enough course descriptions: {len(desc_map)}")
    return desc_map
def parse_table_row(row_idx, row):
    """
    Given a Selenium table row and the index, return a dictionary
    representing the raw course data for that row. Raise ScrapeError
    if the HTML does not have the desired data.
    """
    cells = row.find_all("td")
    try:
        # The row is expected to contain exactly ten <td> cells.
        (_add, course_code, name, faculty, seats, status, schedule,
         num_credits, begin, end) = cells
    except ValueError:
        raise ScrapeError("could not extract course list table row elements "
                          "from Portal HTML (for row {})".format(row_idx))
    meeting_times = [item.text for item in schedule.find_all("li")]
    return {
        "course_code": course_code.text,
        "course_name": name.text,
        "faculty": faculty.text,
        "seats": seats.text,
        "status": status.text,
        "schedule": meeting_times,
        "credits": num_credits.text,
        "begin_date": begin.text,
        "end_date": end.text,
    }
def get_course_descriptions():
    """
    Return a dictionary mapping course codes (as can be used on the
    frontend) to course descriptions.

    When the "lingk" env flag is set, scrape the Lingk API using the
    HYPERSCHEDULE_LINGK_KEY/SECRET environment variables (returning an
    empty dict if they are unset); otherwise fall back to the Lingk
    CSV. Throw ScrapeError if the source is unavailable or returns
    bad data.
    """
    if not util.get_env_boolean("lingk"):
        util.log_verbose("Scraping Lingk CSV")
        desc_index = lingk_csv_data_to_course_descriptions(
            get_lingk_csv_data())
    else:
        key = os.environ.get("HYPERSCHEDULE_LINGK_KEY")
        secret = os.environ.get("HYPERSCHEDULE_LINGK_SECRET")
        if not key or not secret:
            util.log("Skipping Lingk as key and secret are not set")
            return {}
        util.log_verbose("Scraping Lingk API")
        desc_index = lingk_api_data_to_course_descriptions(
            get_lingk_api_data(key, secret))
    # Sanity check: either source should produce hundreds of entries.
    if len(desc_index) < 100:
        raise ScrapeError("Not enough course descriptions: {}".format(
            len(desc_index)))
    return desc_index
def get_lingk_api_data(key, secret):
    """
    Return the decoded JSON response from the Lingk API, using the
    given key and secret for authentication. Throw ScrapeError if the
    API is not available or returns bad data.
    """
    # For some bizarre reason the Lingk API sometimes returns 401
    # Unauthorized even when you are authenticated correctly. Asking
    # again a few times fixes the issue. I don't even want to know
    # why.
    last_error = None
    fails = 0
    for _ in range(LINGK_RETRY_COUNT):
        now = datetime.datetime.utcnow()
        # RFC-1123-style date, also fed into the auth signature.
        date = now.strftime("%a, %d %b %Y %H:%M:%S UTC")
        response = requests.get(LINGK_URL,
                                headers={
                                    "Date": date,
                                    "Authorization": get_auth_header(
                                        key, secret, date)
                                })
        try:
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            fails += 1
            util.log_verbose(
                "Got auth error from Lingk API ({} of {} allowed)".format(
                    fails, LINGK_RETRY_COUNT))
            time.sleep(1)
            last_error = e
            continue
        # BUG FIX: json.decoder.JSONDecodeError subclasses ValueError,
        # so it must be caught *before* ValueError; previously the
        # ValueError handler shadowed it and malformed JSON was
        # misreported as "no data".
        except json.decoder.JSONDecodeError:
            raise ScrapeError("Lingk API did not return valid JSON")
        except ValueError:
            raise ScrapeError("Lingk API returned no data")
    raise ScrapeError(
        "Lingk API returned error response: {}".format(last_error))
def get_portal_html(browser):
    """
    Given a Selenium browser object, perform a webscrape of Portal.
    Return a tuple (html, term) with the HTML of the course search
    results page as a string and the current term (for which courses
    were retrieved) also as a string. Raise ScrapeError if something
    goes wrong with the browser or Portal.
    """
    util.log_verbose(f"Current scraper IP is {get_ip()}")
    util.log_verbose("Scraping Portal")
    browser.set_window_size(1920, 1200)
    url = ("https://portal.hmc.edu/ICS/Portal_Homepage.jnz?"
           "portlet=Course_Schedules&screen=Advanced+Course+Search"
           "&screenType=next")
    browser.get(url)
    term_dropdown = selenium.webdriver.support.ui.Select(
        browser.find_element_by_id("pg0_V_ddlTerm"))
    term_names = [option.text for option in term_dropdown.options]
    # Parse each dropdown entry into a sortable (year, is_fall, name)
    # tuple; max() then picks the most recent term, with fall beating
    # spring within a year.
    parsed_terms = []
    for name in term_names:
        m = re.match(r"\s*(FA|SP)\s*([0-9]{4})\s*", name)
        if m is None:
            continue
        season, year_str = m.groups()
        parsed_terms.append((int(year_str), season == "FA", name))
    if not parsed_terms:
        raise ScrapeError(
            f"couldn't parse any term names (from: {term_names!r})")
    term = max(parsed_terms)[2]
    term_dropdown.select_by_visible_text(term)
    # Searching for "?" in the title matches every course.
    title_input = browser.find_element_by_id("pg0_V_txtTitleRestrictor")
    title_input.clear()
    title_input.send_keys("?")
    browser.find_element_by_id("pg0_V_btnSearch").click()
    browser.find_element_by_id("pg0_V_lnkShowAll").click()
    return browser.page_source, " ".join(term.split())
def lingk_api_data_to_course_descriptions(data):
    """
    Given the decoded JSON from the Lingk API, return a dictionary
    mapping tuples of course information (`with_section` false; see
    `shared.course_info_as_list`) to course descriptions.

    Throw ScrapeError if the data is malformed.
    """
    if not isinstance(data, dict):
        raise ScrapeError("Lingk JSON is not map")
    if "data" not in data:
        raise ScrapeError("Lingk JSON is missing 'data' field")
    desc_index = {}
    for idx, course in enumerate(data["data"]):
        # Courses without a description are silently skipped.
        if "description" not in course:
            continue
        description = course["description"]
        if not isinstance(description, str):
            raise ScrapeError(
                "'description' at index {} is not string".format(idx))
        if "courseNumber" not in course:
            raise ScrapeError(
                "Lingk JSON at index {} is missing 'courseNumber' field".
                format(idx))
        raw_code = course["courseNumber"]
        # Special case that doesn't show up on Portal.
        if raw_code == "ABROAD HM":
            continue
        course_info = shared.parse_course_code(raw_code, with_section=False)
        course_key = tuple(
            shared.course_info_as_list(course_info, with_section=False))
        # Unlike the CSV path, a conflicting duplicate here is fatal.
        # (description is known to be a str, so None means "absent".)
        previous = desc_index.get(course_key)
        if previous is not None and previous != description:
            raise ScrapeError("Lingk JSON has duplicate course: {}".format(
                repr(course_key)))
        desc_index[course_key] = description
    return desc_index
def try_compute_data(s3, webhook, old_data):
    """
    Try to run the scraper and return course data. If something goes
    wrong, raise `ScrapeError`. Otherwise, invoke the provided
    `Webhook`. `old_data` is the previous course data or `util.Unset`.
    """
    # Validate the configured timeout; fall back to 60 seconds (and
    # write the fallback back into the environment) on anything that
    # is not a positive integer.
    scraper_timeout = util.get_env("scraper_timeout")
    try:
        scraper_timeout = int(scraper_timeout)
        if scraper_timeout <= 0:
            raise ValueError
    except ValueError:
        util.warn("Illegal scraper timeout: {}".format(repr(scraper_timeout)))
        util.log("Resetting timeout to 60 seconds")
        os.environ["HYPERSCHEDULE_SCRAPER_TIMEOUT"] = "60"
        scraper_timeout = 60
    if old_data is util.Unset:
        # For JSON.
        old_data = None
    try:
        util.log("Running scraper")
        # The scraper runs as a subprocess: previous data goes in on
        # stdin as JSON, new data comes out on stdout as JSON.
        process = subprocess.Popen(
            ["python", "-m", "hyperschedule.scrapers.claremont"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        output, _ = process.communicate(input=json.dumps(old_data).encode(),
                                        timeout=scraper_timeout)
        if process.returncode != 0:
            raise ScrapeError("scraper failed")
        try:
            output = output.decode()
        except UnicodeDecodeError as e:
            raise ScrapeError(
                "scraper emitted malformed output: {}".format(e)) from None
        # "$delete" is presumably a reserved marker in the diff
        # protocol downstream -- refuse output containing it.
        if "$delete" in output:
            raise ScrapeError("scraper output contains '$delete'")
        data = json.loads(output)
        # Optional side effects, each gated by its own env flag. Note
        # that a webhook failure (RequestException, caught below) will
        # skip the cache/S3 writes but still return `data`.
        if util.get_env_boolean("snitch"):
            webhook.get()
        if util.get_env_boolean("cache"):
            cache_file_write(data)
        if util.get_env_boolean("s3_write"):
            s3_write(s3, data)
    except OSError as e:
        # e.g. the python executable could not be spawned.
        raise ScrapeError(
            "unexpected error while running scraper: {}".format(e)) from None
    except subprocess.TimeoutExpired:
        # Kill the runaway scraper and reap it before reporting.
        process.kill()
        process.communicate()
        raise ScrapeError("scraper timed out after {} seconds".format(
            scraper_timeout)) from None
    except json.decoder.JSONDecodeError:
        raise ScrapeError("scraper did not return valid JSON") from None
    except requests.exceptions.RequestException as e:
        # Webhook failure is non-fatal: `data` is already bound by the
        # time webhook.get() can raise, so the return below is safe.
        util.warn("failed to reach success webhook: {}".format(e))
    return data
def parse_table_row(row_idx, row):
    """
    Given a Selenium table row and the index, return a dictionary
    representing the raw course data for that row. Raise ScrapeError
    if the HTML does not have the desired data.
    """
    cells = row.find_all("td")
    try:
        # Exactly twelve <td> cells are expected; underscore-prefixed
        # ones are ignored.
        (
            _tb,
            _add,
            course_code,
            name,
            _req,
            _note,
            seats,
            status,
            faculty_and_schedule,
            num_credits,
            begin,
            end,
        ) = cells
    except ValueError:
        raise ScrapeError("could not extract course list table row elements "
                          f"from Portal HTML (for row {row_idx})")
    # Each <li> is either "Faculty / Meeting" or just a meeting time
    # with no faculty attached.
    all_faculty = []
    schedule = []
    for item in faculty_and_schedule.find_all("li"):
        text = item.text
        parts = text.split(" / ")
        if len(parts) == 2:
            # This list gets uniquified later.
            all_faculty.append(parts[0])
            schedule.append(parts[1])
        else:
            # No single "/" separator, assumed to mean only schedule
            # (no faculty).
            schedule.append(text)
    return {
        "course_code": course_code.text,
        "course_name": name.text,
        "faculty": all_faculty,
        "seats": seats.text,
        "status": status.text,
        "schedule": schedule,
        "credits": num_credits.text,
        "begin_date": begin.text,
        "end_date": end.text,
    }
def parse_term_code(term):
    """
    Given a term code (e.g. "FA 2018"), return a dictionary with keys:

    * year (integer)
    * fall (boolean)
    * spring (boolean)

    Raise ScrapeError if the term code cannot be parsed.
    """
    match = re.match(r"(FA|SP)\s*(20[0-9]{2})", term)
    if match is None:
        raise ScrapeError("malformed term code: {}".format(repr(term)))
    season, year_str = match.groups()
    return {
        "year": int(year_str),
        "fall": season == "FA",
        "spring": season == "SP",
    }
def get_courses(desc_index):
    """
    Return a tuple containing the list of course objects and the
    current term. Takes `desc_index` as returned by
    `lingk.get_course_descriptions`.
    """
    browser = get_browser()
    html, term = get_portal_html(browser)
    # Save on memory.
    scraper.kill_google_chrome()
    # Count how many courses we add descriptions to, so we can fail if
    # there aren't enough.
    num_descs_added = 0
    # Count how many courses we fail to parse, so we can fail if there
    # are too many.
    num_failed = 0
    # Get the first round of raw courses from Portal.
    raw_courses_1 = parse_portal_html(html)
    # Add course descriptions to them, using the raw course codes.
    # Also collect the course codes into a dictionary so that we can
    # deduplicate them.
    raw_courses_2 = []
    course_info_map = collections.defaultdict(list)
    for raw_course in raw_courses_1:
        try:
            course_code = raw_course["course_code"].strip()
            course_info = shared.parse_course_code(course_code,
                                                   with_section=True)
            # Description lookup ignores the section number.
            desc_key = tuple(
                shared.course_info_as_list(course_info, with_section=False))
            desc = desc_index.get(desc_key)
            if desc:
                num_descs_added += 1
                raw_course["course_description"] = desc
            # frozendict makes the parsed course info hashable so it
            # can key the dedup map.
            course_info_map[frozendict.frozendict(course_info)].append(
                raw_course)
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
            continue
        raw_courses_2.append(raw_course)
    # NOTE(review): raw_courses_2 is populated but never read below --
    # deduplication iterates course_info_map instead. Confirm before
    # removing.
    if num_descs_added < 100:
        raise ScrapeError(
            f"not enough course descriptions added: {num_descs_added}")
    # Deduplicate course codes.
    raw_courses_3 = []
    for course_info, courses in course_info_map.items():
        if len(courses) > 1:
            # A duplicate that already carries a suffix cannot be
            # disambiguated further; drop all copies.
            if course_info["course_code_suffix"]:
                util.log_verbose(
                    f"Duplicate course with suffix ({len(courses)} copies): "
                    f"{format_raw_course(courses[0])!r}")
                num_failed += len(courses)
                continue
            # Only 26 distinct A-Z suffixes are available.
            if len(courses) > len(string.ascii_uppercase):
                util.log_verbose(
                    f"Duplicate course with too many copies ({len(courses)}): "
                    f"{format_raw_course(courses[0])!r}")
                num_failed += len(courses)
                continue
            # Disambiguate duplicates by assigning suffixes A, B, ...
            for course, letter in zip(courses, string.ascii_uppercase):
                course["course_code_suffix"] = letter
        raw_courses_3.extend(courses)
    raw_courses = raw_courses_3
    courses = []
    for raw_course in raw_courses:
        try:
            courses.append(process_course(raw_course, term))
        except ScrapeError as err:
            util.log_verbose(
                f"Failed to parse course: {format_raw_course(raw_course)!r} ({err})"
            )
            num_failed += 1
    # Bail out rather than serve data that is mostly holes.
    if num_failed >= 10:
        raise ScrapeError(f"Too many malformed courses: {num_failed}")
    num_succeeded = len(raw_courses) - num_failed
    if num_succeeded < 500:
        raise ScrapeError(f"Not enough courses: {num_succeeded}")
    util.log_verbose(
        f"Added descriptions to {num_descs_added} out of {num_succeeded} courses"
    )
    return courses, term
def process_course(raw_course, term):
    """
    Turn a raw course object into something that the frontend can
    use. Return a dictionary. If the raw course object has invalid
    data, raise ScrapeError.
    """
    course_code = raw_course["course_code"].strip()
    course_info = shared.parse_course_code(course_code, with_section=True)
    course_code = shared.course_info_as_string(course_info)
    sort_key = shared.course_info_as_list(course_info, with_section=True)
    mutual_exclusion_key = shared.course_info_as_list(course_info,
                                                      with_section=False)
    course_name = raw_course["course_name"].strip()
    if not course_name:
        raise ScrapeError("empty string for course name")
    faculty = sorted(set(f.strip() for f in raw_course["faculty"]))
    if not faculty:
        raise ScrapeError("no faculty")
    for faculty_name in faculty:
        if not faculty_name:
            raise ScrapeError("faculty with empty name")
    try:
        # careful: "∕" (`chr(8725)`) != "/" (`chr(47)`)
        filled_seats, total_seats = map(int, raw_course["seats"].split("∕"))
    except ValueError as err:
        raise ScrapeError(
            f"malformed seat count: {raw_course['seats']!r} ({err})")
    if filled_seats < 0:
        raise ScrapeError(f"negative filled seat count: {filled_seats}")
    if total_seats < 0:
        raise ScrapeError(f"negative total seat count: {total_seats}")
    course_status = raw_course["status"].lower().strip()
    if course_status not in ("open", "closed", "reopened"):
        raise ScrapeError(f"unknown course status: {course_status!r}")
    begin_date = dateutil.parser.parse(raw_course["begin_date"]).date()
    end_date = dateutil.parser.parse(raw_course["end_date"]).date()
    # First half-semester courses start (spring) January 1 through
    # January 31 or (fall) July 15 through September 15. (For some
    # reason, MATH 30B in Fall 2017 is listed as starting August 8.)
    first_half = (
        datetime.date(begin_date.year, 1, 1) < begin_date <
        datetime.date(begin_date.year, 1, 31)
        or datetime.date(begin_date.year, 7, 15) < begin_date <
        datetime.date(begin_date.year, 9, 15))
    # Second half-semester courses for the spring end May 1 through
    # May 31, but there's also frosh chem pt.II which just *has* to be
    # different by ending 2/3 of the way through the semester. So we
    # also count that by allowing April 1 through April 30. Sigh. Fall
    # courses end December 1 through December 31.
    second_half = (
        datetime.date(end_date.year, 4, 1) < end_date <
        datetime.date(end_date.year, 5, 31)
        or datetime.date(end_date.year, 12, 1) < end_date <
        datetime.date(end_date.year, 12, 31))
    if first_half and second_half:
        term_count = 1
        terms = [0]
    elif first_half and not second_half:
        term_count = 2
        terms = [0]
    elif second_half and not first_half:
        term_count = 2
        terms = [1]
    else:
        # BUG FIX: this previously read `begin.date.strftime('%F')`,
        # a NameError (the variable is `begin_date`), and used the
        # non-portable "%F" shorthand. Spell out the format, matching
        # the sibling implementation.
        raise ScrapeError(
            f"weird date range "
            f"{begin_date.strftime('%Y-%m-%d')}-"
            f"{end_date.strftime('%Y-%m-%d')}")
    schedule = []
    for slot in raw_course["schedule"]:
        # Placeholder slots ("To Be Arranged" with a zero time range,
        # \xa0 is a non-breaking space) carry no information.
        if re.match(r"To Be Arranged\xa00?0:00 ?- ?0?0:00 ?AM", slot):
            continue
        match = re.match(SCHEDULE_REGEX, slot)
        if not match:
            raise ScrapeError(f"malformed schedule slot: {slot!r}")
        days, start, end, location = match.groups()
        for day in days:
            if day not in DAYS_OF_WEEK:
                raise ScrapeError(
                    f"unknown day of week {day!r} in schedule slot {slot!r}")
        days = "".join(sorted(set(days), key=DAYS_OF_WEEK.index))
        if not days:
            raise ScrapeError(f"no days in schedule slot {slot!r}")
        # A start time with no AM/PM inherits the meridiem of the end
        # time (e.g. "1:15 - 2:30 PM").
        if not (start.endswith("AM") or start.endswith("PM")):
            start += end[-2:]
        try:
            start = dateutil.parser.parse(start).time()
        except ValueError:
            raise ScrapeError(
                f"malformed start time {start!r} in schedule slot {slot!r}")
        try:
            end = dateutil.parser.parse(end).time()
        except ValueError:
            raise ScrapeError(
                f"malformed end time {end!r} in schedule slot {slot!r}")
        location = " ".join(location.strip().split())
        if not location:
            raise ScrapeError("empty string for location")
        # Start using camelCase here because we are constructing
        # objects that will be returned from the API as JSON -- no
        # longer just intermediate objects private to this module.
        schedule.append({
            "scheduleDays": days,
            "scheduleStartTime": start.strftime("%H:%M"),
            "scheduleEndTime": end.strftime("%H:%M"),
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": location,
        })
    # Courses with no real meeting times still get a placeholder slot
    # so the frontend always has schedule data to display.
    if not schedule:
        schedule.append({
            "scheduleDays": "",
            "scheduleStartTime": "00:00",
            "scheduleEndTime": "00:00",
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": "N/A",
        })
    schedule = unique_preserve_order(schedule)
    num_credits = raw_course["credits"]
    try:
        num_credits = float(num_credits)
    except ValueError:
        raise ScrapeError(f"malformed credit count: {num_credits!r}")
    if num_credits < 0.0:
        raise ScrapeError(f"negative credit count: {raw_course['credits']}")
    # Normalize credits to the HMC scale; presumably these special
    # cases encode known data-entry quirks in Portal -- TODO confirm.
    if "Colloquium" in course_name and num_credits == 0.0:
        num_credits = 0.5
    elif re.match("PE ", course_code) and num_credits == 0.0:
        num_credits = 1.0
    elif num_credits == 0.25:
        num_credits = 1.0
    elif not re.search(r"HM-", course_code):
        num_credits *= 3.0
        if num_credits == 9.0:
            num_credits = 3.0
    num_credits = str(num_credits)
    course_description = raw_course["course_description"]
    # just urls for now - we could add ratings or would take again
    # percentages later
    urls = []
    for prof in faculty:
        a = RateMyProfAPI(teacher=prof)
        # scrape the info from RateMyProfessors site
        a.fetch_info()
        urls.append(a.get_url())
    return {
        "courseCode": course_code,
        "courseName": course_name,
        "courseSortKey": sort_key,
        "courseMutualExclusionKey": mutual_exclusion_key,
        "courseDescription": course_description,
        "courseInstructors": faculty,
        "courseInstructorRMPpages": urls,
        "courseTerm": term,
        "courseSchedule": schedule,
        "courseCredits": num_credits,
        "courseSeatsTotal": total_seats,
        "courseSeatsFilled": filled_seats,
        "courseWaitlistLength": None,
        "courseEnrollmentStatus": course_status,
    }
def parse_course_code(course_code, with_section):
    """
    Given a course code in the format used by Portal and Lingk, with
    or without a section number ("PHIL 179A HM-01" or just "PHIL179A
    HM") as controlled by `with_section`, parse it and return a
    dictionary with keys:

    - department (string)
    - course_number (integer)
    - course_code_suffix (string)
    - school (string)
    - section (integer, or null if with_section is false)

    The given course code may also be in the format returned by
    `course_info_as_string`. Throw ScrapeError if parsing fails.
    """
    m = re.match(COURSE_REGEX, course_code)
    if m is None:
        raise ScrapeError("malformed course code: {}".format(
            repr(course_code)))
    dept, number_str, suffix, school, sect = m.groups()
    # Validate each captured piece in turn; slashes are forbidden
    # because these values end up in URL paths.
    if not dept:
        raise ScrapeError("empty string for department")
    if "/" in dept:
        raise ScrapeError("department contains slashes: {}".format(
            repr(dept)))
    try:
        number = int(number_str)
    except ValueError:
        raise ScrapeError("malformed course number: {}".format(
            repr(number_str)))
    if number <= 0:
        raise ScrapeError("non-positive course number: {}".format(number))
    if "/" in suffix:
        raise ScrapeError("course code suffix contains slashes: {}".format(
            repr(suffix)))
    if not school:
        raise ScrapeError("empty string for school")
    if "/" in school:
        raise ScrapeError("school contains slashes: {}".format(repr(school)))
    # The section must be present exactly when the caller asked for it.
    if bool(sect) != bool(with_section):
        if with_section:
            raise ScrapeError("section missing")
        raise ScrapeError("section unexpectedly present: {}".format(
            repr(sect)))
    if sect:
        try:
            sect = int(sect)
        except ValueError:
            raise ScrapeError("malformed section number: {}".format(
                repr(sect)))
        if sect <= 0:
            raise ScrapeError("non-positive section number: {}".format(sect))
    # If sect is None, just leave it as is.
    return {
        "department": dept,
        "courseNumber": number,
        "courseCodeSuffix": suffix,
        "school": school,
        "section": sect,
    }
def process_course(raw_course, term):
    """
    Turn a raw course object into something that the frontend can
    use. Return a dictionary. If the raw course object has invalid
    data, raise ScrapeError.
    """
    course_code = raw_course["course_code"].strip()
    course_info = shared.parse_course_code(course_code, with_section=True)
    course_code = shared.course_info_as_string(course_info)
    sort_key = shared.course_info_as_list(course_info, with_section=True)
    mutual_exclusion_key = shared.course_info_as_list(course_info,
                                                      with_section=False)
    course_name = raw_course["course_name"].strip()
    if not course_name:
        raise ScrapeError("empty string for course name")
    # Here "faculty" is a newline-separated string; split, trim, and
    # uniquify it.
    faculty = sorted(set(re.split(r"\s*\n\s*", raw_course["faculty"].strip())))
    if not faculty:
        raise ScrapeError("no faculty")
    for faculty_name in faculty:
        if not faculty_name:
            raise ScrapeError("faculty with empty name")
    # Seats come as "filled/total", e.g. "30/30".
    match = re.match(r"([0-9]+)/([0-9]+)", raw_course["seats"])
    if not match:
        raise ScrapeError("malformed seat count: {}".format(
            repr(raw_course["seats"])))
    filled_seats, total_seats = map(int, match.groups())
    if filled_seats < 0:
        raise ScrapeError(
            "negative filled seat count: {}".format(filled_seats))
    if total_seats < 0:
        raise ScrapeError("negative total seat count: {}".format(total_seats))
    course_status = raw_course["status"].lower()
    if course_status not in ("open", "closed", "reopened"):
        raise ScrapeError("unknown course status: {}".format(
            repr(course_status)))
    begin_date = dateutil.parser.parse(raw_course["begin_date"]).date()
    end_date = dateutil.parser.parse(raw_course["end_date"]).date()
    # First half-semester courses start (spring) January 1 through
    # January 31 or (fall) July 15 through September 15. (For some
    # reason, MATH 30B in Fall 2017 is listed as starting August 8.)
    first_half = (datetime.date(begin_date.year, 1, 1) < begin_date <
                  datetime.date(begin_date.year, 1, 31)
                  or datetime.date(begin_date.year, 7, 15) < begin_date <
                  datetime.date(begin_date.year, 9, 15))
    # Second half-semester courses for the spring end May 1 through
    # May 31, but there's also frosh chem pt.II which just *has* to be
    # different by ending 2/3 of the way through the semester. So we
    # also count that by allowing April 1 through April 30. Sigh. Fall
    # courses end December 1 through December 31.
    second_half = (datetime.date(
        end_date.year, 4, 1) < end_date < datetime.date(end_date.year, 5, 31)
                   or datetime.date(end_date.year, 12, 1) < end_date <
                   datetime.date(end_date.year, 12, 31))
    # A course spanning both halves is a single full-semester term;
    # otherwise it occupies term 0 or 1 of a two-term split.
    if first_half and second_half:
        term_count = 1
        terms = [0]
    elif first_half and not second_half:
        term_count = 2
        terms = [0]
    elif second_half and not first_half:
        term_count = 2
        terms = [1]
    else:
        raise ScrapeError("weird date range {}-{}".format(
            begin_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")))
    schedule = []
    for slot in raw_course["schedule"]:
        # Zero-length placeholder slots carry no information.
        if slot.startswith("0:00 - 0:00 AM"):
            continue
        match = re.match(SCHEDULE_REGEX, slot)
        if not match:
            raise ScrapeError("malformed schedule slot: {}".format(repr(slot)))
        days, start, end, location = match.groups()
        for day in days:
            if day not in DAYS_OF_WEEK:
                raise ScrapeError(
                    "unknown day of week {} in schedule slot {}".format(
                        repr(day), repr(slot)))
        days = "".join(sorted(set(days), key=DAYS_OF_WEEK.index))
        if not days:
            raise ScrapeError("no days in schedule slot {}".format(repr(slot)))
        # A start time lacking AM/PM inherits the meridiem of the end
        # time (e.g. "1:15 - 2:30 PM").
        if not (start.endswith("AM") or start.endswith("PM")):
            start += end[-2:]
        try:
            start = dateutil.parser.parse(start).time()
        except ValueError:
            raise ScrapeError(
                "malformed start time {} in schedule slot {}".format(
                    repr(start), repr(slot)))
        try:
            end = dateutil.parser.parse(end).time()
        except ValueError:
            raise ScrapeError(
                "malformed end time {} in schedule slot {}".format(
                    repr(end), repr(slot)))
        location = " ".join(location.strip().split())
        if not location:
            raise ScrapeError("empty string for location")
        # Start using camelCase here because we are constructing
        # objects that will be returned from the API as JSON -- no
        # longer just intermediate objects private to this module.
        schedule.append({
            "scheduleDays": days,
            "scheduleStartTime": start.strftime("%H:%M"),
            "scheduleEndTime": end.strftime("%H:%M"),
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": location,
        })
    # Courses with no real meeting times still get a placeholder slot
    # so the frontend always has schedule data to display.
    if not schedule:
        schedule.append({
            "scheduleDays": "",
            "scheduleStartTime": "00:00",
            "scheduleEndTime": "00:00",
            "scheduleStartDate": begin_date.strftime("%Y-%m-%d"),
            "scheduleEndDate": end_date.strftime("%Y-%m-%d"),
            "scheduleTermCount": term_count,
            "scheduleTerms": terms,
            "scheduleLocation": "N/A",
        })
    schedule = unique_preserve_order(schedule)
    num_credits = raw_course["credits"]
    try:
        num_credits = float(num_credits)
    except ValueError:
        raise ScrapeError("malformed credit count: {}".format(
            repr(num_credits)))
    if num_credits < 0.0:
        raise ScrapeError("negative credit count: {}".format(
            raw_course["credits"]))
    # Normalize credits to the HMC scale; presumably these special
    # cases encode known data-entry quirks in Portal -- TODO confirm.
    if "Colloquium" in course_name and num_credits == 0.0:
        num_credits = 0.5
    elif re.match("PE ", course_code) and num_credits == 0.0:
        num_credits = 1.0
    elif num_credits == 0.25:
        num_credits = 1.0
    elif not re.search(r"HM-", course_code):
        num_credits *= 3.0
        if num_credits == 9.0:
            num_credits = 3.0
    num_credits = str(num_credits)
    course_description = raw_course["course_description"]
    return {
        "courseCode": course_code,
        "courseName": course_name,
        "courseSortKey": sort_key,
        "courseMutualExclusionKey": mutual_exclusion_key,
        "courseDescription": course_description,
        "courseInstructors": faculty,
        "courseTerm": term,
        "courseSchedule": schedule,
        "courseCredits": num_credits,
        "courseSeatsTotal": total_seats,
        "courseSeatsFilled": filled_seats,
        "courseWaitlistLength": None,
        "courseEnrollmentStatus": course_status,
    }