示例#1
0
def courses_computed(
    courses: pd.DataFrame,
    listings: pd.DataFrame,
    evaluation_statistics: pd.DataFrame,
    course_professors: pd.DataFrame,
) -> pd.DataFrame:
    """
    Populates computed course rating fields:
        average_rating:
            Average course rating over all past instances.
        average_workload:
            Average course workload over all past instances.

    Also populates last-offered course fields:
        last_offered_course_id:
            course_id of the most recent previous offering.
        last_enrollment_course_id:
            course_id of the most recent previous offering with enrollment statistics.
        last_enrollment:
            Number of students in most recent previous offering with enrollment statistics.
        last_enrollment_season_code:
            Season of recent previous offering with enrollment statistics.
        last_enrollment_same_professors:
            If recent previous offering with enrollment statistics was with same professors.

    Parameters
    ----------
    Pandas tables post-import:
        courses
        listings
        evaluation_statistics
        course_professors

    Returns
    -------
    courses:
        Table with computed fields.
    """
    # copy inputs we mutate so callers' frames are untouched
    # (courses is mutated in place and returned, matching previous behavior)
    listings = listings.copy(deep=True)
    evaluation_statistics = evaluation_statistics.copy(deep=True)
    course_professors = course_professors.copy(deep=True)

    (
        course_to_same_course,
        same_course_to_courses,
        course_to_same_course_filtered,
        same_course_to_courses_filtered,
    ) = resolve_historical_courses(courses, listings)

    # partition ID of same-codes courses (not used anymore, useful for debugging)
    courses["shared_code_id"] = courses["course_id"].apply(
        course_to_same_course.get)
    # connected courses with the same code (not used anymore, useful for debugging)
    courses["shared_code_courses"] = courses["shared_code_id"].apply(
        same_course_to_courses.get)

    # unique ID for each partition of the same courses
    courses["same_course_id"] = courses["course_id"].apply(
        course_to_same_course_filtered.get)

    # list of course_ids that are the same course per course_id
    courses["same_courses"] = courses["same_course_id"].apply(
        same_course_to_courses_filtered.get)

    # split same-course partition by same-professors
    course_to_same_prof_course, same_prof_course_to_courses = split_same_professors(
        course_to_same_course_filtered, course_professors)

    # unique ID for each partition of the same courses taught by the same set of profs
    courses["same_course_and_profs_id"] = courses["course_id"].apply(
        course_to_same_prof_course.get)

    # list of course_ids that are the same course and taught by same profs per course_id
    courses["same_courses_and_profs"] = courses[
        "same_course_and_profs_id"].apply(same_prof_course_to_courses.get)

    # map course_id to professor_ids
    # use frozenset because it is hashable (set is not), needed for groupby
    course_to_professors = course_professors.groupby(
        "course_id")[  # type: ignore
            "professor_id"].apply(frozenset)

    # get historical offerings with same professors
    listings["professors"] = listings["course_id"].apply(
        course_to_professors.get)
    courses["professors"] = courses["course_id"].apply(
        course_to_professors.get)

    print("Computing last offering statistics")

    # course_id for all courses that have enrollment statistics
    evaluated_courses = set(
        evaluation_statistics.dropna(subset=["enrolled"], axis=0)["course_id"])

    # map course_id to season
    course_to_season = dict(zip(courses["course_id"], courses["season_code"]))

    # map course_id to number enrolled
    course_to_enrollment = dict(
        zip(evaluation_statistics["course_id"],
            evaluation_statistics["enrolled"]))

    # get last course offering in general (with or without enrollment)
    def get_last_offered(course_row):
        """Return course_id of the most recent offering strictly before this row."""
        same_courses = course_row["same_courses"]

        # keep only offerings from earlier seasons
        same_courses = [
            x for x in same_courses
            if course_to_season[x] < course_row["season_code"]
        ]

        if len(same_courses) == 0:
            return None

        # exclude the current offering itself
        # (BUGFIX: use `!=`, not `is not` — ids are ints, identity is unreliable)
        same_courses = [
            x for x in same_courses if x != course_row["course_id"]
        ]
        if len(same_courses) == 0:
            return None

        last_offered_course = max(same_courses,
                                  key=lambda x: course_to_season[x])

        return last_offered_course

    # helper function for getting enrollment fields of last-offered course
    def get_last_offered_enrollment(course_row):
        """
        Return (course_id, enrollment, season, same_professors) of the most
        recent previous offering that has enrollment statistics, preferring
        offerings taught by the same set of professors.
        """
        same_courses = course_row["same_courses"]

        # keep course only if it has enrollment statistics and is before current
        same_courses = [
            x for x in same_courses if x in evaluated_courses
            and course_to_season[x] < course_row["season_code"]
        ]
        if len(same_courses) == 0:
            return (None, None, None, None)

        # exclude the current offering itself
        # (BUGFIX: use `!=`, not `is not` — ids are ints, identity is unreliable)
        same_courses = [
            x for x in same_courses if x != course_row["course_id"]
        ]
        if len(same_courses) == 0:
            return (None, None, None, None)

        current_professors = course_to_professors.get(course_row["course_id"],
                                                      set())

        # sort courses newest-first
        same_courses = sorted(same_courses,
                              key=lambda x: course_to_season[x],
                              reverse=True)

        # get the newest course with the same professors, otherwise just the newest course
        last_enrollment_course = next(
            (prev_course
             for prev_course in same_courses if course_to_professors.get(
                 prev_course, set()) == current_professors),
            # default to newest course if no previous course has same profs
            same_courses[0],
        )

        # number of students last taking course
        last_enrollment = course_to_enrollment[last_enrollment_course]
        # season for last enrollment
        last_enrollment_season = course_to_season[last_enrollment_course]
        # professors for last enrollment
        last_enrollment_professors = course_to_professors.get(
            last_enrollment_course, set())

        # if last enrollment is with same professors
        last_enrollment_same_professors = (
            last_enrollment_professors == current_professors)

        return (
            last_enrollment_course,
            last_enrollment,
            last_enrollment_season,
            last_enrollment_same_professors,
        )

    tqdm.pandas(desc="Finding last-offered course")
    courses["last_offered_course_id"] = courses.progress_apply(  # type: ignore
        get_last_offered, axis=1)

    tqdm.pandas(desc="Finding last-offered enrollment")
    # getting last-offered enrollment
    (
        courses["last_enrollment_course_id"],
        courses["last_enrollment"],
        courses["last_enrollment_season_code"],
        courses["last_enrollment_same_professors"],
    ) = zip(*courses.progress_apply(get_last_offered_enrollment,
                                    axis=1)  # type: ignore
            )

    print("Computing historical ratings for courses")

    # map courses to ratings
    course_to_overall = dict(
        zip(evaluation_statistics["course_id"],
            evaluation_statistics["avg_rating"]))
    course_to_workload = dict(
        zip(evaluation_statistics["course_id"],
            evaluation_statistics["avg_workload"]))

    # collect per-offering ratings/workloads (None where missing)
    courses["average_rating"] = courses["same_courses"].apply(
        lambda courses: [course_to_overall.get(x) for x in courses])
    courses["average_workload"] = courses["same_courses"].apply(
        lambda courses: [course_to_workload.get(x) for x in courses])

    courses["average_rating_same_professors"] = courses[
        "same_courses_and_profs"].apply(
            lambda courses: [course_to_overall.get(x) for x in courses])
    courses["average_workload_same_professors"] = courses[
        "same_courses_and_profs"].apply(
            lambda courses: [course_to_workload.get(x) for x in courses])

    # calculate the average of an array, ignoring None/NaN entries
    def average(nums):
        """Return (mean, count) over non-missing values, or (None, None)."""
        nums = [x for x in nums if x is not None and not math.isnan(x)]
        if not nums:
            return (None, None)
        num_obs = len(nums)
        return (sum(nums) / num_obs, num_obs)

    # calculate averages over past offerings
    for average_col, num_col in [
        ("average_rating", "average_rating_n"),
        ("average_workload", "average_workload_n"),
        ("average_rating_same_professors", "average_rating_same_professors_n"),
        ("average_workload_same_professors",
         "average_workload_same_professors_n"),
    ]:
        courses[average_col], courses[num_col] = zip(
            *courses[average_col].apply(average))

    # remove intermediate columns
    courses = courses.loc[:, get_table_columns(database.models.Course)]

    return courses
示例#2
0
def import_evaluations(
    evaluation_narratives: pd.DataFrame,
    evaluation_ratings: pd.DataFrame,
    evaluation_statistics: pd.DataFrame,
    evaluation_questions: pd.DataFrame,
    listings: pd.DataFrame,
) -> Tuple[pd.DataFrame, ...]:
    """
    Import course evaluations into Pandas DataFrame.

    Parameters
    ----------
    evaluation_narratives:
        Table of narratives from /ferry/crawler/parse_ratings.py.
    evaluation_ratings:
        Table of ratings from /ferry/crawler/parse_ratings.py.
    evaluation_statistics:
        Table of statistics from /ferry/crawler/parse_ratings.py.
    evaluation_questions:
        Table of questions from /ferry/crawler/parse_ratings.py.
    listings:
        Table of listings from import_courses.

    Returns
    -------
    evaluation_narratives,
    evaluation_ratings,
    evaluation_statistics,
    evaluation_questions

    Raises
    ------
    database.InvariantError
        If a question code maps to divergent texts, or to both narratives
        and ratings.
    """
    (
        evaluation_statistics,
        evaluation_narratives,
        evaluation_ratings,
    ) = match_evaluations_to_courses(
        evaluation_narratives,
        evaluation_ratings,
        evaluation_statistics,
        listings,
    )

    # -------------------
    # Aggregate questions
    # -------------------

    # consistency checks
    print("Checking question text consistency")
    text_by_code = evaluation_questions.groupby(
        "question_code")[  # type: ignore
            "question_text"].apply(set)

    # focus on question texts with multiple variations
    text_by_code = text_by_code[text_by_code.apply(len) > 1]

    def amend_texts(texts: set) -> set:
        """
        Remove extraneous texts.

        Parameters
        ----------
        texts:
            Set of texts to amend.
        """
        for remove_text in REMOVE_TEXTS:
            texts = {text.replace(remove_text, "") for text in texts}
        return texts

    text_by_code = text_by_code.apply(amend_texts)

    # add [0] at the end to account for empty lists
    max_diff_texts = max(list(text_by_code.apply(len)) + [0])
    print(
        f"Maximum number of different texts per question code: {max_diff_texts}"
    )

    # get the maximum distance between a set of texts
    def max_pairwise_distance(texts):
        """Return the largest Levenshtein distance between any two texts."""
        pairs = combinations(texts, 2)
        distances = [
            textdistance.levenshtein.distance(*pair) for pair in pairs
        ]
        # BUGFIX: amend_texts can collapse a multi-text set to a single text,
        # leaving no pairs; max() on an empty list would raise ValueError.
        return max(distances, default=0)

    distances_by_code = text_by_code.apply(max_pairwise_distance)
    # add [0] at the end to account for empty lists
    max_all_distances = max(list(distances_by_code) + [0])

    print(f"Maximum text divergence within codes: {max_all_distances}")

    if not all(distances_by_code < QUESTION_DIVERGENCE_CUTOFF):
        inconsistent_codes = distances_by_code[
            distances_by_code >= QUESTION_DIVERGENCE_CUTOFF]
        inconsistent_codes = list(inconsistent_codes.index)
        inconsistent_codes = ", ".join(inconsistent_codes)

        raise database.InvariantError(
            f"Error: question codes {inconsistent_codes} have divergent texts")

    print("Checking question type (narrative/rating) consistency")
    is_narrative_by_code = evaluation_questions.groupby(  # type: ignore
        "question_code")["is_narrative"].apply(set)

    # check that a question code is always narrative or always rating
    if not all(is_narrative_by_code.apply(len) == 1):
        inconsistent_codes = is_narrative_by_code[
            is_narrative_by_code.apply(len) != 1]
        inconsistent_codes = list(inconsistent_codes.index)
        inconsistent_codes = ", ".join(inconsistent_codes)
        raise database.InvariantError(
            f"Error: question codes {inconsistent_codes} have both narratives and ratings"
        )

    # deduplicate questions and keep most recent
    evaluation_questions = evaluation_questions.sort_values(by="season",
                                                            ascending=False)
    evaluation_questions.drop_duplicates(  # type: ignore
        subset=["question_code"], keep="first", inplace=True)

    evaluation_questions["options"] = evaluation_questions["options"].replace(
        "NaN", "[]")

    # -------------------
    # Clean up and subset
    # -------------------

    # evaluation narratives ----------------

    # filter out missing or short comments
    evaluation_narratives.dropna(subset=["comment"], inplace=True)

    # minimum length (exclusive) a comment must have to be kept
    MIN_COMMENT_LENGTH = 2
    evaluation_narratives = evaluation_narratives.loc[
        evaluation_narratives["comment"].apply(len) > MIN_COMMENT_LENGTH]
    # replace carriage returns for csv-based migration
    evaluation_narratives.loc[:, "comment"] = evaluation_narratives[
        "comment"].apply(lambda x: x.replace("\r", ""))
    # id column for database primary key
    evaluation_narratives.loc[:,
                              "id"] = list(range(len(evaluation_narratives)))
    evaluation_narratives.reset_index(drop=True, inplace=True)

    # evaluation ratings ----------------

    # id column for database primary key
    evaluation_ratings.loc[:, "id"] = list(range(len(evaluation_ratings)))
    evaluation_ratings.reset_index(drop=True, inplace=True)

    # evaluation questions ----------------

    # tag to be added later
    evaluation_questions["tag"] = ""
    evaluation_questions.reset_index(drop=True, inplace=True)

    # evaluation statistics ----------------

    # explicitly specify missing columns to be filled in later
    evaluation_statistics[["avg_rating", "avg_workload",
                           "enrollment"]] = np.nan
    # convert to JSON string for postgres
    evaluation_statistics.loc[:, "extras"] = evaluation_statistics[
        "extras"].apply(ujson.dumps)
    evaluation_statistics.reset_index(drop=True, inplace=True)

    # extract columns to match database  ----------------
    evaluation_narratives = evaluation_narratives.loc[:,
                                                      get_table_columns(
                                                          database.models.
                                                          EvaluationNarrative)]
    evaluation_ratings = evaluation_ratings.loc[:,
                                                get_table_columns(
                                                    database.models.
                                                    EvaluationRating)]
    evaluation_statistics = evaluation_statistics.loc[:,
                                                      get_table_columns(
                                                          database.models.
                                                          EvaluationStatistics
                                                      )]
    evaluation_questions = evaluation_questions.loc[:,
                                                    get_table_columns(
                                                        database.models.
                                                        EvaluationQuestion)]

    print("[Summary]")
    print(f"Total evaluation narratives: {len(evaluation_narratives)}")
    print(f"Total evaluation ratings: {len(evaluation_ratings)}")
    print(f"Total evaluation statistics: {len(evaluation_statistics)}")
    print(f"Total evaluation questions: {len(evaluation_questions)}")

    return (
        evaluation_narratives,
        evaluation_ratings,
        evaluation_statistics,
        evaluation_questions,
    )
示例#3
0
def import_discussions(
        merged_discussions_info: pd.DataFrame,
        listings: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Import discussion sections into Pandas DataFrame.

    Parameters
    ----------
    merged_discussions_info:
        Parsed discussion sections information from CSV files.
    listings:
        Listings table from import_courses.

    Returns
    -------
    discussions, course_discussions
    """
    discussions = merged_discussions_info.copy(deep=True)
    discussions["discussion_id"] = range(len(discussions))

    # serialize objects for loading
    discussions["times_by_day"] = discussions["times_by_day"].apply(
        ujson.dumps)

    # build nested mapping: season_code -> {course_code: [course_id, ...]}
    def _codes_to_ids(frame):
        """Map each course_code within one season to its course_ids."""
        return frame[["course_code", "course_id"
                      ]].groupby("course_code")["course_id"].apply(
                          list).to_dict()

    by_season = listings[["season_code", "course_code",
                          "course_id"]].groupby("season_code")
    season_code_to_course_id = by_season.apply(  # type: ignore
        _codes_to_ids).to_dict()  # type: ignore

    def get_course_code(row):
        """
        Formats the course code for course ID matching.
        """
        if row["subject"] == "" or row["number"] == "":
            return ""
        # remove the 'D' at the end of the code for matching
        return row["subject"] + " " + row["number"][:-1]

    discussions["course_code"] = discussions.apply(get_course_code, axis=1)

    def match_discussion_to_courses(row):
        """
        Matches discussion section course code to course ID (in 'courses' table).
        """
        # get matching course IDs by season and section code
        # (assumes section code is just course code + "D" suffix)
        season_courses = season_code_to_course_id.get(
            int(row["season_code"]), {})
        return sorted(season_courses.get(row["course_code"], []))

    discussions["course_ids"] = discussions.apply(match_discussion_to_courses,
                                                  axis=1)

    # create course_discussions junction table (one row per course/discussion)
    junction = discussions.loc[:, ["course_ids", "discussion_id"]]
    junction = junction.explode("course_ids")  # type: ignore
    junction = junction.rename(columns={"course_ids": "course_id"})
    junction.dropna(subset=["course_id"], inplace=True)
    junction.loc[:, "course_id"] = junction["course_id"].astype(int)
    course_discussions = junction

    # subset for actual columns used in Postgres
    course_discussions = course_discussions.loc[:,
                                                get_table_columns(
                                                    database.models.
                                                    course_discussions,
                                                    not_class=True)]
    discussions = discussions.loc[:,
                                  get_table_columns(database.models.Discussion
                                                    )]

    return discussions, course_discussions
示例#4
0
def import_demand(merged_demand_info: pd.DataFrame,
                  listings: pd.DataFrame) -> pd.DataFrame:
    """
    Import demand statistics into Pandas DataFrame.

    Parameters
    ----------
    merged_demand_info:
        Raw demand information from JSON files.
    listings:
        Listings table from import_courses.

    Returns
    -------
    demand_statistics
    """
    demand_statistics = merged_demand_info.copy(deep=True)

    # build nested mapping: season_code -> {course_code: [course_id, ...]}
    by_season = listings[["season_code", "course_code",
                          "course_id"]].groupby("season_code")
    season_code_to_course_id = by_season.apply(  # type: ignore
        lambda frame: frame[["course_code", "course_id"
                             ]].groupby("course_code")["course_id"].apply(
                                 list).to_dict()).to_dict()  # type: ignore

    def match_demand_to_courses(row):
        """Resolve a demand row's course codes to a sorted list of course_ids."""
        season_courses = season_code_to_course_id.get(
            int(row["season_code"]), {})

        # one set of IDs per code that resolved; unmatched codes are skipped
        id_sets = [
            set(ids)
            for ids in (season_courses.get(code, None)
                        for code in row["codes"]) if ids is not None
        ]

        if not id_sets:
            return []

        # union all course IDs
        return sorted(set.union(*id_sets))

    demand_statistics["course_id"] = demand_statistics.apply(
        match_demand_to_courses, axis=1)

    # drop rows with no matched courses
    demand_statistics = demand_statistics.loc[  # type: ignore
        demand_statistics["course_id"].apply(len) > 0, :]

    def date_to_int(date):
        """Encode an 'M/D' date as a sortable integer (month * 100 + day)."""
        month, day = date.split("/")
        return int(month) * 100 + int(day)

    def get_most_recent_demand(row):
        """Return [latest demand value, latest demand date] for a row."""
        dated_demand = sorted(row["overall_demand"].items(),
                              key=lambda item: date_to_int(item[0]))
        latest_date, latest_value = dated_demand[-1]
        return [latest_value, latest_date]

    # get most recent demand date
    demand_statistics["latest_demand"], demand_statistics[
        "latest_demand_date"] = zip(
            *demand_statistics.apply(get_most_recent_demand, axis=1))

    # expand course_id list to one per row
    demand_statistics = demand_statistics.explode("course_id")  # type: ignore
    demand_statistics.drop_duplicates(  # type: ignore
        subset=["course_id"], keep="first", inplace=True)

    # rename demand JSON column to match database
    demand_statistics = demand_statistics.rename({"overall_demand": "demand"},
                                                 axis="columns")

    demand_statistics["demand"] = demand_statistics["demand"].apply(
        ujson.dumps)

    # extract columns to match database
    demand_statistics = demand_statistics.loc[:,
                                              get_table_columns(
                                                  database.models.
                                                  DemandStatistics)]

    print("[Summary]")
    print(f"Total demand statistics: {len(demand_statistics)}")

    return demand_statistics
示例#5
0
def import_courses(merged_course_info: pd.DataFrame,
                   seasons: List[str]) -> Tuple[pd.DataFrame, ...]:
    """
    Import courses into Pandas DataFrames.

    Parameters
    ----------
    merged_course_info:
        Raw course information from JSON files.
    seasons:
        List of seasons for sorting purposes.

    Returns
    -------
    courses, listings, course_professors, professors, course_flags, flags
    """
    merged_course_info = resolve_cross_listings(merged_course_info)

    print("Creating courses table")
    # initialize courses table: one row per distinct temp_course_id
    courses = merged_course_info.drop_duplicates(  # type: ignore
        subset="temp_course_id").copy(deep=True)
    # global course IDs
    courses["course_id"] = range(len(courses))
    # convert to JSON string for postgres
    courses["areas"] = courses["areas"].apply(ujson.dumps)
    courses["times_by_day"] = courses["times_by_day"].apply(ujson.dumps)
    courses["skills"] = courses["skills"].apply(ujson.dumps)
    # replace carriage returns for tsv-based migration
    courses["description"] = courses["description"].apply(
        lambda x: x.replace("\r", ""))
    courses["title"] = courses["title"].apply(lambda x: x.replace("\r", ""))
    courses["short_title"] = courses["short_title"].apply(
        lambda x: x.replace("\r", ""))
    courses["requirements"] = courses["requirements"].apply(
        lambda x: x.replace("\r", ""))

    print("Creating listings table")
    # map temporary season-specific IDs to global course IDs
    temp_to_course_id = dict(
        zip(courses["temp_course_id"], courses["course_id"]))

    # initialize listings table: one row per cross-listing
    listings = merged_course_info.copy(deep=True)
    listings["listing_id"] = range(len(listings))
    listings["course_id"] = listings["temp_course_id"].apply(
        temp_to_course_id.get)
    # normalize missing sections to "0" (None, NaN, and "" are all treated
    # as missing; final dtype is str)
    listings["section"] = listings["section"].apply(lambda x: "0"
                                                    if x is None else x)
    listings["section"] = listings["section"].fillna("0").astype(
        str)  # type: ignore
    listings["section"] = listings["section"].replace({"":
                                                       "0"})  # type: ignore

    professors_prep = aggregate_professors(courses)

    professors, course_professors = resolve_professors(professors_prep,
                                                       seasons)

    # explicitly specify missing columns to be filled in later
    # (e.g. by courses_computed)
    courses[[
        "location_times",
        "average_rating",
        "average_rating_n",
        "average_workload",
        "average_workload_n",
        "average_rating_same_professors",
        "average_rating_same_professors_n",
        "average_workload_same_professors",
        "average_workload_same_professors_n",
        "same_course_id",
        "same_course_and_profs_id",
        "last_offered_course_id",
        "last_enrollment_course_id",
        "last_enrollment",
        "last_enrollment_season_code",
        "last_enrollment_same_professors",
    ]] = np.nan
    professors[["average_rating", "average_rating_n"]] = np.nan

    # construct courses and flags mapping
    print("Adding course flags")
    course_flags = courses[["course_id", "flags"]].copy(deep=True)
    # keep only courses that have at least one flag, then one row per flag
    course_flags = course_flags[course_flags["flags"].apply(len) > 0]
    course_flags = course_flags.explode("flags")  # type: ignore

    # distinct flag texts, each assigned a sequential flag_id
    flags = course_flags.drop_duplicates(  # type: ignore
        subset=["flags"], keep="first").copy(deep=True)
    flags["flag_text"] = flags["flags"]
    flags["flag_id"] = range(len(flags))

    flag_text_to_id = dict(zip(flags["flag_text"], flags["flag_id"]))
    course_flags["flag_id"] = course_flags["flags"].apply(flag_text_to_id.get)

    # extract columns to match database
    courses = courses.loc[:, get_table_columns(database.models.Course)]
    listings = listings.loc[:, get_table_columns(database.models.Listing)]
    course_professors = course_professors.loc[:,
                                              get_table_columns(
                                                  database.models.
                                                  course_professors,
                                                  not_class=True)]
    professors = professors.loc[:,
                                get_table_columns(database.models.Professor)]
    flags = flags.loc[:, get_table_columns(database.models.Flag)]
    course_flags = course_flags.loc[:,
                                    get_table_columns(database.models.
                                                      course_flags,
                                                      not_class=True)]

    print("[Summary]")
    print(f"Total courses: {len(courses)}")
    print(f"Total listings: {len(listings)}")
    print(f"Total course-professors: {len(course_professors)}")
    print(f"Total professors: {len(professors)}")
    print(f"Total course-flags: {len(course_flags)}")
    print(f"Total flags: {len(flags)}")

    return courses, listings, course_professors, professors, course_flags, flags