def courses_computed(
    courses: pd.DataFrame,
    listings: pd.DataFrame,
    evaluation_statistics: pd.DataFrame,
    course_professors: pd.DataFrame,
) -> pd.DataFrame:
    """
    Populate computed course rating fields:

        average_rating:
            Average course rating over all past instances.
        average_workload:
            Average course workload over all past instances.

    Also populate last-offered course fields:

        last_offered_course_id:
            course_id of the most recent previous offering.
        last_enrollment_course_id:
            course_id of the most recent previous offering with enrollment statistics.
        last_enrollment:
            Number of students in the most recent previous offering with
            enrollment statistics.
        last_enrollment_season_code:
            Season of the most recent previous offering with enrollment statistics.
        last_enrollment_same_professors:
            Whether the most recent previous offering with enrollment statistics
            was taught by the same professors.

    Parameters
    ----------
    courses:
        Courses table post-import.
    listings:
        Listings table post-import.
    evaluation_statistics:
        Evaluation statistics table post-import.
    course_professors:
        Course-professors junction table post-import.

    Returns
    -------
    courses:
        Table with computed fields.
    """

    listings = listings.copy(deep=True)
    evaluation_statistics = evaluation_statistics.copy(deep=True)
    course_professors = course_professors.copy(deep=True)

    (
        course_to_same_course,
        same_course_to_courses,
        course_to_same_course_filtered,
        same_course_to_courses_filtered,
    ) = resolve_historical_courses(courses, listings)

    # partition ID of same-code courses (no longer used, but useful for debugging)
    courses["shared_code_id"] = courses["course_id"].apply(course_to_same_course.get)
    # connected courses with the same code (no longer used, but useful for debugging)
    courses["shared_code_courses"] = courses["shared_code_id"].apply(
        same_course_to_courses.get
    )

    # unique ID for each partition of the same courses
    courses["same_course_id"] = courses["course_id"].apply(
        course_to_same_course_filtered.get
    )

    # list of course_ids that are the same course, per course_id
    courses["same_courses"] = courses["same_course_id"].apply(
        same_course_to_courses_filtered.get
    )

    # split same-course partitions by same-professors
    course_to_same_prof_course, same_prof_course_to_courses = split_same_professors(
        course_to_same_course_filtered, course_professors
    )

    # unique ID for each partition of the same courses taught by the same set of professors
    courses["same_course_and_profs_id"] = courses["course_id"].apply(
        course_to_same_prof_course.get
    )

    # list of course_ids that are the same course and taught by the same
    # professors, per course_id
    courses["same_courses_and_profs"] = courses["same_course_and_profs_id"].apply(
        same_prof_course_to_courses.get
    )

    # map course_id to professor_ids
    # (use frozenset because it is hashable, which groupby requires; set is not)
    course_to_professors = course_professors.groupby("course_id")[  # type: ignore
        "professor_id"
    ].apply(frozenset)

    # attach professor sets for finding historical offerings with the same professors
    listings["professors"] = listings["course_id"].apply(course_to_professors.get)
    courses["professors"] = courses["course_id"].apply(course_to_professors.get)

    print("Computing last offering statistics")

    # course_id for all evaluated courses
    evaluated_courses = set(
        evaluation_statistics.dropna(subset=["enrolled"], axis=0)["course_id"]
    )

    # map course_id to season
    course_to_season = dict(zip(courses["course_id"], courses["season_code"]))

    # map course_id to number enrolled
    course_to_enrollment = dict(
        zip(evaluation_statistics["course_id"], evaluation_statistics["enrolled"])
    )

    # get the last offering of a course in general (with or without enrollment)
    def get_last_offered(course_row):
        same_courses = course_row["same_courses"]

        # keep only courses from previous seasons
        same_courses = [
            x for x in same_courses
            if course_to_season[x] < course_row["season_code"]
        ]
        if len(same_courses) == 0:
            return None

        # exclude the current course itself
        # (note: use != rather than identity comparison, since equal course IDs
        # are not guaranteed to be the same object)
        same_courses = [x for x in same_courses if x != course_row["course_id"]]
        if len(same_courses) == 0:
            return None

        last_offered_course = max(same_courses, key=lambda x: course_to_season[x])

        return last_offered_course

    # helper function for getting enrollment fields of the last-offered course
    def get_last_offered_enrollment(course_row):
        same_courses = course_row["same_courses"]

        # keep a course only if it has enrollment statistics and is before the current one
        same_courses = [
            x for x in same_courses
            if x in evaluated_courses
            and course_to_season[x] < course_row["season_code"]
        ]
        if len(same_courses) == 0:
            return [None, None, None, None]

        # exclude the current course itself
        same_courses = [x for x in same_courses if x != course_row["course_id"]]
        if len(same_courses) == 0:
            return [None, None, None, None]

        current_professors = course_to_professors.get(course_row["course_id"], set())

        # sort courses newest-first
        same_courses = sorted(
            same_courses, key=lambda x: course_to_season[x], reverse=True
        )

        # get the newest course with the same professors, otherwise just the newest course
        last_enrollment_course = next(
            (
                prev_course
                for prev_course in same_courses
                if course_to_professors.get(prev_course, set()) == current_professors
            ),
            # default to the newest course if no previous course has the same professors
            same_courses[0],
        )

        # number of students who last took the course
        last_enrollment = course_to_enrollment[last_enrollment_course]

        # season of the last enrollment
        last_enrollment_season = course_to_season[last_enrollment_course]

        # professors for the last enrollment
        last_enrollment_professors = course_to_professors.get(
            last_enrollment_course, set()
        )

        # whether the last enrollment was with the same professors
        last_enrollment_same_professors = (
            last_enrollment_professors == current_professors
        )

        return (
            last_enrollment_course,
            last_enrollment,
            last_enrollment_season,
            last_enrollment_same_professors,
        )

    tqdm.pandas(desc="Finding last-offered course")
    courses["last_offered_course_id"] = courses.progress_apply(  # type: ignore
        get_last_offered, axis=1
    )

    tqdm.pandas(desc="Finding last-offered enrollment")
    # getting last-offered enrollment
    (
        courses["last_enrollment_course_id"],
        courses["last_enrollment"],
        courses["last_enrollment_season_code"],
        courses["last_enrollment_same_professors"],
    ) = zip(
        *courses.progress_apply(get_last_offered_enrollment, axis=1)  # type: ignore
    )

    print("Computing historical ratings for courses")

    # map course_id to ratings
    course_to_overall = dict(
        zip(evaluation_statistics["course_id"], evaluation_statistics["avg_rating"])
    )
    course_to_workload = dict(
        zip(evaluation_statistics["course_id"], evaluation_statistics["avg_workload"])
    )

    # get ratings over all past offerings
    courses["average_rating"] = courses["same_courses"].apply(
        lambda courses: [course_to_overall.get(x) for x in courses]
    )
    courses["average_workload"] = courses["same_courses"].apply(
        lambda courses: [course_to_workload.get(x) for x in courses]
    )
    courses["average_rating_same_professors"] = courses["same_courses_and_profs"].apply(
        lambda courses: [course_to_overall.get(x) for x in courses]
    )
    courses["average_workload_same_professors"] = courses[
        "same_courses_and_profs"
    ].apply(lambda courses: [course_to_workload.get(x) for x in courses])

    # calculate the average of an array, ignoring missing values;
    # returns the average and the number of observations it is based on
    def average(nums):
        nums = [x for x in nums if x is not None and not math.isnan(x)]
        if not nums:
            return None, None
        num_obs = len(nums)
        return sum(nums) / num_obs, num_obs

    # calculate averages over past offerings
    for average_col, num_col in [
        ("average_rating", "average_rating_n"),
        ("average_workload", "average_workload_n"),
        ("average_rating_same_professors", "average_rating_same_professors_n"),
        ("average_workload_same_professors", "average_workload_same_professors_n"),
    ]:
        courses[average_col], courses[num_col] = zip(
            *courses[average_col].apply(average)
        )

    # remove intermediate columns
    courses = courses.loc[:, get_table_columns(database.models.Course)]

    return courses
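# Illustrative sketch (not part of the pipeline): demonstrates the
# zip(*apply) pattern used above to fan a row function that returns multiple
# values out into separate columns. The frame and column names here are
# synthetic.
def _example_zip_apply_pattern() -> None:
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
    # each row maps to a (min, max) tuple; zip(*...) transposes the Series of
    # tuples into two column-length sequences
    df["lo"], df["hi"] = zip(*df.apply(lambda row: (min(row), max(row)), axis=1))
    assert list(df["lo"]) == [1.0, 2.0]
    assert list(df["hi"]) == [3.0, 4.0]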
def import_evaluations(
    evaluation_narratives: pd.DataFrame,
    evaluation_ratings: pd.DataFrame,
    evaluation_statistics: pd.DataFrame,
    evaluation_questions: pd.DataFrame,
    listings: pd.DataFrame,
) -> Tuple[pd.DataFrame, ...]:
    """
    Import course evaluations into Pandas DataFrames.

    Parameters
    ----------
    evaluation_narratives:
        Table of narratives from /ferry/crawler/parse_ratings.py.
    evaluation_ratings:
        Table of ratings from /ferry/crawler/parse_ratings.py.
    evaluation_statistics:
        Table of statistics from /ferry/crawler/parse_ratings.py.
    evaluation_questions:
        Table of questions from /ferry/crawler/parse_ratings.py.
    listings:
        Table of listings from import_courses.

    Returns
    -------
    evaluation_narratives, evaluation_ratings, evaluation_statistics,
    evaluation_questions
    """
    (
        evaluation_statistics,
        evaluation_narratives,
        evaluation_ratings,
    ) = match_evaluations_to_courses(
        evaluation_narratives,
        evaluation_ratings,
        evaluation_statistics,
        listings,
    )

    # -------------------
    # Aggregate questions
    # -------------------

    # consistency checks
    print("Checking question text consistency")
    text_by_code = evaluation_questions.groupby("question_code")[  # type: ignore
        "question_text"
    ].apply(set)

    # focus on question codes with multiple text variations
    text_by_code = text_by_code[text_by_code.apply(len) > 1]

    def amend_texts(texts: set) -> set:
        """
        Remove extraneous texts.

        Parameters
        ----------
        texts:
            Set of texts to amend.
        """
        for remove_text in REMOVE_TEXTS:
            texts = {text.replace(remove_text, "") for text in texts}
        return texts

    text_by_code = text_by_code.apply(amend_texts)

    # add [0] at the end to account for empty lists
    max_diff_texts = max(list(text_by_code.apply(len)) + [0])
    print(f"Maximum number of different texts per question code: {max_diff_texts}")

    # get the maximum pairwise distance within a set of texts
    def max_pairwise_distance(texts):
        pairs = combinations(texts, 2)
        distances = [textdistance.levenshtein.distance(*pair) for pair in pairs]
        return max(distances)

    distances_by_code = text_by_code.apply(max_pairwise_distance)

    # add [0] at the end to account for empty lists
    max_all_distances = max(list(distances_by_code) + [0])
    print(f"Maximum text divergence within codes: {max_all_distances}")

    if not all(distances_by_code < QUESTION_DIVERGENCE_CUTOFF):
        inconsistent_codes = distances_by_code[
            distances_by_code >= QUESTION_DIVERGENCE_CUTOFF
        ]
        inconsistent_codes = ", ".join(list(inconsistent_codes.index))
        raise database.InvariantError(
            f"Error: question codes {inconsistent_codes} have divergent texts"
        )

    print("Checking question type (narrative/rating) consistency")
    is_narrative_by_code = evaluation_questions.groupby(  # type: ignore
        "question_code"
    )["is_narrative"].apply(set)

    # check that a question code is always narrative or always rating
    if not all(is_narrative_by_code.apply(len) == 1):
        inconsistent_codes = is_narrative_by_code[is_narrative_by_code.apply(len) != 1]
        inconsistent_codes = ", ".join(list(inconsistent_codes.index))
        raise database.InvariantError(
            f"Error: question codes {inconsistent_codes} have both narratives and ratings"
        )

    # deduplicate questions, keeping the most recent
    evaluation_questions = evaluation_questions.sort_values(
        by="season", ascending=False
    )
    evaluation_questions.drop_duplicates(  # type: ignore
        subset=["question_code"], keep="first", inplace=True
    )
    evaluation_questions["options"] = evaluation_questions["options"].replace(
        "NaN", "[]"
    )

    # -------------------
    # Clean up and subset
    # -------------------

    # evaluation narratives ----------------

    # filter out missing comments
    evaluation_narratives.dropna(subset=["comment"], inplace=True)

    # keep only comments longer than MIN_COMMENT_LENGTH characters
    MIN_COMMENT_LENGTH = 2
    evaluation_narratives = evaluation_narratives.loc[
        evaluation_narratives["comment"].apply(len) > MIN_COMMENT_LENGTH
    ]

    # replace carriage returns for csv-based migration
    evaluation_narratives.loc[:, "comment"] = evaluation_narratives["comment"].apply(
        lambda x: x.replace("\r", "")
    )

    # id column for database primary key
    evaluation_narratives.loc[:, "id"] = list(range(len(evaluation_narratives)))
    evaluation_narratives.reset_index(drop=True, inplace=True)

    # evaluation ratings ----------------

    # id column for database primary key
    evaluation_ratings.loc[:, "id"] = list(range(len(evaluation_ratings)))
    evaluation_ratings.reset_index(drop=True, inplace=True)

    # evaluation questions ----------------

    # tag to be added later
    evaluation_questions["tag"] = ""
    evaluation_questions.reset_index(drop=True, inplace=True)

    # evaluation statistics ----------------

    # explicitly specify missing columns to be filled in later
    evaluation_statistics[["avg_rating", "avg_workload", "enrollment"]] = np.nan

    # convert to JSON string for postgres
    evaluation_statistics.loc[:, "extras"] = evaluation_statistics["extras"].apply(
        ujson.dumps
    )
    evaluation_statistics.reset_index(drop=True, inplace=True)

    # extract columns to match database ----------------
    evaluation_narratives = evaluation_narratives.loc[
        :, get_table_columns(database.models.EvaluationNarrative)
    ]
    evaluation_ratings = evaluation_ratings.loc[
        :, get_table_columns(database.models.EvaluationRating)
    ]
    evaluation_statistics = evaluation_statistics.loc[
        :, get_table_columns(database.models.EvaluationStatistics)
    ]
    evaluation_questions = evaluation_questions.loc[
        :, get_table_columns(database.models.EvaluationQuestion)
    ]

    print("[Summary]")
    print(f"Total evaluation narratives: {len(evaluation_narratives)}")
    print(f"Total evaluation ratings: {len(evaluation_ratings)}")
    print(f"Total evaluation statistics: {len(evaluation_statistics)}")
    print(f"Total evaluation questions: {len(evaluation_questions)}")

    return (
        evaluation_narratives,
        evaluation_ratings,
        evaluation_statistics,
        evaluation_questions,
    )
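# Illustrative sketch (not part of the pipeline): the divergence check above
# flags a question code when any two of its text variants differ by at least
# QUESTION_DIVERGENCE_CUTOFF edits. The texts here are synthetic.
def _example_max_pairwise_distance() -> None:
    from itertools import combinations

    import textdistance

    texts = {"How was the course?", "How was this course?"}
    max_dist = max(
        textdistance.levenshtein.distance(a, b) for a, b in combinations(texts, 2)
    )
    # "the" -> "this" is one substitution plus one insertion
    assert max_dist == 2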
def import_discussions(
    merged_discussions_info: pd.DataFrame, listings: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Import discussion sections into Pandas DataFrames.

    Parameters
    ----------
    merged_discussions_info:
        Parsed discussion sections information from CSV files.
    listings:
        Listings table from import_courses.

    Returns
    -------
    discussions, course_discussions
    """
    discussions = merged_discussions_info.copy(deep=True)
    discussions["discussion_id"] = range(len(discussions))

    # serialize objects for loading
    discussions["times_by_day"] = discussions["times_by_day"].apply(ujson.dumps)

    # construct outer season grouping
    season_code_to_course_id = listings[
        ["season_code", "course_code", "course_id"]
    ].groupby("season_code")

    # construct inner course_code to course_id mapping
    season_code_to_course_id = season_code_to_course_id.apply(  # type: ignore
        lambda x: x[["course_code", "course_id"]]
        .groupby("course_code")["course_id"]
        .apply(list)
        .to_dict()
    )

    # cast outer season mapping to dictionary
    season_code_to_course_id = season_code_to_course_id.to_dict()  # type: ignore

    def get_course_code(row):
        """
        Format the course code for course ID matching.
        """
        if row["subject"] != "" and row["number"] != "":
            # remove the 'D' at the end of the code for matching
            return row["subject"] + " " + row["number"][:-1]
        return ""

    discussions["course_code"] = discussions.apply(get_course_code, axis=1)

    def match_discussion_to_courses(row):
        """
        Match a discussion section course code to course IDs (in the 'courses' table).
        """
        season_code = int(row["season_code"])

        # get matching course IDs by season and section code
        # (assumes the section code is just the course code plus a "D" suffix)
        course_ids = season_code_to_course_id.get(season_code, {}).get(
            row["course_code"], []
        )
        course_ids = sorted(course_ids)

        return course_ids

    discussions["course_ids"] = discussions.apply(match_discussion_to_courses, axis=1)

    # create course_discussions junction table
    course_discussions = discussions.loc[
        :, ["course_ids", "discussion_id"]
    ].explode(  # type: ignore
        "course_ids"
    )
    course_discussions = course_discussions.rename(columns={"course_ids": "course_id"})
    course_discussions.dropna(subset=["course_id"], inplace=True)
    course_discussions.loc[:, "course_id"] = course_discussions["course_id"].astype(int)

    # subset for actual columns used in Postgres
    course_discussions = course_discussions.loc[
        :, get_table_columns(database.models.course_discussions, not_class=True)
    ]
    discussions = discussions.loc[:, get_table_columns(database.models.Discussion)]

    return discussions, course_discussions
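# Illustrative sketch (not part of the pipeline): demonstrates the explode()
# junction-table pattern used above, on a synthetic frame. Each list-valued
# cell becomes one row per list element.
def _example_explode_junction() -> None:
    import pandas as pd

    df = pd.DataFrame({"discussion_id": [0, 1], "course_ids": [[10, 11], [12]]})
    junction = df.explode("course_ids").rename(columns={"course_ids": "course_id"})
    # one (discussion_id, course_id) row per pairing
    assert len(junction) == 3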
def import_demand(
    merged_demand_info: pd.DataFrame, listings: pd.DataFrame
) -> pd.DataFrame:
    """
    Import demand statistics into Pandas DataFrame.

    Parameters
    ----------
    merged_demand_info:
        Raw demand information from JSON files.
    listings:
        Listings table from import_courses.

    Returns
    -------
    demand_statistics
    """
    demand_statistics = merged_demand_info.copy(deep=True)

    # construct outer season grouping
    season_code_to_course_id = listings[
        ["season_code", "course_code", "course_id"]
    ].groupby("season_code")

    # construct inner course_code to course_id mapping
    season_code_to_course_id = season_code_to_course_id.apply(  # type: ignore
        lambda x: x[["course_code", "course_id"]]
        .groupby("course_code")["course_id"]
        .apply(list)
        .to_dict()
    )

    # cast outer season mapping to dictionary
    season_code_to_course_id = season_code_to_course_id.to_dict()  # type: ignore

    def match_demand_to_courses(row):
        season_code = int(row["season_code"])

        course_ids = [
            season_code_to_course_id.get(season_code, {}).get(x, None)
            for x in row["codes"]
        ]
        course_ids = [set(x) for x in course_ids if x is not None]

        if course_ids == []:
            return []

        # union all course IDs
        course_ids = sorted(set.union(*course_ids))

        return course_ids

    demand_statistics["course_id"] = demand_statistics.apply(
        match_demand_to_courses, axis=1
    )
    demand_statistics = demand_statistics.loc[  # type: ignore
        demand_statistics["course_id"].apply(len) > 0, :
    ]

    # convert an "M/D" date string to a sortable integer
    def date_to_int(date):
        month, day = date.split("/")
        return int(month) * 100 + int(day)

    def get_most_recent_demand(row):
        sorted_demand = list(row["overall_demand"].items())
        sorted_demand.sort(key=lambda x: date_to_int(x[0]))
        latest_demand_date, latest_demand = sorted_demand[-1]

        return [latest_demand, latest_demand_date]

    # get most recent demand count and date
    (
        demand_statistics["latest_demand"],
        demand_statistics["latest_demand_date"],
    ) = zip(*demand_statistics.apply(get_most_recent_demand, axis=1))

    # expand course_id list to one per row
    demand_statistics = demand_statistics.explode("course_id")  # type: ignore
    demand_statistics.drop_duplicates(  # type: ignore
        subset=["course_id"], keep="first", inplace=True
    )

    # rename demand JSON column to match database
    demand_statistics = demand_statistics.rename(
        {"overall_demand": "demand"}, axis="columns"
    )
    demand_statistics["demand"] = demand_statistics["demand"].apply(ujson.dumps)

    # extract columns to match database
    demand_statistics = demand_statistics.loc[
        :, get_table_columns(database.models.DemandStatistics)
    ]

    print("[Summary]")
    print(f"Total demand statistics: {len(demand_statistics)}")

    return demand_statistics
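# Illustrative sketch (not part of the pipeline): a standalone copy of the
# month * 100 + day key used above, showing why it orders "M/D" date strings
# chronologically within a single year without real date parsing.
def _example_date_key() -> None:
    def date_key(date: str) -> int:
        month, day = date.split("/")
        return int(month) * 100 + int(day)

    # 1/15 -> 115, 4/7 -> 407, 11/3 -> 1103
    assert sorted(["11/3", "1/15", "4/7"], key=date_key) == ["1/15", "4/7", "11/3"]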
def import_courses(
    merged_course_info: pd.DataFrame, seasons: List[str]
) -> Tuple[pd.DataFrame, ...]:
    """
    Import courses into Pandas DataFrames.

    Parameters
    ----------
    merged_course_info:
        Raw course information from JSON files.
    seasons:
        List of seasons for sorting purposes.

    Returns
    -------
    courses, listings, course_professors, professors, course_flags, flags
    """
    merged_course_info = resolve_cross_listings(merged_course_info)

    print("Creating courses table")

    # initialize courses table
    courses = merged_course_info.drop_duplicates(  # type: ignore
        subset="temp_course_id"
    ).copy(deep=True)

    # global course IDs
    courses["course_id"] = range(len(courses))

    # convert to JSON string for postgres
    courses["areas"] = courses["areas"].apply(ujson.dumps)
    courses["times_by_day"] = courses["times_by_day"].apply(ujson.dumps)
    courses["skills"] = courses["skills"].apply(ujson.dumps)

    # replace carriage returns for tsv-based migration
    courses["description"] = courses["description"].apply(lambda x: x.replace("\r", ""))
    courses["title"] = courses["title"].apply(lambda x: x.replace("\r", ""))
    courses["short_title"] = courses["short_title"].apply(lambda x: x.replace("\r", ""))
    courses["requirements"] = courses["requirements"].apply(
        lambda x: x.replace("\r", "")
    )

    print("Creating listings table")

    # map temporary season-specific IDs to global course IDs
    temp_to_course_id = dict(zip(courses["temp_course_id"], courses["course_id"]))

    # initialize listings table
    listings = merged_course_info.copy(deep=True)
    listings["listing_id"] = range(len(listings))
    listings["course_id"] = listings["temp_course_id"].apply(temp_to_course_id.get)

    # normalize missing or empty sections to "0"
    listings["section"] = listings["section"].apply(lambda x: "0" if x is None else x)
    listings["section"] = listings["section"].fillna("0").astype(str)  # type: ignore
    listings["section"] = listings["section"].replace({"": "0"})  # type: ignore

    professors_prep = aggregate_professors(courses)
    professors, course_professors = resolve_professors(professors_prep, seasons)

    # explicitly specify missing columns to be filled in later
    courses[[
        "location_times",
        "average_rating",
        "average_rating_n",
        "average_workload",
        "average_workload_n",
        "average_rating_same_professors",
        "average_rating_same_professors_n",
        "average_workload_same_professors",
        "average_workload_same_professors_n",
        "same_course_id",
        "same_course_and_profs_id",
        "last_offered_course_id",
        "last_enrollment_course_id",
        "last_enrollment",
        "last_enrollment_season_code",
        "last_enrollment_same_professors",
    ]] = np.nan
    professors[["average_rating", "average_rating_n"]] = np.nan

    # construct courses-to-flags mapping
    print("Adding course flags")
    course_flags = courses[["course_id", "flags"]].copy(deep=True)
    course_flags = course_flags[course_flags["flags"].apply(len) > 0]
    course_flags = course_flags.explode("flags")  # type: ignore

    flags = course_flags.drop_duplicates(  # type: ignore
        subset=["flags"], keep="first"
    ).copy(deep=True)
    flags["flag_text"] = flags["flags"]
    flags["flag_id"] = range(len(flags))

    flag_text_to_id = dict(zip(flags["flag_text"], flags["flag_id"]))
    course_flags["flag_id"] = course_flags["flags"].apply(flag_text_to_id.get)

    # extract columns to match database
    courses = courses.loc[:, get_table_columns(database.models.Course)]
    listings = listings.loc[:, get_table_columns(database.models.Listing)]
    course_professors = course_professors.loc[
        :, get_table_columns(database.models.course_professors, not_class=True)
    ]
    professors = professors.loc[:, get_table_columns(database.models.Professor)]
    flags = flags.loc[:, get_table_columns(database.models.Flag)]
    course_flags = course_flags.loc[
        :, get_table_columns(database.models.course_flags, not_class=True)
    ]

    print("[Summary]")
    print(f"Total courses: {len(courses)}")
    print(f"Total listings: {len(listings)}")
    print(f"Total course-professors: {len(course_professors)}")
    print(f"Total professors: {len(professors)}")
    print(f"Total course-flags: {len(course_flags)}")
    print(f"Total flags: {len(flags)}")

    return courses, listings, course_professors, professors, course_flags, flags
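# Usage sketch (illustrative; the caller-side variable names and the exact
# ordering are assumptions about the calling pipeline, not part of this
# module):
#
#   courses, listings, course_professors, professors, course_flags, flags = (
#       import_courses(merged_course_info, seasons)
#   )
#   ...  # import_evaluations produces evaluation_statistics, among others
#   courses = courses_computed(
#       courses, listings, evaluation_statistics, course_professors
#   )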