def question_tag_invariant(session: sqlalchemy.orm.session.Session):
    """
    Check question tag invariants.

    Check invariant: all questions sharing a tag also share is_narrative
    and len(options).
    """
    # First-seen (is_narrative, option count) per tag; later questions with
    # the same tag must match it exactly.
    seen_by_tag = {}

    def _num_options(options) -> int:
        # -1 is a sentinel for "no options at all" (None or empty).
        return len(options) if options else -1

    for question in session.query(
            database.EvaluationQuestion
    ):  # type: database.EvaluationQuestion
        tag = question.tag
        if not tag:
            continue

        signature = (question.is_narrative, _num_options(question.options))
        expected = seen_by_tag.get(tag)
        if expected is None:
            seen_by_tag[tag] = signature
        elif expected != signature:
            raise database.InvariantError(f"mismatched tag {tag}")
def question_invariants(session: sqlalchemy.orm.session.Session):
    """
    Check question invariants.

    Check invariant: evaluation_questions.options is null iff
    evaluation_questions.is_narrative = True
    """
    for question in session.query(
            database.EvaluationQuestion
    ):  # type: database.EvaluationQuestion
        has_options = bool(question.options)
        if question.is_narrative:
            # Narrative (free-text) questions must not carry rating options.
            if has_options:
                raise database.InvariantError(
                    f"narrative question {question} has options")
        elif not has_options:
            # Ratings questions must always carry options.
            raise database.InvariantError(
                f"ratings question {question} lacks options")
def listing_invariants(session: sqlalchemy.orm.session.Session):
    """
    Check listing invariants.

    Check invariant: listing.season_code == course.season_code
    if listing.course_id == course.course_id.
    """
    joined = session.query(
        database.Listing.listing_id,
        database.Listing.course_id,
        database.Listing.season_code,
        database.Course.season_code,
    ).filter(database.Listing.course_id == database.Course.course_id)

    for listing_id, course_id, listing_season, course_season in joined:
        if listing_season == course_season:
            continue
        raise database.InvariantError(
            f"listing {listing_id} has mismatched season_code with course {course_id}"
        )
def assign_code(row):
    """Resolve a question row to its tag via QUESTION_TAGS."""
    code = row["question_code"]

    # These suffixes are irrelevant for tag resolution; drop the first one
    # that matches before looking the code up.
    for suffix in ("-YCWR", "-YXWR", "-SA"):
        if code.endswith(suffix):
            code = code[:-len(suffix)]
            break

    # Set the appropriate question tag.
    try:
        return QUESTION_TAGS[code]
    except KeyError as err:
        raise database.InvariantError(
            f"No associated tag for question code {code} with text {row['question_text']}"
        ) from err
def course_invariants(session: sqlalchemy.orm.session.Session):
    """
    Check course invariants.

    Check invariant: every course should have at least one listing.

    Raises
    ------
    database.InvariantError
        If one or more courses have no listings.
    """
    # BUG FIX: the original query did select_from(Listing) and outer-joined
    # Listing -> Course, so every row necessarily originated from a listing
    # and count(listing_id) == 0 could never hold -- the check was a silent
    # no-op. Start from Course and LEFT OUTER JOIN Listing instead, so
    # courses with no listings survive the join with NULL listing_ids and
    # are caught by the HAVING clause.
    courses_no_listings = (
        session.query(database.Course)
        .join(
            database.Listing,
            database.Listing.course_id == database.Course.course_id,
            isouter=True,
        )
        .group_by(database.Course.course_id)
        .having(sqlalchemy.func.count(database.Listing.listing_id) == 0)
        .all()
    )

    if courses_no_listings:
        no_listing_courses = [str(course) for course in courses_no_listings]
        raise database.InvariantError(
            f"the following courses have no listings: {', '.join(no_listing_courses)}"
        )
def average_by_course(question_tag, n_categories):
    """
    Average ratings per course for all questions carrying a given tag.

    Parameters
    ----------
    question_tag:
        Tag identifying the question variants to aggregate.
    n_categories:
        Expected number of rating categories per response array.

    Returns
    -------
    Dict mapping course_id -> average rating.

    NOTE(review): relies on `evaluation_ratings` and `average_rating`
    from the enclosing scope (closure).
    """
    tagged_ratings = evaluation_ratings[evaluation_ratings["tag"] ==
                                        question_tag].copy(deep=True)
    rating_by_course = tagged_ratings.groupby("course_id")["rating"].apply(
        list)

    # Aggregate responses across question variants (element-wise sum of the
    # per-question category counts).
    rating_by_course = rating_by_course.apply(
        lambda data: [sum(x) for x in zip(*data)])

    # Check that all the response arrays are the expected length.
    lengths_invalid = rating_by_course.apply(len) != n_categories

    if any(lengths_invalid):
        # BUG FIX: the message previously hard-coded "Expected length of 5"
        # even though the expected length is the n_categories parameter.
        raise database.InvariantError(
            "\nInvalid workload responses\n"
            f"\tExpected length of {n_categories}: "
            f"{rating_by_course[lengths_invalid]}"
        )

    rating_by_course = rating_by_course.apply(average_rating)
    return rating_by_course.to_dict()
def import_evaluations(
    evaluation_narratives: pd.DataFrame,
    evaluation_ratings: pd.DataFrame,
    evaluation_statistics: pd.DataFrame,
    evaluation_questions: pd.DataFrame,
    listings: pd.DataFrame,
) -> Tuple[pd.DataFrame, ...]:
    """
    Import course evaluations into Pandas DataFrame.

    Parameters
    ----------
    evaluation_narratives:
        Table of narratives from /ferry/crawler/parse_ratings.py.
    evaluation_ratings:
        Table of ratings from /ferry/crawler/parse_ratings.py.
    evaluation_statistics:
        Table of statistics from /ferry/crawler/parse_ratings.py.
    evaluation_questions:
        Table of questions from /ferry/crawler/parse_ratings.py.
    listings:
        Table of listings from import_courses.

    Returns
    -------
    evaluation_narratives, evaluation_ratings, evaluation_statistics,
    evaluation_questions

    Raises
    ------
    database.InvariantError
        If question texts diverge beyond QUESTION_DIVERGENCE_CUTOFF, or a
        question code mixes narrative and rating variants.
    """
    # Resolve evaluations against course listings first; note that
    # match_evaluations_to_courses returns (statistics, narratives, ratings)
    # in a different order than its arguments.
    (
        evaluation_statistics,
        evaluation_narratives,
        evaluation_ratings,
    ) = match_evaluations_to_courses(
        evaluation_narratives,
        evaluation_ratings,
        evaluation_statistics,
        listings,
    )

    # -------------------
    # Aggregate questions
    # -------------------

    # consistency checks
    print("Checking question text consistency")
    # All distinct question texts observed per question code.
    text_by_code = evaluation_questions.groupby(
        "question_code")[  # type: ignore
            "question_text"].apply(set)

    # focus on question texts with multiple variations
    text_by_code = text_by_code[text_by_code.apply(len) > 1]

    def amend_texts(texts: set) -> set:
        """
        Remove extraneous texts.

        Parameters
        ----------
        texts:
            Set of texts to amend.
        """
        # Stripping boilerplate substrings (REMOVE_TEXTS, module-level) so
        # only substantive wording differences remain.
        for remove_text in REMOVE_TEXTS:
            texts = {text.replace(remove_text, "") for text in texts}

        return texts

    text_by_code = text_by_code.apply(amend_texts)

    # add [0] at the end to account for empty lists
    max_diff_texts = max(list(text_by_code.apply(len)) + [0])
    print(
        f"Maximum number of different texts per question code: {max_diff_texts}"
    )

    # get the maximum distance between a set of texts
    def max_pairwise_distance(texts):
        # Levenshtein distance over every unordered pair of texts.
        # NOTE(review): if amend_texts collapsed a set to a single text,
        # `distances` is empty and max() would raise ValueError — confirm
        # this cannot happen upstream.
        pairs = combinations(texts, 2)
        distances = [
            textdistance.levenshtein.distance(*pair) for pair in pairs
        ]
        return max(distances)

    distances_by_code = text_by_code.apply(max_pairwise_distance)

    # add [0] at the end to account for empty lists
    max_all_distances = max(list(distances_by_code) + [0])
    print(f"Maximum text divergence within codes: {max_all_distances}")

    # Abort if any code's texts diverge too far to be the same question.
    if not all(distances_by_code < QUESTION_DIVERGENCE_CUTOFF):
        inconsistent_codes = distances_by_code[
            distances_by_code >= QUESTION_DIVERGENCE_CUTOFF]
        inconsistent_codes = list(inconsistent_codes.index)
        inconsistent_codes = ", ".join(inconsistent_codes)
        raise database.InvariantError(
            f"Error: question codes {inconsistent_codes} have divergent texts")

    print("Checking question type (narrative/rating) consistency")
    is_narrative_by_code = evaluation_questions.groupby(  # type: ignore
        "question_code")["is_narrative"].apply(set)

    # check that a question code is always narrative or always rating
    if not all(is_narrative_by_code.apply(len) == 1):
        inconsistent_codes = is_narrative_by_code[
            is_narrative_by_code.apply(len) != 1]
        inconsistent_codes = list(inconsistent_codes.index)
        inconsistent_codes = ", ".join(inconsistent_codes)
        raise database.InvariantError(
            f"Error: question codes {inconsistent_codes} have both narratives and ratings"
        )

    # deduplicate questions and keep most recent (sorted descending by
    # season, keep="first" retains the latest variant per code)
    evaluation_questions = evaluation_questions.sort_values(by="season",
                                                            ascending=False)
    evaluation_questions.drop_duplicates(  # type: ignore
        subset=["question_code"], keep="first", inplace=True)
    # Normalize missing options to an empty JSON list string.
    evaluation_questions["options"] = evaluation_questions["options"].replace(
        "NaN", "[]")

    # -------------------
    # Clean up and subset
    # -------------------

    # evaluation narratives ----------------

    # filter out missing or short comments
    evaluation_narratives.dropna(subset=["comment"], inplace=True)

    # MIN_COMMENT_LENGTH = 2
    evaluation_narratives = evaluation_narratives.loc[
        evaluation_narratives["comment"].apply(len) > 2]

    # replace carriage returns for csv-based migration
    evaluation_narratives.loc[:, "comment"] = evaluation_narratives[
        "comment"].apply(lambda x: x.replace("\r", ""))

    # id column for database primary key
    evaluation_narratives.loc[:, "id"] = list(range(
        len(evaluation_narratives)))
    evaluation_narratives.reset_index(drop=True, inplace=True)

    # evaluation ratings ----------------

    # id column for database primary key
    evaluation_ratings.loc[:, "id"] = list(range(len(evaluation_ratings)))
    evaluation_ratings.reset_index(drop=True, inplace=True)

    # evaluation questions ----------------

    # tag to be added later
    evaluation_questions["tag"] = ""
    evaluation_questions.reset_index(drop=True, inplace=True)

    # evaluation statistics ----------------

    # explicitly specify missing columns to be filled in later
    evaluation_statistics[["avg_rating", "avg_workload",
                           "enrollment"]] = np.nan

    # convert to JSON string for postgres
    evaluation_statistics.loc[:, "extras"] = evaluation_statistics[
        "extras"].apply(ujson.dumps)
    evaluation_statistics.reset_index(drop=True, inplace=True)

    # extract columns to match database ----------------
    evaluation_narratives = evaluation_narratives.loc[:, get_table_columns(
        database.models.EvaluationNarrative)]
    evaluation_ratings = evaluation_ratings.loc[:, get_table_columns(
        database.models.EvaluationRating)]
    evaluation_statistics = evaluation_statistics.loc[:, get_table_columns(
        database.models.EvaluationStatistics)]
    evaluation_questions = evaluation_questions.loc[:, get_table_columns(
        database.models.EvaluationQuestion)]

    print("[Summary]")
    print(f"Total evaluation narratives: {len(evaluation_narratives)}")
    print(f"Total evaluation ratings: {len(evaluation_ratings)}")
    print(f"Total evaluation statistics: {len(evaluation_statistics)}")
    print(f"Total evaluation questions: {len(evaluation_questions)}")

    return (
        evaluation_narratives,
        evaluation_ratings,
        evaluation_statistics,
        evaluation_questions,
    )