Exemplo n.º 1
0
def question_tag_invariant(session: sqlalchemy.orm.session.Session):
    """
    Check question tag invariants.

    Check invariant:
        all questions sharing a tag also share is_narrative and len(options)
    """

    def signature(question):
        # Normalize to (is_narrative, option count); empty/missing options
        # are represented as -1 so None and [] compare equal.
        num_options = len(question.options) if question.options else -1
        return (question.is_narrative, num_options)

    # Maps each tag to the signature of the first tagged question seen.
    seen_signatures = {}

    for question in session.query(
            database.EvaluationQuestion):  # type: database.EvaluationQuestion
        if not question.tag:
            continue

        current = signature(question)
        expected = seen_signatures.setdefault(question.tag, current)
        if current != expected:
            raise database.InvariantError(f"mismatched tag {question.tag}")
Exemplo n.º 2
0
def question_invariants(session: sqlalchemy.orm.session.Session):
    """
    Check question invariants.

    Check invariant:
        evaluation_questions.options is null iff evaluation_questions.is_narrative = True
    """
    for question in session.query(
            database.EvaluationQuestion):  # type: database.EvaluationQuestion
        has_options = bool(question.options)
        if question.is_narrative:
            # Narrative (free-text) questions must not carry answer options.
            if has_options:
                raise database.InvariantError(
                    f"narrative question {question} has options")
        elif not has_options:
            # Ratings questions must enumerate their answer options.
            raise database.InvariantError(
                f"ratings question {question} lacks options")
Exemplo n.º 3
0
def listing_invariants(session: sqlalchemy.orm.session.Session):
    """
    Check listing invariants.

    Check invariant:
        listing.season_code == course.season_code if listing.course_id == course.course_id.
    """
    # Pair every listing with its course's season code via an inner join.
    matched_rows = session.query(
        database.Listing.listing_id,
        database.Listing.course_id,
        database.Listing.season_code,
        database.Course.season_code,
    ).filter(database.Listing.course_id == database.Course.course_id)

    for row in matched_rows:
        listing_id, course_id, listing_season, course_season = row
        if listing_season != course_season:
            raise database.InvariantError(
                f"listing {listing_id} has mismatched season_code with course {course_id}"
            )
Exemplo n.º 4
0
    def assign_code(row):
        """Resolve the canonical question tag for a question-table row."""
        code = row["question_code"]

        # Variant suffixes are stripped so all variants of a question
        # resolve to the same tag; only the first matching suffix is removed.
        for variant_suffix in ("-YCWR", "-YXWR", "-SA"):
            if code.endswith(variant_suffix):
                code = code[:-len(variant_suffix)]
                break

        try:
            return QUESTION_TAGS[code]
        except KeyError as err:
            # Chain the KeyError so the unknown code is visible in tracebacks.
            raise database.InvariantError(
                f"No associated tag for question code {code} with text {row['question_text']}"
            ) from err
Exemplo n.º 5
0
def course_invariants(session: sqlalchemy.orm.session.Session):
    """
    Check course invariants.

    Check invariant:
        every course should have at least one listing.

    Raises
    ------
    database.InvariantError:
        If any course has no listings.
    """
    # BUG FIX: the original query did select_from(Listing) and left-joined
    # Course, so every result row originated from an existing listing and a
    # course with zero listings could never appear — the HAVING count == 0
    # filter was a no-op. The outer join must start from Course instead.
    courses_no_listings = (session.query(database.Course).join(
        database.Listing,
        database.Listing.course_id == database.Course.course_id,
        isouter=True,
    ).group_by(database.Course.course_id).having(
        sqlalchemy.func.count(database.Listing.listing_id) == 0)).all()

    if courses_no_listings:

        no_listing_courses = [str(course) for course in courses_no_listings]

        raise database.InvariantError(
            f"the following courses have no listings: {', '.join(no_listing_courses)}"
        )
Exemplo n.º 6
0
    def average_by_course(question_tag, n_categories):
        """
        Aggregate rating responses per course for one question tag.

        Parameters
        ----------
        question_tag:
            Tag identifying which question's ratings to aggregate.
        n_categories:
            Expected number of rating categories in each response array.

        Returns
        -------
        Dict mapping course_id to its average rating.
        """
        tagged_ratings = evaluation_ratings[evaluation_ratings["tag"] ==
                                            question_tag].copy(deep=True)
        rating_by_course = tagged_ratings.groupby("course_id")["rating"].apply(
            list)

        # Aggregate responses across question variants by summing the
        # per-category counts element-wise.
        rating_by_course = rating_by_course.apply(
            lambda data: [sum(x) for x in zip(*data)])

        # check that all the response arrays are the expected length
        lengths_invalid = rating_by_course.apply(len) != n_categories

        if any(lengths_invalid):
            # BUG FIX: the message previously hard-coded "length of 5" even
            # though the expected length is the n_categories parameter.
            raise database.InvariantError(f"""
                Invalid workload responses\n
                \tExpected length of {n_categories}: {rating_by_course[lengths_invalid]}
                """)

        rating_by_course = rating_by_course.apply(average_rating)
        rating_by_course = rating_by_course.to_dict()

        return rating_by_course
Exemplo n.º 7
0
def import_evaluations(
    evaluation_narratives: pd.DataFrame,
    evaluation_ratings: pd.DataFrame,
    evaluation_statistics: pd.DataFrame,
    evaluation_questions: pd.DataFrame,
    listings: pd.DataFrame,
) -> Tuple[pd.DataFrame, ...]:
    """
    Import course evaluations into Pandas DataFrame.

    Parameters
    ----------
    evaluation_narratives:
        Table of narratives from /ferry/crawler/parse_ratings.py.
    evaluation_ratings:
        Table of ratings from /ferry/crawler/parse_ratings.py.
    evaluation_statistics:
        Table of statistics from /ferry/crawler/parse_ratings.py.
    evaluation_questions:
        Table of questions from /ferry/crawler/parse_ratings.py.
    listings:
        Table of listings from import_courses.

    Returns
    -------
    evaluation_narratives,
    evaluation_ratings,
    evaluation_statistics,
    evaluation_questions

    Raises
    ------
    database.InvariantError:
        If question texts diverge too much within a code, or a code mixes
        narrative and rating questions.
    """
    # Attach course identifiers to the evaluation tables via listings.
    # NOTE: the return order here (statistics, narratives, ratings) differs
    # from the argument order passed in.
    (
        evaluation_statistics,
        evaluation_narratives,
        evaluation_ratings,
    ) = match_evaluations_to_courses(
        evaluation_narratives,
        evaluation_ratings,
        evaluation_statistics,
        listings,
    )

    # -------------------
    # Aggregate questions
    # -------------------

    # consistency checks
    print("Checking question text consistency")
    # Collect the set of distinct question texts per question code.
    text_by_code = evaluation_questions.groupby(
        "question_code")[  # type: ignore
            "question_text"].apply(set)

    # focus on question texts with multiple variations
    text_by_code = text_by_code[text_by_code.apply(len) > 1]

    def amend_texts(texts: set) -> set:
        """
        Remove extraneous texts.

        Parameters
        ----------
        texts:
            Set of texts to amend.

        Returns
        -------
        The texts with every REMOVE_TEXTS substring stripped out.
        """

        for remove_text in REMOVE_TEXTS:

            texts = {text.replace(remove_text, "") for text in texts}

        return texts

    text_by_code = text_by_code.apply(amend_texts)

    # add [0] at the end to account for empty lists
    max_diff_texts = max(list(text_by_code.apply(len)) + [0])
    print(
        f"Maximum number of different texts per question code: {max_diff_texts}"
    )

    # get the maximum distance between a set of texts
    def max_pairwise_distance(texts):
        """Return the largest Levenshtein distance between any two texts.

        Assumes len(texts) >= 2 (guaranteed by the len > 1 filter above);
        max() would raise on an empty sequence otherwise.
        """

        pairs = combinations(texts, 2)
        distances = [
            textdistance.levenshtein.distance(*pair) for pair in pairs
        ]

        return max(distances)

    distances_by_code = text_by_code.apply(max_pairwise_distance)
    # add [0] at the end to account for empty lists
    max_all_distances = max(list(distances_by_code) + [0])

    print(f"Maximum text divergence within codes: {max_all_distances}")

    # Texts under the same code must be near-identical after amendment;
    # anything past the cutoff indicates distinct questions sharing a code.
    if not all(distances_by_code < QUESTION_DIVERGENCE_CUTOFF):

        inconsistent_codes = distances_by_code[
            distances_by_code >= QUESTION_DIVERGENCE_CUTOFF]
        inconsistent_codes = list(inconsistent_codes.index)
        inconsistent_codes = ", ".join(inconsistent_codes)

        raise database.InvariantError(
            f"Error: question codes {inconsistent_codes} have divergent texts")

    print("Checking question type (narrative/rating) consistency")
    is_narrative_by_code = evaluation_questions.groupby(  # type: ignore
        "question_code")["is_narrative"].apply(set)

    # check that a question code is always narrative or always rating
    if not all(is_narrative_by_code.apply(len) == 1):
        inconsistent_codes = is_narrative_by_code[
            is_narrative_by_code.apply(len) != 1]
        inconsistent_codes = list(inconsistent_codes.index)
        inconsistent_codes = ", ".join(inconsistent_codes)
        raise database.InvariantError(
            f"Error: question codes {inconsistent_codes} have both narratives and ratings"
        )

    # deduplicate questions and keep most recent
    # (descending season sort + keep="first" retains the latest season's row)
    evaluation_questions = evaluation_questions.sort_values(by="season",
                                                            ascending=False)
    evaluation_questions.drop_duplicates(  # type: ignore
        subset=["question_code"], keep="first", inplace=True)

    # NOTE(review): this replaces the literal string "NaN", not actual NaN
    # values — presumably the crawler serializes missing options as "NaN";
    # confirm against parse_ratings.py output.
    evaluation_questions["options"] = evaluation_questions["options"].replace(
        "NaN", "[]")

    # -------------------
    # Clean up and subset
    # -------------------

    # evaluation narratives ----------------

    # filter out missing or short comments
    evaluation_narratives.dropna(subset=["comment"], inplace=True)

    # MIN_COMMENT_LENGTH = 2
    # NOTE(review): strict > 2 keeps comments of length 3+, so the effective
    # minimum is 3, not 2 — confirm which was intended.
    evaluation_narratives = evaluation_narratives.loc[
        evaluation_narratives["comment"].apply(len) > 2]
    # replace carriage returns for csv-based migration
    evaluation_narratives.loc[:, "comment"] = evaluation_narratives[
        "comment"].apply(lambda x: x.replace("\r", ""))
    # id column for database primary key
    evaluation_narratives.loc[:,
                              "id"] = list(range(len(evaluation_narratives)))
    evaluation_narratives.reset_index(drop=True, inplace=True)

    # evaluation ratings ----------------

    # id column for database primary key
    evaluation_ratings.loc[:, "id"] = list(range(len(evaluation_ratings)))
    evaluation_ratings.reset_index(drop=True, inplace=True)

    # evaluation questions ----------------

    # tag to be added later
    evaluation_questions["tag"] = ""
    evaluation_questions.reset_index(drop=True, inplace=True)

    # evaluation statistics ----------------

    # explicitly specify missing columns to be filled in later
    evaluation_statistics[["avg_rating", "avg_workload",
                           "enrollment"]] = np.nan
    # convert to JSON string for postgres
    evaluation_statistics.loc[:, "extras"] = evaluation_statistics[
        "extras"].apply(ujson.dumps)
    evaluation_statistics.reset_index(drop=True, inplace=True)

    # extract columns to match database  ----------------
    # Subset each frame to exactly the columns of its target ORM model so the
    # downstream CSV/copy-based migration lines up with the table schema.
    evaluation_narratives = evaluation_narratives.loc[:,
                                                      get_table_columns(
                                                          database.models.
                                                          EvaluationNarrative)]
    evaluation_ratings = evaluation_ratings.loc[:,
                                                get_table_columns(
                                                    database.models.
                                                    EvaluationRating)]
    evaluation_statistics = evaluation_statistics.loc[:,
                                                      get_table_columns(
                                                          database.models.
                                                          EvaluationStatistics
                                                      )]
    evaluation_questions = evaluation_questions.loc[:,
                                                    get_table_columns(
                                                        database.models.
                                                        EvaluationQuestion)]

    print("[Summary]")
    print(f"Total evaluation narratives: {len(evaluation_narratives)}")
    print(f"Total evaluation ratings: {len(evaluation_ratings)}")
    print(f"Total evaluation statistics: {len(evaluation_statistics)}")
    print(f"Total evaluation questions: {len(evaluation_questions)}")

    return (
        evaluation_narratives,
        evaluation_ratings,
        evaluation_statistics,
        evaluation_questions,
    )