user_topic - tuple of (user, topic ID)
    user_segment - group(s) that the user is a member of for dashboard
        comparison purposes (eg. A/B test experiments, has coach, etc.)
    """

    is_test = lambda info: info.get('purpose', None) == 'randomized'
    test_cards = [(i, x[0]) for i, x in enumerate(attempts, 1) if
                 is_test(x[2])]

    for i in range(1, len(test_cards)):
        prev_card, curr_card = test_cards[i - 1], test_cards[i]
        total_gain = float(curr_card[1]) - float(prev_card[1])
        incremental_gain = total_gain / (curr_card[0] - prev_card[0])
        topic = user_topic[1] if user_topic and len(user_topic) >= 2 else None
        if topic == "any":
            # it is not cool to analyze cards done from various stacks
            # as if they were done with one big, generic stack.  for example,
            # if a user moved from an easy to a difficult topic, you would 
            # likely see accuracy drop on the randomized cards, even though
            # this is very healthy user behavior.
            return
        for i in range(prev_card[0], curr_card[0]):
            # TODO(david): Output and group by user segments (eg.
            #     experiments the user was in).
            print '%s\t%s\t%s\t%s\t%s' % (topic, user_segment, len(attempts),
                    i, incremental_gain)


# Script entry point: only when this file is executed directly (not imported),
# hand emit_accuracy_deltas to table_parser.parse_user_topic_input as the
# callback.
# NOTE(review): presumably the parser reads the input rows and invokes the
# callback once per (user, topic) group -- confirm in table_parser.
if __name__ == '__main__':
    table_parser.parse_user_topic_input(emit_accuracy_deltas)
# Example no. 2
# 0
def emit_topic_retention(attempts, user_topic, user_segment):
    """Outputs a row for every (topic, segment, card number) to be aggregated
    in Hive.

    More precisely, each output row is a tab-separated line with the values
    <topic, user segment, randomized?, "card_number", card #, correct (1 or 0)>

    attempts - a list of exercise card attempts in a topic, ordered by time
        done. Each element is a tuple (bool correct, int problem_number,
        dict scheduler_info).
    user_topic - tuple of (user, topic ID); only the topic ID is emitted
    user_segment - group(s) that the user is a member of for dashboard
        comparison purposes (eg. A/B test experiments, has coach, etc.)

    Returns nothing. One row per attempt is written to stdout, with card
    numbers starting at 1. Raises ValueError/TypeError if user_topic cannot
    be unpacked into exactly two items.
    """
    def is_randomized(info):
        # Hive only casts empty strings from custom scripts to false
        return 'TRUE' if info.get('purpose', None) == 'randomized' else ''

    # Strict two-way unpack so a malformed pair still fails loudly; the user
    # half is not part of the output, hence the throwaway name.
    _, topic = user_topic

    # Output retention stats by card number
    # TODO(david): Output time taken buckets
    for card_number, attempt in enumerate(attempts, 1):
        # A single parenthesized argument prints identically whether `print`
        # is the Python 2 statement or the Python 3 function.
        print('%s\t%s\t%s\t%s\t%s\t%s' % (
            topic, user_segment, is_randomized(attempt[2]),
            "card_number", card_number, int(attempt[0])))


# Script entry point: only when this file is executed directly (not imported),
# hand emit_topic_retention to table_parser.parse_user_topic_input as the
# callback.
# NOTE(review): presumably the parser reads the input rows and invokes the
# callback once per (user, topic) group -- confirm in table_parser.
if __name__ == '__main__':
    table_parser.parse_user_topic_input(emit_topic_retention)
def emit_topic_retention(attempts, user_topic, user_segment):
    """Outputs a row for every (topic, segment, card number) to be aggregated
    in Hive.

    More precisely, each output row is a tab-separated line with the values
    <topic, user segment, randomized?, "card_number", card #, correct (1 or 0)>

    attempts - a list of exercise card attempts in a topic, ordered by time
        done. Each element is a tuple (bool correct, int problem_number,
        dict scheduler_info).
    user_topic - tuple of (user, topic ID); only the topic ID is emitted
    user_segment - group(s) that the user is a member of for dashboard
        comparison purposes (eg. A/B test experiments, has coach, etc.)

    Returns nothing. One row per attempt is written to stdout, with card
    numbers starting at 1. Raises ValueError/TypeError if user_topic cannot
    be unpacked into exactly two items.
    """
    def is_randomized(info):
        # Hive only casts empty strings from custom scripts to false
        return 'TRUE' if info.get('purpose', None) == 'randomized' else ''

    # The user half of the pair is never emitted; keep the exact two-item
    # unpack so a malformed pair is still rejected.
    _, topic = user_topic

    # Output retention stats by card number
    # TODO(david): Output time taken buckets
    row_format = '%s\t%s\t%s\t%s\t%s\t%s'
    for card_number, attempt in enumerate(attempts, 1):
        row = row_format % (topic, user_segment, is_randomized(attempt[2]),
                            "card_number", card_number, int(attempt[0]))
        # Parenthesized single argument: same output under the Python 2
        # print statement and the Python 3 print function.
        print(row)


# Script entry point: only when this file is executed directly (not imported),
# hand emit_topic_retention to table_parser.parse_user_topic_input as the
# callback.
# NOTE(review): presumably the parser reads the input rows and invokes the
# callback once per (user, topic) group -- confirm in table_parser.
if __name__ == '__main__':
    table_parser.parse_user_topic_input(emit_topic_retention)