def session_length(answers):
    '''
    Compute the average length of a session according to the session number.

    The length of one session is the number of answers a single user has
    under a given session number; the lengths are averaged over users.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated by 'session_number',
            it will be decorated
    Return:
        dict: session number -> average number of answers per user session
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)
    # One entry per (user, session_number) pair holding the number of answers.
    answers_per_user_session = data.groupby(['user', 'session_number']).size()
    # Average the per-user counts for every session number.
    return (answers_per_user_session.
        groupby(level='session_number').
        mean().
        to_dict())
def session_prior_skill_diffs(answers, difficulty_data, session_number_first, session_number_second):
    '''
    Collect, per user, the prior skill difference between two session numbers.

    The per-user value is produced by _session_prior_skill_diff_for_user; users
    for which it yields a missing value (NaN) are dropped from the result.
    NOTE(review): the original documentation states that the prior skill is
    computed only for places answered in both sessions and that users with
    less than 10 places are ignored -- presumably enforced inside the helper,
    confirm there.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated by 'session_number',
            it will be decorated
        difficulty_data (dict):
            place -> difficulty
        session_number_first (int)
        session_number_second (int)
    Return:
        numpy.ndarray: differences of prior skill for the given session numbers
    '''
    decorated = answers if 'session_number' in answers else decorator.session_number(answers)

    def diff_for_user(user_answers):
        return _session_prior_skill_diff_for_user(
            user_answers, difficulty_data, session_number_first, session_number_second)

    diffs = decorated.groupby('user').apply(diff_for_user)
    return diffs.dropna().values
def plot_session_length(figure, answers, portion_min=0.01, verbose=False):
    '''
    Plot the average session length and the number of users per session number.

    Three curves are drawn into a single subplot of the figure:
      - solid blue (left axis): average session length over all users,
      - dashed blue (left axis): average session length computed only from
        users having an answer in the last plotted session,
      - dash-dot red (right, logarithmic axis): number of users.

    Only session numbers up to the highest one whose user portion (see
    session.session_user_portion) is at least portion_min are plotted.

    Args:
        figure:
            object providing add_subplot (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated by 'session_number',
            it will be decorated
        portion_min (float):
            minimal portion of users a session number needs to be plotted
        verbose (bool):
            unused, kept for interface compatibility
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)

    def sorted_xy(mapping):
        # Dict order is not guaranteed to follow the session number, so sort
        # to draw the line left to right. Building lists instead of indexing
        # zip(...) keeps this working on Python 3 as well.
        pairs = sorted(mapping.items())
        return [key for key, _ in pairs], [value for _, value in pairs]

    # The already decorated frame is passed to avoid decorating twice.
    session_limit = max([session_number if portion >= portion_min else 0
        for session_number, portion in session.session_user_portion(data).items()])
    data = data[data['session_number'] <= session_limit]

    length_x, length_y = sorted_xy(session.session_length(data))
    ax1 = figure.add_subplot(111)
    ax1.plot(length_x, length_y, 'b-')
    ax1.set_xlabel('session number')
    ax1.set_ylabel('session length', color='b')
    for tl in ax1.get_yticklabels():
        tl.set_color('b')

    # Users who reached the last plotted session; data is already restricted
    # to session numbers <= session_limit, so no further filtering is needed.
    users_for_limit = data[data['session_number'] == session_limit]['user'].values
    data_for_limit = data[data['user'].isin(users_for_limit)]
    limit_x, limit_y = sorted_xy(session.session_length(data_for_limit))
    ax1.plot(limit_x, limit_y, 'b--')

    users_x, users_y = sorted_xy(session.session_users(data))
    ax2 = ax1.twinx()
    ax2.set_yscale('log')
    ax2.set_ylabel('number of users', color='r')
    ax2.plot(users_x, users_y, 'r-.', linewidth=2)
    for tl in ax2.get_yticklabels():
        tl.set_color('r')
# Example #4
# 0
def plot_session_success(figure, answers, portion_min=0.01, verbose=False):
    '''
    Plot the success rate and the number of users per session number.

    The success rate is drawn as a solid blue line on the left axis, the
    number of users as a dash-dot red line on the right, logarithmic axis.
    Only session numbers up to the highest one whose user portion (see
    session.session_user_portion) is at least portion_min are plotted.

    Args:
        figure:
            object providing add_subplot (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated by 'session_number',
            it will be decorated
        portion_min (float):
            minimal portion of users a session number needs to be plotted
        verbose (bool):
            unused, kept for interface compatibility
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)

    def sorted_xy(mapping):
        # Dict order is not guaranteed to follow the session number, so sort
        # to draw the line left to right. Building lists instead of indexing
        # zip(...) keeps this working on Python 3 as well.
        pairs = sorted(mapping.items())
        return [key for key, _ in pairs], [value for _, value in pairs]

    # The already decorated frame is passed to avoid decorating twice.
    session_limit = max([
        session_number if portion >= portion_min else 0
        for session_number, portion in session.session_user_portion(data).items()
    ])
    data = data[data['session_number'] <= session_limit]

    success_x, success_y = sorted_xy(session.session_success(data))
    users_x, users_y = sorted_xy(session.session_users(data))
    ax1 = figure.add_subplot(111)
    ax1.plot(success_x, success_y, 'b-')
    ax1.set_xlabel('session number')
    ax1.set_ylabel('success rate', color='b')
    for tl in ax1.get_yticklabels():
        tl.set_color('b')

    ax2 = ax1.twinx()
    ax2.set_yscale('log')
    ax2.set_ylabel('number of users', color='r')
    ax2.plot(users_x, users_y, 'r-.', linewidth=2)
    for tl in ax2.get_yticklabels():
        tl.set_color('r')
def plot_answers_vs_prior_skill(figure, answers, prior_skill, verbose=False):
    '''
    Scatter-plot the prior skill of every user against the user's answer count.

    Args:
        figure:
            object providing add_subplot (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            decorator.session_number before use
        prior_skill:
            mapping indexable by user id, giving the user's prior skill
        verbose (bool):
            unused
    '''
    decorated = decorator.session_number(answers)
    axes = figure.add_subplot(111)
    answer_counts = user.answers_per_user(decorated)
    user_ids = decorated['user'].unique()
    counts = [answer_counts[user_id] for user_id in user_ids]
    skills = [prior_skill[user_id] for user_id in user_ids]
    # Sort the (count, skill) points by count before splitting them again.
    points = sorted(zip(counts, skills))
    sorted_counts, sorted_skills = zip(*points)
    axes.plot(sorted_counts, sorted_skills, 'o', alpha=0.3, linewidth=0, color='black')
    axes.set_xlabel('number of answer at all')
    axes.set_ylabel('prior skill')
    axes.set_xscale('log')
# Example #6
# 0
def plot_answers_vs_prior_skill(figure, answers, prior_skill, verbose=False):
    '''
    Draw one point per user: number of answers (x, logarithmic) versus prior
    skill (y).

    Args:
        figure:
            object providing add_subplot (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            decorator.session_number before use
        prior_skill:
            mapping indexable by user id, giving the user's prior skill
        verbose (bool):
            unused
    '''
    answers = decorator.session_number(answers)
    subplot = figure.add_subplot(111)
    per_user_total = user.answers_per_user(answers)
    users = answers['user'].unique()

    def pick(mapping):
        # Values of the mapping in the order of the users array.
        return [mapping[user_id] for user_id in users]

    ordered = sorted(zip(pick(per_user_total), pick(prior_skill)))
    xs, ys = zip(*ordered)
    subplot.plot(xs, ys, 'o', alpha=0.3, linewidth=0, color='black')
    subplot.set_xlabel('number of answer at all')
    subplot.set_ylabel('prior skill')
    subplot.set_xscale('log')
def plot_first_session_vs_session_number(figure, answers, verbose=False):
    '''
    Scatter-plot, per user, the value from user.session_per_user against the
    number of answers with session number 0.

    Users missing from either mapping are plotted with the value 0.

    Args:
        figure:
            object providing add_subplot (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            decorator.session_number before use
        verbose (bool):
            unused
    '''
    decorated = decorator.session_number(answers)
    axes = figure.add_subplot(111)
    sessions_by_user = user.session_per_user(decorated)
    first_session_answers = decorated[decorated['session_number'] == 0]
    first_counts_by_user = user.answers_per_user(first_session_answers)
    user_ids = decorated['user'].unique()
    first_counts = [first_counts_by_user.get(user_id, 0) for user_id in user_ids]
    session_counts = [sessions_by_user.get(user_id, 0) for user_id in user_ids]
    # Sort the points by the first-session count before splitting them again.
    xs, ys = zip(*sorted(zip(first_counts, session_counts)))
    axes.plot(xs, ys, 'o', alpha=0.3, linewidth=0, color='black')
    axes.set_xlabel('number of answers in the first session')
    axes.set_ylabel('maximal session number')
    axes.set_xscale('log')
def user_ratio(answers, session_number=None, answer_number_min=None, answer_number_max=None):
    '''
    Count the users satisfying the given filters.

    A user is counted when every given (non-None) condition holds:
      - the user's maximal 'session_number' is at least session_number,
      - the user has at least answer_number_min answers,
      - the user has at most answer_number_max answers.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            decorator.session_number before use
        session_number (int, optional)
        answer_number_min (int, optional)
        answer_number_max (int, optional)
    Return:
        tuple: (number of users satisfying the filters, number of all users)
    '''
    answers = decorator.session_number(answers)

    def user_ratio_filter(data):
        if session_number is not None and session_number > data['session_number'].max():
            return False
        answer_count = len(data)
        # Check both bounds independently so that giving answer_number_min
        # does not disable the answer_number_max check.
        if answer_number_min is not None and answer_count < answer_number_min:
            return False
        if answer_number_max is not None and answer_count > answer_number_max:
            return False
        return True
    return sum(answers.groupby('user').apply(user_ratio_filter)), answers['user'].nunique()
# Example #9
# 0
def plot_first_session_vs_session_number(figure, answers, verbose=False):
    '''
    Draw one point per user: answers with session number 0 (x, logarithmic)
    versus the value reported by user.session_per_user (y).

    A user missing from either mapping contributes the value 0.

    Args:
        figure:
            object providing add_subplot (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            decorator.session_number before use
        verbose (bool):
            unused
    '''
    answers = decorator.session_number(answers)
    subplot = figure.add_subplot(111)
    per_user_sessions = user.session_per_user(answers)
    in_first_session = answers['session_number'] == 0
    per_user_first = user.answers_per_user(answers[in_first_session])
    users = answers['user'].unique()

    def pick(mapping):
        # Values of the mapping in the order of the users array, 0 if absent.
        return [mapping.get(user_id, 0) for user_id in users]

    ordered = sorted(zip(pick(per_user_first), pick(per_user_sessions)))
    first_axis, session_axis = zip(*ordered)
    subplot.plot(first_axis, session_axis, 'o', alpha=0.3, linewidth=0, color='black')
    subplot.set_xlabel('number of answers in the first session')
    subplot.set_ylabel('maximal session number')
    subplot.set_xscale('log')
def plot_first_session_vs_total(figure, answers, verbose=False):
    '''
    Scatter-plot, per user, the number of answers outside the first session
    against the number of answers in the first session (session number 0).

    Both axes are logarithmic. Users missing from a mapping count as 0.

    Args:
        figure:
            object providing add_subplot (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            decorator.session_number before use
        verbose (bool):
            unused, kept for interface compatibility
    '''
    answers = decorator.session_number(answers)
    ax = figure.add_subplot(111)
    total = user.answers_per_user(answers)
    total_first = user.answers_per_user(answers[answers['session_number'] == 0])
    users = answers['user'].unique()
    first_counts = [total_first.get(i, 0) for i in users]
    total_counts = [total.get(i, 0) for i in users]
    # A comprehension with tuple unpacking replaces the Python 2-only
    # `lambda (x, y): ...` form, which is a syntax error on Python 3.
    pairs = [(first, overall - first)
             for first, overall in sorted(zip(first_counts, total_counts))]
    total_first, total = zip(*pairs)
    ax.plot(total_first, total, 'o', alpha=0.3, linewidth=0, color='black')
    ax.set_xlabel('number of answers in the first session')
    # NOTE(review): the plotted y value is the total minus the first-session
    # count, although the label reads as the overall total -- confirm intent.
    ax.set_ylabel('number of answer at all')
    ax.set_xscale('log')
    ax.set_yscale('log')
# Example #11
# 0
def plot_first_session_vs_total(figure, answers, verbose=False):
    '''
    Draw one point per user: answers in the first session (session number 0)
    on the x axis versus the remaining answers (total minus first session) on
    the y axis, both on logarithmic scales.

    Users missing from a mapping count as 0.

    Args:
        figure:
            object providing add_subplot (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            decorator.session_number before use
        verbose (bool):
            unused, kept for interface compatibility
    '''
    answers = decorator.session_number(answers)
    ax = figure.add_subplot(111)
    total = user.answers_per_user(answers)
    total_first = user.answers_per_user(
        answers[answers['session_number'] == 0])
    users = answers['user'].unique()

    def vals(mapping):
        # Values of the mapping in the order of the users array, 0 if absent.
        return [mapping.get(i, 0) for i in users]

    # Unpacking in the for-clause replaces the Python 2-only
    # `lambda (x, y): ...` form, which is a syntax error on Python 3.
    pairs = [(x, y - x) for x, y in sorted(zip(vals(total_first), vals(total)))]
    total_first, total = zip(*pairs)
    ax.plot(total_first, total, 'o', alpha=0.3, linewidth=0, color='black')
    ax.set_xlabel('number of answers in the first session')
    # NOTE(review): the plotted y value is the total minus the first-session
    # count, although the label reads as the overall total -- confirm intent.
    ax.set_ylabel('number of answer at all')
    ax.set_xscale('log')
    ax.set_yscale('log')
def user_ratio(answers,
               session_number=None,
               answer_number_min=None,
               answer_number_max=None):
    '''
    Count the users satisfying the given filters.

    A user is counted when every given (non-None) condition holds:
      - the user's maximal 'session_number' is at least session_number,
      - the user has at least answer_number_min answers,
      - the user has at most answer_number_max answers.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            decorator.session_number before use
        session_number (int, optional)
        answer_number_min (int, optional)
        answer_number_max (int, optional)
    Return:
        tuple: (number of users satisfying the filters, number of all users)
    '''
    answers = decorator.session_number(answers)

    def user_ratio_filter(data):
        if session_number is not None and session_number > data[
                'session_number'].max():
            return False
        answer_count = len(data)
        # Check both bounds independently so that giving answer_number_min
        # does not disable the answer_number_max check.
        if answer_number_min is not None and answer_count < answer_number_min:
            return False
        if answer_number_max is not None and answer_count > answer_number_max:
            return False
        return True

    return sum(answers.groupby('user').apply(
        user_ratio_filter)), answers['user'].nunique()
def session_user_portion(answers):
    '''
    For each session number compute the portion of all users having at least
    one answer with it.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated by 'session_number',
            it will be decorated
    Return:
        dict: session number -> portion of users (distinct users in the
        session divided by distinct users overall)
    '''
    decorated = answers if 'session_number' in answers else decorator.session_number(answers)
    # Float denominator keeps the division fractional on Python 2 as well.
    total_users = float(decorated['user'].nunique())

    def portion_of_users(session_answers):
        return session_answers['user'].nunique() / total_users

    portions = decorated.groupby('session_number').apply(portion_of_users)
    return portions.to_dict()
def session_prior_skill(answers, difficulty_data):
    '''
    Compute average prior skill for each session.

    For every session number the answers are passed to
    difficulty.prepare_difficulty_and_prior_skill and the values of the second
    element of its result are averaged.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated by 'session_number',
            it will be decorated
        difficulty_data (dict):
            place -> difficulty
    Return:
        dict: session number -> prior skill
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)

    def mean_prior_skill(session_answers):
        prior_skills = difficulty.prepare_difficulty_and_prior_skill(
            session_answers, difficulty_data)[1]
        # Materialize the values: on Python 3 dict.values() is a view that
        # np.mean cannot average directly.
        return np.mean(list(prior_skills.values()))

    return (data.
        groupby('session_number').
        apply(mean_prior_skill).
        to_dict())
def session_success(answers):
    '''
    Compute the success rate for each session number.

    The success rate is first computed per (user, session number) pair as the
    share of answers where 'place_asked' equals 'place_answered', and then
    averaged over users for every session number.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated by 'session_number',
            it will be decorated
    Return:
        dict: session number -> success rate
    '''
    decorated = answers if 'session_number' in answers else decorator.session_number(answers)

    def success_rate(user_session_answers):
        correct = user_session_answers['place_asked'] == user_session_answers['place_answered']
        return float(correct.sum()) / float(len(user_session_answers))

    def mean_success(session_rows):
        return session_rows['success'].mean()

    per_user_session = decorated.groupby(['user', 'session_number']).apply(success_rate)
    # Turn the group keys back into columns; the unnamed result column is 0.
    success_table = per_user_session.reset_index().rename(columns={0: 'success'})
    return success_table.groupby('session_number').apply(mean_success).to_dict()
def session_users(answers):
    '''
    Compute number of users having answers in the given sessions according to
    the session number.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated by 'session_number',
            it will be decorated
    Return:
        dict: session number -> number of distinct users
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)
    # Count distinct users per session number directly; the result is already
    # indexed by session number, so no index round trip is needed.
    return data.groupby('session_number')['user'].nunique().to_dict()
# Example #17
# 0
def plot_session_length(figure, answers, portion_min=0.01, verbose=False):
    '''
    Plot the average session length and the number of users per session number.

    Three curves are drawn into a single subplot of the figure:
      - solid blue (left axis): average session length over all users,
      - dashed blue (left axis): average session length computed only from
        users having an answer in the last plotted session,
      - dash-dot red (right, logarithmic axis): number of users.

    Only session numbers up to the highest one whose user portion (see
    session.session_user_portion) is at least portion_min are plotted.

    Args:
        figure:
            object providing add_subplot (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated by 'session_number',
            it will be decorated
        portion_min (float):
            minimal portion of users a session number needs to be plotted
        verbose (bool):
            unused, kept for interface compatibility
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)

    def sorted_xy(mapping):
        # Dict order is not guaranteed to follow the session number, so sort
        # to draw the line left to right. Building lists instead of indexing
        # zip(...) keeps this working on Python 3 as well.
        pairs = sorted(mapping.items())
        return [key for key, _ in pairs], [value for _, value in pairs]

    # The already decorated frame is passed to avoid decorating twice.
    session_limit = max([
        session_number if portion >= portion_min else 0
        for session_number, portion in session.session_user_portion(data).items()
    ])
    data = data[data['session_number'] <= session_limit]

    length_x, length_y = sorted_xy(session.session_length(data))
    ax1 = figure.add_subplot(111)
    ax1.plot(length_x, length_y, 'b-')
    ax1.set_xlabel('session number')
    ax1.set_ylabel('session length', color='b')
    for tl in ax1.get_yticklabels():
        tl.set_color('b')

    # Users who reached the last plotted session; data is already restricted
    # to session numbers <= session_limit, so no further filtering is needed.
    users_for_limit = data[data['session_number'] ==
                           session_limit]['user'].values
    data_for_limit = data[data['user'].isin(users_for_limit)]
    limit_x, limit_y = sorted_xy(session.session_length(data_for_limit))
    ax1.plot(limit_x, limit_y, 'b--')

    users_x, users_y = sorted_xy(session.session_users(data))
    ax2 = ax1.twinx()
    ax2.set_yscale('log')
    ax2.set_ylabel('number of users', color='r')
    ax2.plot(users_x, users_y, 'r-.', linewidth=2)
    for tl in ax2.get_yticklabels():
        tl.set_color('r')
def plot_session_success(figure, answers, portion_min=0.01, verbose=False):
    '''
    Plot the success rate and the number of users per session number.

    The success rate is drawn as a solid blue line on the left axis, the
    number of users as a dash-dot red line on the right, logarithmic axis.
    Only session numbers up to the highest one whose user portion (see
    session.session_user_portion) is at least portion_min are plotted.

    Args:
        figure:
            object providing add_subplot (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated by 'session_number',
            it will be decorated
        portion_min (float):
            minimal portion of users a session number needs to be plotted
        verbose (bool):
            unused, kept for interface compatibility
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)

    def sorted_xy(mapping):
        # Dict order is not guaranteed to follow the session number, so sort
        # to draw the line left to right. Building lists instead of indexing
        # zip(...) keeps this working on Python 3 as well.
        pairs = sorted(mapping.items())
        return [key for key, _ in pairs], [value for _, value in pairs]

    # The already decorated frame is passed to avoid decorating twice.
    session_limit = max([session_number if portion >= portion_min else 0
        for session_number, portion in session.session_user_portion(data).items()])
    data = data[data['session_number'] <= session_limit]

    success_x, success_y = sorted_xy(session.session_success(data))
    users_x, users_y = sorted_xy(session.session_users(data))
    ax1 = figure.add_subplot(111)
    ax1.plot(success_x, success_y, 'b-')
    ax1.set_xlabel('session number')
    ax1.set_ylabel('success rate', color='b')
    for tl in ax1.get_yticklabels():
        tl.set_color('b')

    ax2 = ax1.twinx()
    ax2.set_yscale('log')
    ax2.set_ylabel('number of users', color='r')
    ax2.plot(users_x, users_y, 'r-.', linewidth=2)
    for tl in ax2.get_yticklabels():
        tl.set_color('r')
def session_per_user(answers):
    '''
    Compute the maximal session number of every user.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            decorator.session_number before use
    Return:
        dict: user -> maximal value of 'session_number' among the user's answers
    '''
    decorated = decorator.session_number(answers)

    def last_session(user_answers):
        return user_answers['session_number'].max()

    per_user = decorated.groupby('user').apply(last_session)
    return per_user.to_dict()
def session_per_user(answers):
    '''
    Map every user to the highest 'session_number' found among the user's
    answers.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            decorator.session_number before use
    Return:
        dict: user -> maximal session number
    '''
    answers = decorator.session_number(answers)

    def max_session_number(rows):
        return rows['session_number'].max()

    grouped_by_user = answers.groupby('user')
    return grouped_by_user.apply(max_session_number).to_dict()