def session_length(answers):
    '''
    Compute the average session length for each session number.

    The length of one session is the number of answers a single user gave
    in it; these lengths are averaged over all users having the given
    session number.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated
            by 'session_number', it will be decorated

    Return:
        dict: session number -> average number of answers per session
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)
    # One entry per (user, session_number) pair holding the answer count.
    lengths = data.groupby(['user', 'session_number']).size()
    # Average the per-user counts separately for every session number.
    return lengths.groupby(level='session_number').mean().to_dict()
def session_prior_skill_diffs(answers, difficulty_data, session_number_first, session_number_second):
    '''
    Compute the per-user difference of prior skill between two sessions.

    The difference itself is computed by
    '_session_prior_skill_diff_for_user', which is called once per user.
    According to the original documentation the prior skill is computed
    independently for both sessions, only for the places answered in both
    of them, and users with less than 10 places are ignored; that logic
    lives in the helper and is not visible here.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated
            by 'session_number', it will be decorated
        difficulty_data (dict):
            place -> difficulty
        session_number_first (int)
        session_number_second (int)

    Return:
        array of differences of prior skill for the given session numbers
        (the '.values' of the per-user results with missing values dropped)
    '''
    data = answers if 'session_number' in answers else decorator.session_number(answers)

    def diff_for_user(user_answers):
        return _session_prior_skill_diff_for_user(
            user_answers,
            difficulty_data,
            session_number_first,
            session_number_second)

    per_user = data.groupby('user').apply(diff_for_user)
    # Users for whom the helper produced no value are left out.
    return per_user.dropna().values
def plot_session_length(figure, answers, portion_min=0.01, verbose=False):
    '''
    Plot the average session length and the number of users against the
    session number.

    The left axis shows the session length: a solid line for all users and
    a dashed line only for the users who reached the last plotted session.
    The right (logarithmic) axis shows the number of users.

    Args:
        figure:
            object providing 'add_subplot' (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated
            by 'session_number', it will be decorated
        portion_min (float):
            sessions are plotted up to the highest session number whose
            portion of users is at least this value
        verbose (bool):
            not used
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)
    # Pass the decorated frame so the session numbers are not computed twice.
    portions = session.session_user_portion(data)
    session_limit = max([
        number if portion >= portion_min else 0
        for number, portion in portions.items()])
    data = data[data['session_number'] <= session_limit]

    # Sort by session number: dict ordering is not guaranteed everywhere and
    # an unsorted x axis would draw a zigzag line. Unpacking the zip (instead
    # of indexing it) also works when zip returns an iterator.
    numbers, lengths = zip(*sorted(session.session_length(data).items()))
    ax1 = figure.add_subplot(111)
    ax1.plot(numbers, lengths, 'b-')
    ax1.set_xlabel('session number')
    ax1.set_ylabel('session length', color='b')
    for tl in ax1.get_yticklabels():
        tl.set_color('b')

    # The same curve restricted to users who reached the last plotted
    # session; 'data' is already cut at session_limit.
    users_for_limit = data[data['session_number'] == session_limit]['user'].values
    data_for_limit = data[data['user'].isin(users_for_limit)]
    numbers, lengths = zip(*sorted(session.session_length(data_for_limit).items()))
    ax1.plot(numbers, lengths, 'b--')

    numbers, user_counts = zip(*sorted(session.session_users(data).items()))
    ax2 = ax1.twinx()
    ax2.set_yscale('log')
    ax2.set_ylabel('number of users', color='r')
    ax2.plot(numbers, user_counts, 'r-.', linewidth=2)
    for tl in ax2.get_yticklabels():
        tl.set_color('r')
def plot_session_success(figure, answers, portion_min=0.01, verbose=False):
    '''
    Plot the success rate and the number of users against the session
    number.

    The left axis shows the success rate, the right (logarithmic) axis
    shows the number of users.

    Args:
        figure:
            object providing 'add_subplot' (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated
            by 'session_number', it will be decorated
        portion_min (float):
            sessions are plotted up to the highest session number whose
            portion of users is at least this value
        verbose (bool):
            not used
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)
    # Pass the decorated frame so the session numbers are not computed twice.
    portions = session.session_user_portion(data)
    session_limit = max([
        number if portion >= portion_min else 0
        for number, portion in portions.items()])
    data = data[data['session_number'] <= session_limit]

    # Sort by session number: dict ordering is not guaranteed everywhere and
    # an unsorted x axis would draw a zigzag line. Unpacking the zip (instead
    # of indexing it) also works when zip returns an iterator.
    success_numbers, success_rates = zip(*sorted(session.session_success(data).items()))
    user_numbers, user_counts = zip(*sorted(session.session_users(data).items()))

    ax1 = figure.add_subplot(111)
    ax1.plot(success_numbers, success_rates, 'b-')
    ax1.set_xlabel('session number')
    ax1.set_ylabel('success rate', color='b')
    for tl in ax1.get_yticklabels():
        tl.set_color('b')

    ax2 = ax1.twinx()
    ax2.set_yscale('log')
    ax2.set_ylabel('number of users', color='r')
    ax2.plot(user_numbers, user_counts, 'r-.', linewidth=2)
    for tl in ax2.get_yticklabels():
        tl.set_color('r')
def plot_answers_vs_prior_skill(figure, answers, prior_skill, verbose=False):
    '''
    Scatter plot of the prior skill against the number of answers, one
    point per user, with a logarithmic x axis.

    Args:
        figure:
            object providing 'add_subplot' (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            'decorator.session_number'
        prior_skill:
            mapping indexed by user; it must contain every user present
            in the answers
        verbose (bool):
            not used
    '''
    answers = decorator.session_number(answers)
    ax = figure.add_subplot(111)
    answer_counts = user.answers_per_user(answers)
    user_ids = answers['user'].unique()
    counts = [answer_counts[uid] for uid in user_ids]
    skills = [prior_skill[uid] for uid in user_ids]
    # Order the points by the number of answers (ties by prior skill).
    counts, skills = zip(*sorted(zip(counts, skills)))
    ax.plot(counts, skills, 'o', alpha=0.3, linewidth=0, color='black')
    ax.set_xlabel('number of answer at all')
    ax.set_ylabel('prior skill')
    ax.set_xscale('log')
def plot_first_session_vs_session_number(figure, answers, verbose=False):
    '''
    Scatter plot of the maximal session number against the number of
    answers in the first session (session number 0), one point per user,
    with a logarithmic x axis.

    Args:
        figure:
            object providing 'add_subplot' (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            'decorator.session_number'
        verbose (bool):
            not used
    '''
    answers = decorator.session_number(answers)
    ax = figure.add_subplot(111)
    max_sessions = user.session_per_user(answers)
    first_session_answers = answers[answers['session_number'] == 0]
    first_counts = user.answers_per_user(first_session_answers)
    user_ids = answers['user'].unique()
    # Users missing from either mapping are plotted with 0.
    xs = [first_counts.get(uid, 0) for uid in user_ids]
    ys = [max_sessions.get(uid, 0) for uid in user_ids]
    xs, ys = zip(*sorted(zip(xs, ys)))
    ax.plot(xs, ys, 'o', alpha=0.3, linewidth=0, color='black')
    ax.set_xlabel('number of answers in the first session')
    ax.set_ylabel('maximal session number')
    ax.set_xscale('log')
def user_ratio(answers, session_number=None, answer_number_min=None, answer_number_max=None):
    '''
    Count the users satisfying the given constraints.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            'decorator.session_number'
        session_number (int, optional):
            users whose maximal session number is lower than this value
            are not counted
        answer_number_min (int, optional):
            users with fewer answers are not counted
        answer_number_max (int, optional):
            users with more answers are not counted

    Return:
        tuple: (number of users satisfying all the given constraints,
        number of all users)
    '''
    answers = decorator.session_number(answers)

    def user_ratio_filter(data):
        if session_number is not None and session_number > data['session_number'].max():
            return False
        answer_count = len(data)
        # Each bound is checked on its own so that both can be combined;
        # returning early on the lower bound would skip the upper one.
        if answer_number_min is not None and answer_count < answer_number_min:
            return False
        if answer_number_max is not None and answer_count > answer_number_max:
            return False
        return True

    passing = sum(answers.groupby('user').apply(user_ratio_filter))
    return passing, answers['user'].nunique()
def plot_first_session_vs_session_number(figure, answers, verbose=False):
    '''
    Scatter plot of the maximal session number against the number of
    answers in the first session (session number 0), one point per user,
    with a logarithmic x axis.

    Args:
        figure:
            object providing 'add_subplot' (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            'decorator.session_number'
        verbose (bool):
            not used
    '''
    answers = decorator.session_number(answers)
    ax = figure.add_subplot(111)
    max_sessions = user.session_per_user(answers)
    first_session_answers = answers[answers['session_number'] == 0]
    first_counts = user.answers_per_user(first_session_answers)
    user_ids = answers['user'].unique()
    # Users missing from either mapping are plotted with 0.
    xs = [first_counts.get(uid, 0) for uid in user_ids]
    ys = [max_sessions.get(uid, 0) for uid in user_ids]
    xs, ys = zip(*sorted(zip(xs, ys)))
    ax.plot(xs, ys, 'o', alpha=0.3, linewidth=0, color='black')
    ax.set_xlabel('number of answers in the first session')
    ax.set_ylabel('maximal session number')
    ax.set_xscale('log')
def plot_first_session_vs_total(figure, answers, verbose=False):
    '''
    Scatter plot comparing the number of answers in the first session
    (session number 0) with the number of remaining answers, one point per
    user, with both axes logarithmic.

    Args:
        figure:
            object providing 'add_subplot' (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            'decorator.session_number'
        verbose (bool):
            not used
    '''
    answers = decorator.session_number(answers)
    ax = figure.add_subplot(111)
    total = user.answers_per_user(answers)
    total_first = user.answers_per_user(answers[answers['session_number'] == 0])
    users = answers['user'].unique()
    # Users missing from either mapping are counted as 0.
    first_counts = [total_first.get(uid, 0) for uid in users]
    total_counts = [total.get(uid, 0) for uid in users]
    # A comprehension replaces the tuple-unpacking lambda, which is not
    # valid syntax on Python 3.
    # NOTE: the plotted y value is the total minus the first-session count,
    # although the axis label reads 'number of answer at all'.
    pairs = [
        (first, overall - first)
        for first, overall in sorted(zip(first_counts, total_counts))]
    total_first, total = zip(*pairs)
    ax.plot(total_first, total, 'o', alpha=0.3, linewidth=0, color='black')
    ax.set_xlabel('number of answers in the first session')
    ax.set_ylabel('number of answer at all')
    ax.set_xscale('log')
    ax.set_yscale('log')
def plot_first_session_vs_total(figure, answers, verbose=False):
    '''
    Scatter plot comparing the number of answers in the first session
    (session number 0) with the number of remaining answers, one point per
    user, with both axes logarithmic.

    Args:
        figure:
            object providing 'add_subplot' (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            'decorator.session_number'
        verbose (bool):
            not used
    '''
    answers = decorator.session_number(answers)
    ax = figure.add_subplot(111)
    total = user.answers_per_user(answers)
    total_first = user.answers_per_user(answers[answers['session_number'] == 0])
    users = answers['user'].unique()
    # Users missing from either mapping are counted as 0.
    first_counts = [total_first.get(uid, 0) for uid in users]
    total_counts = [total.get(uid, 0) for uid in users]
    # A comprehension replaces the tuple-unpacking lambda, which is not
    # valid syntax on Python 3.
    # NOTE: the plotted y value is the total minus the first-session count,
    # although the axis label reads 'number of answer at all'.
    pairs = [
        (first, overall - first)
        for first, overall in sorted(zip(first_counts, total_counts))]
    total_first, total = zip(*pairs)
    ax.plot(total_first, total, 'o', alpha=0.3, linewidth=0, color='black')
    ax.set_xlabel('number of answers in the first session')
    ax.set_ylabel('number of answer at all')
    ax.set_xscale('log')
    ax.set_yscale('log')
def user_ratio(answers, session_number=None, answer_number_min=None, answer_number_max=None):
    '''
    Count the users satisfying the given constraints.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            'decorator.session_number'
        session_number (int, optional):
            users whose maximal session number is lower than this value
            are not counted
        answer_number_min (int, optional):
            users with fewer answers are not counted
        answer_number_max (int, optional):
            users with more answers are not counted

    Return:
        tuple: (number of users satisfying all the given constraints,
        number of all users)
    '''
    answers = decorator.session_number(answers)

    def user_ratio_filter(data):
        if session_number is not None and session_number > data['session_number'].max():
            return False
        answer_count = len(data)
        # Each bound is checked on its own so that both can be combined;
        # returning early on the lower bound would skip the upper one.
        if answer_number_min is not None and answer_count < answer_number_min:
            return False
        if answer_number_max is not None and answer_count > answer_number_max:
            return False
        return True

    passing = sum(answers.groupby('user').apply(user_ratio_filter))
    return passing, answers['user'].nunique()
def session_user_portion(answers):
    '''
    For each session number compute the portion of all users having at
    least one answer with it.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated
            by 'session_number', it will be decorated

    Return:
        dict: session number -> portion of users (users in the session
        divided by the number of all users)
    '''
    data = answers if 'session_number' in answers else decorator.session_number(answers)
    # Float denominator keeps the division fractional on Python 2 as well.
    user_count = float(data['user'].nunique())

    def portion_of_users(session_answers):
        return session_answers['user'].nunique() / user_count

    portions = data.groupby('session_number').apply(portion_of_users)
    return portions.to_dict()
def session_prior_skill(answers, difficulty_data):
    '''
    Compute average prior skill for each session.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated
            by 'session_number', it will be decorated
        difficulty_data (dict):
            place -> difficulty

    Return:
        dict: session number -> prior skill
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)

    def mean_prior_skill(session_answers):
        prepared = difficulty.prepare_difficulty_and_prior_skill(
            session_answers, difficulty_data)
        # The second element is used as a mapping whose values are the prior
        # skills (presumably user -> prior skill; confirm against the helper).
        # The values are materialised into a list because numpy cannot
        # average a Python 3 dict view directly.
        return np.mean(list(prepared[1].values()))

    return data.groupby('session_number').apply(mean_prior_skill).to_dict()
def session_success(answers):
    '''
    Compute the success rate for each session number.

    The success rate is first computed for every session of every user
    (answers where 'place_asked' equals 'place_answered' divided by all
    answers of that session) and then averaged over the users.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated
            by 'session_number', it will be decorated

    Return:
        dict: session number -> success rate
    '''
    data = answers if 'session_number' in answers else decorator.session_number(answers)

    def success_rate(session_answers):
        correct = session_answers[
            session_answers['place_asked'] == session_answers['place_answered']]
        return float(len(correct)) / float(len(session_answers))

    # Series indexed by (user, session_number) holding one rate per session.
    per_user_session = data.groupby(['user', 'session_number']).apply(success_rate)
    averaged = per_user_session.groupby(level='session_number').apply(
        lambda rates: rates.mean())
    return averaged.to_dict()
def session_users(answers):
    '''
    Compute the number of distinct users having answers in the given
    session, for each session number.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated
            by 'session_number', it will be decorated

    Return:
        dict: session number -> number of users
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)
    # The built-in aggregation already yields a Series indexed by session
    # number, so no reset_index/rename/set_index round trip is needed.
    return data.groupby('session_number')['user'].nunique().to_dict()
def plot_session_length(figure, answers, portion_min=0.01, verbose=False):
    '''
    Plot the average session length and the number of users against the
    session number.

    The left axis shows the session length: a solid line for all users and
    a dashed line only for the users who reached the last plotted session.
    The right (logarithmic) axis shows the number of users.

    Args:
        figure:
            object providing 'add_subplot' (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated
            by 'session_number', it will be decorated
        portion_min (float):
            sessions are plotted up to the highest session number whose
            portion of users is at least this value
        verbose (bool):
            not used
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)
    # Pass the decorated frame so the session numbers are not computed twice.
    portions = session.session_user_portion(data)
    session_limit = max([
        number if portion >= portion_min else 0
        for number, portion in portions.items()])
    data = data[data['session_number'] <= session_limit]

    # Sort by session number: dict ordering is not guaranteed everywhere and
    # an unsorted x axis would draw a zigzag line. Unpacking the zip (instead
    # of indexing it) also works when zip returns an iterator.
    numbers, lengths = zip(*sorted(session.session_length(data).items()))
    ax1 = figure.add_subplot(111)
    ax1.plot(numbers, lengths, 'b-')
    ax1.set_xlabel('session number')
    ax1.set_ylabel('session length', color='b')
    for tl in ax1.get_yticklabels():
        tl.set_color('b')

    # The same curve restricted to users who reached the last plotted
    # session; 'data' is already cut at session_limit.
    users_for_limit = data[data['session_number'] == session_limit]['user'].values
    data_for_limit = data[data['user'].isin(users_for_limit)]
    numbers, lengths = zip(*sorted(session.session_length(data_for_limit).items()))
    ax1.plot(numbers, lengths, 'b--')

    numbers, user_counts = zip(*sorted(session.session_users(data).items()))
    ax2 = ax1.twinx()
    ax2.set_yscale('log')
    ax2.set_ylabel('number of users', color='r')
    ax2.plot(numbers, user_counts, 'r-.', linewidth=2)
    for tl in ax2.get_yticklabels():
        tl.set_color('r')
def plot_session_success(figure, answers, portion_min=0.01, verbose=False):
    '''
    Plot the success rate and the number of users against the session
    number.

    The left axis shows the success rate, the right (logarithmic) axis
    shows the number of users.

    Args:
        figure:
            object providing 'add_subplot' (presumably a matplotlib figure)
        answers (pandas.DataFrame):
            data frame containing answer data, if it is not decorated
            by 'session_number', it will be decorated
        portion_min (float):
            sessions are plotted up to the highest session number whose
            portion of users is at least this value
        verbose (bool):
            not used
    '''
    if 'session_number' in answers:
        data = answers
    else:
        data = decorator.session_number(answers)
    # Pass the decorated frame so the session numbers are not computed twice.
    portions = session.session_user_portion(data)
    session_limit = max([
        number if portion >= portion_min else 0
        for number, portion in portions.items()])
    data = data[data['session_number'] <= session_limit]

    # Sort by session number: dict ordering is not guaranteed everywhere and
    # an unsorted x axis would draw a zigzag line. Unpacking the zip (instead
    # of indexing it) also works when zip returns an iterator.
    success_numbers, success_rates = zip(*sorted(session.session_success(data).items()))
    user_numbers, user_counts = zip(*sorted(session.session_users(data).items()))

    ax1 = figure.add_subplot(111)
    ax1.plot(success_numbers, success_rates, 'b-')
    ax1.set_xlabel('session number')
    ax1.set_ylabel('success rate', color='b')
    for tl in ax1.get_yticklabels():
        tl.set_color('b')

    ax2 = ax1.twinx()
    ax2.set_yscale('log')
    ax2.set_ylabel('number of users', color='r')
    ax2.plot(user_numbers, user_counts, 'r-.', linewidth=2)
    for tl in ax2.get_yticklabels():
        tl.set_color('r')
def session_per_user(answers):
    '''
    Compute the maximal session number for each user.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            'decorator.session_number'

    Return:
        dict: user -> maximal session number
    '''
    decorated = decorator.session_number(answers)

    def last_session_number(user_answers):
        return user_answers['session_number'].max()

    per_user = decorated.groupby('user').apply(last_session_number)
    return per_user.to_dict()
def session_per_user(answers):
    '''
    Compute the maximal session number for each user.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data; it is passed through
            'decorator.session_number'

    Return:
        dict: user -> maximal session number
    '''
    decorated = decorator.session_number(answers)

    def last_session_number(user_answers):
        return user_answers['session_number'].max()

    per_user = decorated.groupby('user').apply(last_session_number)
    return per_user.to_dict()