예제 #1
0
def filter_history(history, min_num_ixns=5, max_num_ixns=9223372036854775807):
    """
    Restrict an interaction history to sufficiently active students and modules

    A student is kept when at least one of their assessment interactions
    occurs at a timestep strictly greater than ``min_num_ixns`` and none of
    their interactions occurs at a timestep greater than or equal to
    ``max_num_ixns``. A module is kept when it appears in strictly more than
    ``min_num_ixns`` rows of the history. Only rows whose student and module
    are both kept survive.

    :param datatools.InteractionHistory history: An interaction history
    :param int min_num_ixns: Threshold (exclusive) on the timestep a student
        must reach, and on the number of interactions a module must have
    :param int max_num_ixns: Timestep that a kept student must never reach
    :rtype: datatools.InteractionHistory
    :return: A filtered interaction history, rebuilt with
        ``reindex_timesteps=True`` and ``size_of_test_set=0.2``
    """
    ixns = history.data
    student_ids = ixns['student_id']
    timestep = ixns['timestep']

    # students with an assessment interaction beyond the lower threshold
    is_assessment = (
        ixns['module_type'] == datatools.AssessmentInteraction.MODULETYPE)
    long_enough = set(student_ids[(timestep > min_num_ixns) & is_assessment])

    # students whose history reaches the upper bound are excluded
    too_long = set(student_ids[timestep >= max_num_ixns])
    kept_students = long_enough - too_long

    # modules with more than min_num_ixns rows
    kept_modules = set()
    for module_id, rows in ixns.groupby('module_id'):
        if len(rows) > min_num_ixns:
            kept_modules.add(module_id)

    keep_row = (student_ids.isin(kept_students)
                & ixns['module_id'].isin(kept_modules))

    return datatools.InteractionHistory(ixns[keep_row],
                                        reindex_timesteps=True,
                                        size_of_test_set=0.2)
예제 #2
0
def interaction_history_from_dutch_big_data_set(data):
    """
    Parse the Dutch big data set into an interaction history

    Every row is treated as an assessment interaction, and each student's
    interactions are numbered 1, 2, 3, ... in the order in which the rows
    appear in ``data``. The input dataframe is not modified.

    :param pd.DataFrame data: A dataframe of raw log data containing (at
        least) the columns ``user_id``, ``student_id``, ``module_id``,
        ``outcome`` and ``timestamp``
    :rtype: datatools.InteractionHistory
    :return: An interaction history object
    """

    # keep only the relevant columns; take an explicit copy so that adding
    # columns below does not write into a slice of the caller's frame
    # (which would trigger pandas' SettingWithCopyWarning)
    data = data[['user_id', 'student_id', 'module_id', 'outcome',
                 'timestamp']].copy()

    # per-student running count of interactions, starting at 1
    data['timestep'] = data.groupby('student_id').cumcount() + 1

    # every interaction in this data set is an assessment interaction
    data['module_type'] = datatools.AssessmentInteraction.MODULETYPE

    return datatools.InteractionHistory(data, sort_by_timestep=True)
예제 #3
0
def interaction_history_from_mnemosyne_data_set(data):
    """
    Parse Mnemosyne data set into an interaction history

    Rows without a grade are dropped, the grade is binarized into an outcome
    (``grade > 1``), every row is treated as an assessment interaction, and
    each student's interactions are numbered 1, 2, 3, ... in the order in
    which the rows appear in ``data``. The input dataframe is not modified.

    :param pd.DataFrame data: A dataframe of raw log data containing (at
        least) the columns ``user_id``, ``student_id``, ``object_id``,
        ``grade``, ``timestamp``, ``thinking_time``, ``actual_interval`` and
        ``scheduled_interval``
    :rtype: datatools.InteractionHistory
    :return: An interaction history object
    """

    # discard interactions that have no grade
    data = data[data['grade'].notnull()]

    # keep only the relevant columns; take an explicit copy so that the
    # column assignments below do not write into a slice of the input
    data = data[[
        'user_id', 'student_id', 'object_id', 'grade', 'timestamp',
        'thinking_time', 'actual_interval', 'scheduled_interval'
    ]].copy()
    # object_id -> module_id, grade -> outcome, thinking_time -> duration
    data.columns = [
        'user_id', 'student_id', 'module_id', 'outcome', 'timestamp',
        'duration', 'actual_interval', 'scheduled_interval'
    ]

    # binarize outcomes: grades above 1 count as a positive outcome
    data['outcome'] = data['outcome'] > 1

    # per-student running count of interactions, starting at 1
    data['timestep'] = data.groupby('student_id').cumcount() + 1

    # every interaction in this data set is an assessment interaction
    data['module_type'] = datatools.AssessmentInteraction.MODULETYPE

    return datatools.InteractionHistory(data, sort_by_timestep=True)
예제 #4
0
def interaction_history_from_assistments_data_set(data,
                                                  duration_column='timestep',
                                                  module_id_column='problem_id'
                                                  ):
    """
    Parse dataframe of assistments interactions into an interaction history

    Rows are ordered by ``order_id``, restricted to binary outcomes with a
    positive duration, and numbered per user starting at 1. Each resulting
    assessment interaction is additionally duplicated as an artificial lesson
    interaction. The input dataframe is not modified.

    :param pd.DataFrame data: A raw history from assistments containing (at
        least) the columns ``order_id``, ``user_id``, ``correct``,
        ``duration_column`` and ``module_id_column``
    :param str duration_column: Column to use as interaction duration
    :param str module_id_column: Column to use as module_id
    :rtype: datatools.InteractionHistory
    :return: An interaction history
    """
    # sort by order_id; build a new sorted frame rather than sorting in place
    # so that the caller's dataframe is left untouched
    data = data.sort_values(by='order_id', axis=0)

    # get relevant columns and rename them
    data = data[['user_id', 'correct', duration_column, module_id_column]]
    data.columns = ['user_id', 'outcome', 'duration', 'module_id']

    # only keep interactions with binary outcomes and positive response times;
    # copy so the column assignments below do not write into a slice
    is_binary = (data['outcome'] == 1) | (data['outcome'] == 0)
    data = data[is_binary & (data['duration'] > 0)].copy()

    # cast outcomes from int to bool
    data['outcome'] = data['outcome'] == 1

    # map response times from milliseconds to seconds
    data['duration'] = data['duration'] / 1000.0

    # existing interactions are all assessment interactions
    data['module_type'] = datatools.AssessmentInteraction.MODULETYPE

    # add timesteps: per-user running count of interactions, starting at 1
    data['timestep'] = data.groupby('user_id').cumcount() + 1

    # add artificial lesson interactions
    lesson_data = data.copy(deep=True)
    lesson_data['module_type'] = datatools.LessonInteraction.MODULETYPE

    return datatools.InteractionHistory(pd.concat([data, lesson_data]),
                                        sort_by_timestep=True)
예제 #5
0
def cli(history_file, results_file, verbose, num_folds, truncation_style,
        using_lessons, using_prereqs, using_bias, embedding_dimension,
        learning_update_variance, opt_algo, regularization_constant, ftol,
        learning_rate, adagrad_eta, adagrad_eps):
    """
    This script provides a command-line interface for model evaluation.
    It reads an interaction history from file, computes the cross-validated AUC of
    an embedding model, and writes the results to file.

    The pickled results will be an object of type :py:class:`evaluate.CVResults`

    :param str history_file: Input path to CSV/pickle file containing interaction history
    :param str results_file: Output path for pickled results of cross-validation
    :param bool verbose: True => logger level set to logging.DEBUG
    :param int num_folds: Number of folds in k-fold cross-validation
    :param str truncation_style: Hold-out scheme for student histories
        ('random' enables random truncations)
    :param bool using_lessons: Including lessons in embedding
    :param bool using_prereqs: Including lesson prereqs in embedding
    :param bool using_bias: Including bias terms in embedding
    :param int embedding_dimension: Number of dimensions in latent skill space
    :param float learning_update_variance: Variance of Gaussian learning update
    :param str opt_algo: Optimization algorithm for parameter estimation
    :param float regularization_constant: Coefficient of regularization term in objective function
    :param float ftol: Stopping condition for iterative optimization
    :param float learning_rate: Fixed learning rate for gradient descent
    :param float adagrad_eta: Base learning rate parameter for Adagrad
    :param float adagrad_eps: Epsilon parameter for Adagrad
    :raises ValueError: If verbose mode is requested together with L-BFGS-B,
        or if history_file has an extension other than .csv or .pkl
    """

    if verbose and opt_algo == 'l-bfgs-b':
        raise ValueError(
            'Verbose mode is not currently supported for L-BFGS-B. '
            'Try turning off verbose mode, or change your choice of '
            'optimization algorithm.')

    if verbose:
        _logger.setLevel(logging.DEBUG)

    click.echo('Loading interaction history from %s...' %
               click.format_filename(history_file))

    _, history_file_ext = os.path.splitext(history_file)
    if history_file_ext == '.csv':
        history = datatools.InteractionHistory(pd.read_csv(history_file))
    elif history_file_ext == '.pkl':
        # NOTE: unpickling can execute arbitrary code; only load history
        # files that come from a trusted source.
        with open(history_file, 'rb') as f:
            history = pickle.load(f)
    else:
        raise ValueError(
            'Unrecognized file extension for history_file. '
            'Please supply a .csv with an interaction history, or a .pkl '
            'file containing a datatools.InteractionHistory object.')

    embedding_kwargs = {
        'embedding_dimension': embedding_dimension,
        'using_lessons': using_lessons,
        'using_prereqs': using_prereqs,
        'using_bias': using_bias,
        'learning_update_variance_constant': learning_update_variance
    }

    gradient_descent_kwargs = {
        'using_adagrad': opt_algo == 'adagrad',
        'eta': adagrad_eta,
        'eps': adagrad_eps,
        'rate': learning_rate,
        'verify_gradient': False,
        'debug_mode_on': verbose,
        'ftol': ftol,
        'num_checkpoints': 100
    }

    estimator = est.EmbeddingMAPEstimator(
        regularization_constant=regularization_constant,
        using_scipy=(opt_algo == 'l-bfgs-b'),
        gradient_descent_kwargs=gradient_descent_kwargs,
        verify_gradient=False,
        debug_mode_on=verbose,
        ftol=ftol)

    def build_embedding(embedding_kwargs,
                        estimator,
                        history,
                        filtered_history,
                        split_history=None):
        # Build and fit one embedding model; the histories are attached to
        # the (shared) estimator before fitting.
        model = models.EmbeddingModel(history, **embedding_kwargs)

        estimator.filtered_history = filtered_history
        if split_history is not None:
            estimator.split_history = split_history

        model.fit(estimator)

        return model

    # the model builder binds the embedding settings and the estimator, and
    # forwards whatever history arguments the cross-validation passes in
    model_builders = {
        'model': (lambda *args, **kwargs: build_embedding(
            embedding_kwargs, estimator, *args, **kwargs))
    }

    click.echo(
        'Computing cross-validated AUC (num_folds=%d, truncation_style=%s)...'
        % (num_folds, truncation_style))

    results = evaluate.cross_validated_auc(
        model_builders,
        history,
        num_folds=num_folds,
        random_truncations=(truncation_style == 'random'))

    train_auc_mean = results.training_auc_mean('model')
    val_auc_mean = results.validation_auc_mean('model')

    train_auc_stderr = results.training_auc_stderr('model')
    val_auc_stderr = results.validation_auc_stderr('model')

    # intervals are reported as mean +/- 1.96 standard errors
    click.echo('AUCs with 95% confidence intervals:')
    click.echo('Training AUC = %f (%f, %f)' %
               (train_auc_mean, train_auc_mean - 1.96 * train_auc_stderr,
                train_auc_mean + 1.96 * train_auc_stderr))

    click.echo('Validation AUC = %f (%f, %f)' %
               (val_auc_mean, val_auc_mean - 1.96 * val_auc_stderr,
                val_auc_mean + 1.96 * val_auc_stderr))

    with open(results_file, 'wb') as f:
        pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)

    click.echo('Results written to %s' % results_file)
예제 #6
0
def cli(history_file, model_file, verbose, compute_training_auc, using_lessons,
        using_prereqs, using_bias, embedding_dimension,
        learning_update_variance, opt_algo, regularization_constant, ftol,
        learning_rate, adagrad_eta, adagrad_eps):
    """
    This script provides a command-line interface for model training.
    It reads an interaction history from file, trains an embedding model,
    and writes the model to file.

    :param str history_file: Input path to CSV/pickle file containing interaction history
    :param str model_file: Output path to pickle file containing trained model
    :param bool verbose: True => logger level set to logging.DEBUG
    :param bool compute_training_auc: True => compute training AUC of model
    :param bool using_lessons: Including lessons in embedding
    :param bool using_prereqs: Including lesson prereqs in embedding
    :param bool using_bias: Including bias terms in embedding
    :param int embedding_dimension: Number of dimensions in latent skill space
    :param float learning_update_variance: Variance of Gaussian learning update
    :param str opt_algo: Optimization algorithm for parameter estimation
    :param float regularization_constant: Coefficient of regularization term in objective function
    :param float ftol: Stopping condition for iterative optimization
    :param float learning_rate: Fixed learning rate for gradient descent
    :param float adagrad_eta: Base learning rate parameter for Adagrad
    :param float adagrad_eps: Epsilon parameter for Adagrad
    :raises ValueError: If verbose mode is requested together with L-BFGS-B,
        or if history_file has an extension other than .csv or .pkl
    """

    if verbose and opt_algo == 'l-bfgs-b':
        raise ValueError(
            'Verbose mode is not currently supported for L-BFGS-B. '
            'Try turning off verbose mode, or change your choice of '
            'optimization algorithm.')

    if verbose:
        _logger.setLevel(logging.DEBUG)

    click.echo('Loading interaction history from %s...' %
               (click.format_filename(history_file)))

    _, history_file_ext = os.path.splitext(history_file)
    if history_file_ext == '.csv':
        history = datatools.InteractionHistory(pd.read_csv(history_file))
    elif history_file_ext == '.pkl':
        # NOTE: unpickling can execute arbitrary code; only load history
        # files that come from a trusted source.
        with open(history_file, 'rb') as f:
            history = pickle.load(f)
    else:
        raise ValueError(
            'Unrecognized file extension for history_file. '
            'Please supply a .csv with an interaction history, or a .pkl '
            'file containing a datatools.InteractionHistory object.')

    click.echo('Computing MAP estimates of model parameters...')

    # learning_update_variance is forwarded under the keyword the embedding
    # model uses for it, so that the command-line option actually takes effect
    model = models.EmbeddingModel(
        history,
        embedding_dimension,
        using_lessons=using_lessons,
        using_prereqs=using_prereqs,
        using_bias=using_bias,
        learning_update_variance_constant=learning_update_variance)

    gradient_descent_kwargs = {
        'using_adagrad': opt_algo == 'adagrad',
        'eta': adagrad_eta,
        'eps': adagrad_eps,
        'rate': learning_rate,
        'verify_gradient': False,
        'debug_mode_on': verbose,
        'ftol': ftol,
        'num_checkpoints': 100
    }

    estimator = est.EmbeddingMAPEstimator(
        regularization_constant=regularization_constant,
        using_scipy=(opt_algo == 'l-bfgs-b'),
        gradient_descent_kwargs=gradient_descent_kwargs,
        verify_gradient=False,
        debug_mode_on=verbose,
        ftol=ftol)

    model.fit(estimator)

    with open(model_file, 'wb') as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

    click.echo('Trained model written to %s' %
               click.format_filename(model_file))

    if compute_training_auc:
        click.echo('Training AUC = %f' %
                   evaluate.training_auc(model, history, plot_roc_curve=False))