def filter_history(history, min_num_ixns=5, max_num_ixns=9223372036854775807):
    """
    Filter history for students with histories of bounded length,
    and modules with enough interactions

    :param datatools.InteractionHistory history: An interaction history
    :param int min_num_ixns: Minimum number of timesteps in student history,
        and minimum number of interactions for module
    :param int max_num_ixns: Maximum number of timesteps in student history
    :rtype: datatools.InteractionHistory
    :return: A filtered interaction history
    """
    df = history.data

    # keep students who have assessment interactions past the minimum timestep
    is_assessment = df['module_type'] == datatools.AssessmentInteraction.MODULETYPE
    keep_students = set(
        df['student_id'][(df['timestep'] > min_num_ixns) & is_assessment])
    # ...but drop any student whose history reaches the maximum length
    keep_students -= set(df['student_id'][df['timestep'] >= max_num_ixns])

    # keep modules with strictly more than min_num_ixns interactions
    keep_modules = {
        module_id for module_id, ixns in df.groupby('module_id')
        if len(ixns) > min_num_ixns}

    filtered = df[df['student_id'].isin(keep_students)
                  & df['module_id'].isin(keep_modules)]
    return datatools.InteractionHistory(
        filtered, reindex_timesteps=True, size_of_test_set=0.2)
def interaction_history_from_dutch_big_data_set(data):
    """
    Parse MTurk data set into an interaction history

    :param pd.DataFrame data: A dataframe of raw log data
    :rtype: datatools.InteractionHistory
    :return: An interaction history object
    """
    # The selected columns already carry the desired names, so the original
    # redundant `data.columns = [...]` re-assignment is dropped. `.copy()`
    # avoids pandas SettingWithCopyWarning when adding columns below.
    data = data[['user_id', 'student_id', 'module_id', 'outcome',
                 'timestamp']].copy()

    # timestep = 1-based position of the interaction within its student's
    # history, in current row order (vectorized replacement for the
    # original per-row iterrows loop)
    data['timestep'] = data.groupby('student_id').cumcount() + 1

    # all interactions in this data set are assessment interactions
    data['module_type'] = datatools.AssessmentInteraction.MODULETYPE

    return datatools.InteractionHistory(data, sort_by_timestep=True)
def interaction_history_from_mnemosyne_data_set(data):
    """
    Parse Mnemosyne data set into an interaction history

    :param pd.DataFrame data: A dataframe of raw log data
    :rtype: datatools.InteractionHistory
    :return: An interaction history object
    """
    # drop interactions with missing grades
    # (vectorized replacement for apply(lambda x: not np.isnan(x)))
    data = data[data['grade'].notnull()]

    # select and rename the relevant columns; `.copy()` avoids pandas
    # SettingWithCopyWarning when adding columns below
    data = data[['user_id', 'student_id', 'object_id', 'grade', 'timestamp',
                 'thinking_time', 'actual_interval',
                 'scheduled_interval']].copy()
    data.columns = ['user_id', 'student_id', 'module_id', 'outcome',
                    'timestamp', 'duration', 'actual_interval',
                    'scheduled_interval']

    # binarize outcome: a grade above 1 counts as a successful recall
    data['outcome'] = data['outcome'] > 1

    # timestep = 1-based position of the interaction within its student's
    # history, in current row order (vectorized replacement for the
    # original per-row iterrows loop)
    data['timestep'] = data.groupby('student_id').cumcount() + 1

    # all interactions in this data set are assessment interactions
    data['module_type'] = datatools.AssessmentInteraction.MODULETYPE

    return datatools.InteractionHistory(data, sort_by_timestep=True)
def interaction_history_from_assistments_data_set(
        data, duration_column='timestep', module_id_column='problem_id'):
    """
    Parse dataframe of assistments interactions into an interaction history

    :param pd.DataFrame data: A raw history from assistments
    :param str duration_column: Column to use as interaction duration
    :param str module_id_column: Column to use as module_id
    :rtype: datatools.InteractionHistory
    :return: An interaction history
    """
    # sort by order_id so interactions appear in chronological order
    # NOTE: this sorts the caller's dataframe in place (original behavior)
    data.sort_values(by='order_id', inplace=True, axis=0)

    # get relevant columns and rename them; `.copy()` avoids pandas
    # SettingWithCopyWarning when adding columns below
    data = data[['user_id', 'correct', duration_column,
                 module_id_column]].copy()
    data.columns = ['user_id', 'outcome', 'duration', 'module_id']

    # only keep interactions with binary outcomes and positive response times
    data = data[data['outcome'].isin([0, 1]) & (data['duration'] > 0)].copy()

    # cast outcomes from int to bool
    data['outcome'] = data['outcome'] == 1

    # map response times from milliseconds to seconds
    data['duration'] = data['duration'].apply(lambda x: x / 1000)

    # existing interactions are all assessment interactions
    data['module_type'] = datatools.AssessmentInteraction.MODULETYPE

    # timestep = 1-based position of the interaction within its student's
    # history, in current row order (vectorized replacement for the
    # original per-row iterrows loop)
    data['timestep'] = data.groupby('user_id').cumcount() + 1

    # add artificial lesson interactions mirroring the assessments
    lesson_data = data.copy(deep=True)
    lesson_data['module_type'] = datatools.LessonInteraction.MODULETYPE

    return datatools.InteractionHistory(pd.concat([data, lesson_data]),
                                        sort_by_timestep=True)
def cli(history_file, results_file, verbose, num_folds, truncation_style,
        using_lessons, using_prereqs, using_bias, embedding_dimension,
        learning_update_variance, opt_algo, regularization_constant, ftol,
        learning_rate, adagrad_eta, adagrad_eps):
    """
    This script provides a command-line interface for model evaluation.
    It reads an interaction history from file, computes the cross-validated
    AUC of an embedding model, and writes the results to file.

    The pickled results will be an object of type
    :py:class:`evaluate.CVResults`

    :param str history_file: Input path to CSV/pickle file containing
        interaction history
    :param str results_file: Output path for pickled results of
        cross-validation
    :param bool verbose: True => logger level set to logging.INFO
    :param int num_folds: Number of folds in k-fold cross-validation
    :param str truncation_style: Hold-out scheme for student histories
    :param bool using_lessons: Including lessons in embedding
    :param bool using_prereqs: Including lesson prereqs in embedding
    :param bool using_bias: Including bias terms in embedding
    :param int embedding_dimension: Number of dimensions in latent skill
        space
    :param float learning_update_variance: Variance of Gaussian learning
        update
    :param str opt_algo: Optimization algorithm for parameter estimation
    :param float regularization_constant: Coefficient of regularization term
        in objective function
    :param float ftol: Stopping condition for iterative optimization
    :param float learning_rate: Fixed learning rate for gradient descent
    :param float adagrad_eta: Base learning rate parameter for Adagrad
    :param float adagrad_eps: Epsilon parameter for Adagrad
    """
    if verbose and opt_algo == 'l-bfgs-b':
        # BUGFIX: the original message used a backslash-newline inside the
        # string literal, which embedded source indentation in the output
        raise ValueError(
            'Verbose mode is not currently supported for L-BFGS-B. '
            'Try turning off verbose mode, or change your choice of '
            'optimization algorithm.')

    if verbose:
        _logger.setLevel(logging.DEBUG)

    click.echo('Loading interaction history from %s...'
               % click.format_filename(history_file))

    _, history_file_ext = os.path.splitext(history_file)
    if history_file_ext == '.csv':
        # BUGFIX: removed dead `data = pd.DataFrame.from_csv(...)` call --
        # its result was unused, and DataFrame.from_csv has been removed
        # from pandas
        history = datatools.InteractionHistory(pd.read_csv(history_file))
    elif history_file_ext == '.pkl':
        with open(history_file, 'rb') as f:
            history = pickle.load(f)
    else:
        raise ValueError(
            'Unrecognized file extension for history_file. '
            'Please supply a .csv with an interaction history, or a .pkl '
            'file containing a datatools.InteractionHistory object.')

    embedding_kwargs = {
        'embedding_dimension': embedding_dimension,
        'using_lessons': using_lessons,
        'using_prereqs': using_prereqs,
        'using_bias': using_bias,
        'learning_update_variance_constant': learning_update_variance
    }

    gradient_descent_kwargs = {
        'using_adagrad': opt_algo == 'adagrad',
        'eta': adagrad_eta,
        'eps': adagrad_eps,
        'rate': learning_rate,
        'verify_gradient': False,
        'debug_mode_on': verbose,
        'ftol': ftol,
        'num_checkpoints': 100
    }

    estimator = est.EmbeddingMAPEstimator(
        regularization_constant=regularization_constant,
        using_scipy=(opt_algo == 'l-bfgs-b'),
        gradient_descent_kwargs=gradient_descent_kwargs,
        verify_gradient=False,
        debug_mode_on=verbose,
        ftol=ftol)

    def build_embedding(embedding_kwargs, estimator, history,
                        filtered_history, split_history=None):
        # construct and fit an embedding model on the (possibly truncated)
        # history supplied by the cross-validation driver
        model = models.EmbeddingModel(history, **embedding_kwargs)
        estimator.filtered_history = filtered_history
        if split_history is not None:
            estimator.split_history = split_history
        model.fit(estimator)
        return model

    model_builders = {
        'model': (lambda *args, **kwargs: build_embedding(
            embedding_kwargs, estimator, *args, **kwargs))
    }

    click.echo(
        'Computing cross-validated AUC (num_folds=%d, truncation_style=%s)...'
        % (num_folds, truncation_style))
    results = evaluate.cross_validated_auc(
        model_builders, history, num_folds=num_folds,
        random_truncations=(truncation_style == 'random'))

    train_auc_mean = results.training_auc_mean('model')
    val_auc_mean = results.validation_auc_mean('model')
    train_auc_stderr = results.training_auc_stderr('model')
    val_auc_stderr = results.validation_auc_stderr('model')

    # 1.96 standard errors on either side of the mean ~ 95% CI under
    # a normal approximation
    click.echo('AUCs with 95% confidence intervals:')
    click.echo('Training AUC = %f (%f, %f)' % (
        train_auc_mean,
        train_auc_mean - 1.96 * train_auc_stderr,
        train_auc_mean + 1.96 * train_auc_stderr))
    click.echo('Validation AUC = %f (%f, %f)' % (
        val_auc_mean,
        val_auc_mean - 1.96 * val_auc_stderr,
        val_auc_mean + 1.96 * val_auc_stderr))

    with open(results_file, 'wb') as f:
        pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)

    click.echo('Results written to %s' % results_file)
def cli(history_file, model_file, verbose, compute_training_auc,
        using_lessons, using_prereqs, using_bias, embedding_dimension,
        learning_update_variance, opt_algo, regularization_constant, ftol,
        learning_rate, adagrad_eta, adagrad_eps):
    """
    This script provides a command-line interface for model training.
    It reads an interaction history from file, trains an embedding model,
    and writes the model to file.

    :param str history_file: Input path to CSV/pickle file containing
        interaction history
    :param str model_file: Output path to pickle file containing trained
        model
    :param bool verbose: True => logger level set to logging.INFO
    :param bool compute_training_auc: True => compute training AUC of model
    :param bool using_lessons: Including lessons in embedding
    :param bool using_prereqs: Including lesson prereqs in embedding
    :param bool using_bias: Including bias terms in embedding
    :param int embedding_dimension: Number of dimensions in latent skill
        space
    :param float learning_update_variance: Variance of Gaussian learning
        update
    :param str opt_algo: Optimization algorithm for parameter estimation
    :param float regularization_constant: Coefficient of regularization term
        in objective function
    :param float ftol: Stopping condition for iterative optimization
    :param float learning_rate: Fixed learning rate for gradient descent
    :param float adagrad_eta: Base learning rate parameter for Adagrad
    :param float adagrad_eps: Epsilon parameter for Adagrad
    """
    if verbose and opt_algo == 'l-bfgs-b':
        # BUGFIX: the original message used a backslash-newline inside the
        # string literal, which embedded source indentation in the output
        raise ValueError(
            'Verbose mode is not currently supported for L-BFGS-B. '
            'Try turning off verbose mode, or change your choice of '
            'optimization algorithm.')

    if verbose:
        _logger.setLevel(logging.DEBUG)

    click.echo('Loading interaction history from %s...'
               % (click.format_filename(history_file)))

    _, history_file_ext = os.path.splitext(history_file)
    if history_file_ext == '.csv':
        # BUGFIX: removed dead `data = pd.DataFrame.from_csv(...)` call --
        # its result was unused, and DataFrame.from_csv has been removed
        # from pandas
        history = datatools.InteractionHistory(pd.read_csv(history_file))
    elif history_file_ext == '.pkl':
        with open(history_file, 'rb') as f:
            history = pickle.load(f)
    else:
        raise ValueError(
            'Unrecognized file extension for history_file. '
            'Please supply a .csv with an interaction history, or a .pkl '
            'file containing a datatools.InteractionHistory object.')

    click.echo('Computing MAP estimates of model parameters...')

    model = models.EmbeddingModel(
        history, embedding_dimension,
        using_lessons=using_lessons,
        using_prereqs=using_prereqs,
        using_bias=using_bias)

    gradient_descent_kwargs = {
        'using_adagrad': opt_algo == 'adagrad',
        'eta': adagrad_eta,
        'eps': adagrad_eps,
        'rate': learning_rate,
        'verify_gradient': False,
        'debug_mode_on': verbose,
        'ftol': ftol,
        'num_checkpoints': 100
    }

    estimator = est.EmbeddingMAPEstimator(
        regularization_constant=regularization_constant,
        using_scipy=(opt_algo == 'l-bfgs-b'),
        gradient_descent_kwargs=gradient_descent_kwargs,
        verify_gradient=False,
        debug_mode_on=verbose,
        ftol=ftol)

    model.fit(estimator)

    with open(model_file, 'wb') as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

    click.echo('Trained model written to %s'
               % click.format_filename(model_file))

    if compute_training_auc:
        click.echo('Training AUC = %f'
                   % evaluate.training_auc(model, history,
                                           plot_roc_curve=False))