import numpy as np

# Logger, URM, rmse, precision_itemlist, and recall_itemlist are assumed to be
# provided by the surrounding project; only numpy is imported here.


def experiment_unit_rand_split(exp_id, method, tr_data, te_data, iteration):
    '''
    One iteration of training and testing on a random split. The experiment ID,
    the method's unique string, and the iteration number together key the
    cached model resource.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp' + exp_id + \
                          '_method' + method.unique_str() + \
                          '_iter' + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for an existing trained model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train the model using the training data.
        # NOTE: this is the most time-consuming part.
        mcpl_log('training models...')
        method.train(tr_data)

        # save resource.
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str,
                         trained_model, sub_folder)

    # compute performance on the test data using the trained model.
    [method] = trained_model
    mcpl_log('computing evaluation metrics on the test data...')
    eval_result = rmse(te_data.data_val,
                       method.predict(te_data.data_row, te_data.data_col))
    return eval_result
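# rmse above is assumed to come from the project's evaluation utilities; its
# implementation is not shown in this file. A minimal sketch consistent with
# how it is called here (aligned sequences of true and predicted values); the
# name is illustrative, not the project's:
def _rmse_sketch(true_vals, pred_vals):
    '''Root-mean-square error between two aligned value sequences.'''
    diff = np.asarray(true_vals, dtype=float) - np.asarray(pred_vals, dtype=float)
    return np.sqrt(np.mean(diff ** 2))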
def experiment_unit_leave_k_out(exp_id, method, data_tr, data_left, iteration, top_n):
    '''
    This method works on the column/row indices of data_tr and data_left;
    data_tr and data_left must be completely aligned both row-wise and
    column-wise.
    '''
    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp' + exp_id + \
                          '_method' + method.unique_str() + \
                          '_iter' + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for an existing trained model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train the model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...')
        method.train(data_tr)

        # save resource.
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str,
                         trained_model, sub_folder)

    # compute performance on the test data using the trained model.
    [method] = trained_model
    lko_log('computing evaluation metrics on the test data...')
    eval_result = {}

    # ranked list.
    col_num = data_left.num_col
    pred_col = range(col_num)

    tr_data_csr = data_tr.get_sparse_matrix().tocsr()
    lo_data_csr = data_left.get_sparse_matrix().tocsr()

    prec_sum = 0.0
    rec_sum = 0.0
    for user_idx in range(data_left.num_row):
        # predict the entire row.
        row_pred = method.predict_row(user_idx, pred_col)
        # rank the columns (the result is a list of column indices).
        srt_col = [k[0] for k in sorted(enumerate(row_pred),
                                        key=lambda x: x[1], reverse=True)]
        # columns seen during training.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist())
        # remove the trained columns from the ranked list.
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col]
        # top-n (slicing already guards against lists shorter than top_n).
        te_topk_col = te_srt_col[:top_n]
        # held-out (test) column indices.
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist())

        prec_sum += precision_itemlist(te_topk_col, lo_col)
        rec_sum += recall_itemlist(te_topk_col, lo_col)

    # average precision/recall over all users.
    eval_result['prec'] = prec_sum / data_left.num_row
    eval_result['recall'] = rec_sum / data_left.num_row
    eval_result['rmse'] = rmse(data_left.data_val,
                               method.predict(data_left.data_row, data_left.data_col))
    return eval_result
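# precision_itemlist and recall_itemlist above are assumed project helpers;
# from the call sites, they take a ranked top-n list and a set of held-out
# items. Minimal sketches under that assumption (names are illustrative):
def _precision_itemlist_sketch(topk_list, heldout_set):
    '''Fraction of recommended items that are in the held-out set.'''
    if len(topk_list) == 0:
        return 0.0
    return len(set(topk_list) & heldout_set) / float(len(topk_list))


def _recall_itemlist_sketch(topk_list, heldout_set):
    '''Fraction of held-out items that appear in the recommendations.'''
    if len(heldout_set) == 0:
        return 0.0
    return len(set(topk_list) & heldout_set) / float(len(heldout_set))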
def experiment_unit_leave_k_out_map(exp_id, method, data_tr, data_left, iteration, max_rank):
    '''
    This method works on the column/row indices of data_tr and data_left;
    data_tr and data_left must be completely aligned both row-wise and
    column-wise.
    '''
    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp' + exp_id + \
                          '_method' + method.unique_str() + \
                          '_iter' + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for an existing trained model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train the model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...')
        method.train(data_tr)

        # save resource.
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str,
                         trained_model, sub_folder)

    # compute performance on the test data using the trained model.
    [method] = trained_model
    lko_log('computing evaluation metrics on the test data...')
    eval_result = {}

    # ranked list.
    col_num = data_left.num_col
    pred_col = range(col_num)

    tr_data_csr = data_tr.get_sparse_matrix().tocsr()
    lo_data_csr = data_left.get_sparse_matrix().tocsr()

    perf_vect_prec = np.zeros(max_rank)  # precision at each rank.
    perf_vect_rec = np.zeros(max_rank)   # recall at each rank.
    perf_vect_hr = np.zeros(max_rank)    # hit rate (modification of Xia Ning's paper).

    for user_idx in range(data_left.num_row):
        # held-out (test) column indices.
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist())
        # skip users with no testing data.
        if len(lo_col) == 0:
            continue

        # predict the entire row.
        row_pred = method.predict_row(user_idx, pred_col)
        # rank the columns (the result is a list of column indices).
        srt_col = [k[0] for k in sorted(enumerate(row_pred),
                                        key=lambda x: x[1], reverse=True)]
        # columns seen during training.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist())
        # remove the trained columns from the prediction; te_srt_col holds the
        # ranked indices excluding training items.
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col]

        # hit tracks the cumulative number of hits up to the current rank
        # (max_rank yields ranks 0..max_rank-1).
        hit = 0
        for rk in range(max_rank):
            # a hit occurs when the item at this rank is in the held-out set;
            # ranks beyond the length of te_srt_col contribute no new hits.
            if (rk < len(te_srt_col)) and (te_srt_col[rk] in lo_col):
                hit += 1
            perf_vect_hr[rk] += float(hit) / len(lo_col)   # hit rate.
            perf_vect_prec[rk] += float(hit) / (rk + 1)    # precision.
            perf_vect_rec[rk] += float(hit) / len(lo_col)  # recall.
            # NOTE: with this definition, hit rate and recall coincide, since
            # both divide the cumulative hits by the held-out count.

    # normalize over all users (users skipped above contribute zeros).
    perf_vect_hr = perf_vect_hr / data_left.num_row
    perf_vect_prec = perf_vect_prec / data_left.num_row
    perf_vect_rec = perf_vect_rec / data_left.num_row

    eval_result['hit_rate'] = perf_vect_hr
    eval_result['precision'] = perf_vect_prec
    eval_result['recall'] = perf_vect_rec
    eval_result['RMSE'] = rmse(data_left.data_val,
                               method.predict(data_left.data_row, data_left.data_col))
    return eval_result
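# A small self-contained illustration of the per-rank loop above: for one user
# with ranked (training-filtered) predictions [5, 2, 9] and held-out items
# {2, 7}, cumulative hits at ranks 1..3 are [0, 1, 1], so precision@k is
# [0, 1/2, 1/3] and recall@k (equal to hit rate here) is [0, 1/2, 1/2]. The
# helper name is illustrative, not part of this module:
def _rank_metrics_sketch(te_srt_col, lo_col, max_rank):
    '''Per-rank precision/recall for a single user, mirroring the loop above.'''
    prec = np.zeros(max_rank)
    rec = np.zeros(max_rank)
    hit = 0
    for rk in range(max_rank):
        if rk < len(te_srt_col) and te_srt_col[rk] in lo_col:
            hit += 1
        prec[rk] = float(hit) / (rk + 1)
        rec[rk] = float(hit) / len(lo_col)
    return prec, rec

# e.g. _rank_metrics_sketch([5, 2, 9], {2, 7}, 3)
#   -> (array([0., 0.5, 0.333...]), array([0., 0.5, 0.5]))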