def split(self, percentage):
    '''
    Get a random splitting of the data with a specified proportion of rows.

    NOTE: it is recommended to use the subdata_row method to get
    deterministic splits.

    Parameters
    ----------
    percentage: the proportion of rows placed in the first split.

    Returns
    ----------
    out: a list [data_split, data_split_comp, selidx_split, selidx_split_comp]
        data_split: a Feedback data set holding `percentage` of the rows,
            whose indices (in the full data set) are given in selidx_split.
        data_split_comp: a Feedback data set of the remaining (1 - percentage)
            rows, whose indices are given in selidx_split_comp. This is the
            complement part of data_split.
        selidx_split: the indices of rows in data_split.
        selidx_split_comp: the indices of rows in data_split_comp.
    '''
    # obtain the indices of the split / complement of the split.
    # NOTE(review): ds.split is presumed to return two disjoint index lists
    # covering all self.num_row rows — defined elsewhere in the project.
    [selidx_split, selidx_split_comp] = ds.split(self.num_row, percentage)

    # acquire data from the splits.
    data_split = self.subdata_row(selidx_split)
    data_split_comp = self.subdata_row(selidx_split_comp)

    return [data_split, data_split_comp, selidx_split, selidx_split_comp]
def split(self, percentage):
    '''
    Randomly partition the rows of this data set into two parts.

    NOTE: for deterministic splits, use the subdata_row method directly.

    Parameters
    ----------
    percentage: the fraction of rows to place in the first part.

    Returns
    ----------
    out: a list [data_split, data_split_comp, selidx_split, selidx_split_comp]
        data_split: a Feedback data set covering `percentage` of the rows;
            its row indices (into the full data set) are selidx_split.
        data_split_comp: the complementary Feedback data set (the other
            1 - percentage of the rows); its row indices are selidx_split_comp.
        selidx_split: row indices of data_split.
        selidx_split_comp: row indices of data_split_comp.
    '''
    # draw the index partition: selected rows and their complement.
    idx_sel, idx_comp = ds.split(self.num_row, percentage)

    # materialize both sub data sets from the index lists.
    part_sel = self.subdata_row(idx_sel)
    part_comp = self.subdata_row(idx_comp)

    return [part_sel, part_comp, idx_sel, idx_comp]
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                          method_list, training_prec, total_iteration):
    '''
    Evaluate each matrix completion method over repeated random splits.

    Parameters
    ----------
    exp_name: a human-readable experiment name.
    daily_data_file: path of the data file to read.
    min_occ_user: minimum occurrence threshold for users.
    min_occ_prog: minimum occurrence threshold for programs.
    method_list: a list of matrix completion models.
    training_prec: the proportion of rows used for training.
    total_iteration: how many random splits to run per method.

    Returns
    ----------
    result: a dict mapping each method's unique_str() to the list of
        per-iteration results from experiment_unit_rand_split.
    '''
    # experiment-category logging shortcut.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)
    #mcpl_log('Data ID: ' + hash(daily_data_file));

    # the experiment id encodes the full configuration (regular hash of the file name).
    exp_id = (exp_name + '_data' + str(hash(daily_data_file))
              + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)
              + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration))
    mcpl_log('Experiment ID: ' + exp_id)

    # load the filtered data, then normalize before any splitting happens.
    data = UtilityDataReader().read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)
    mcpl_log('Normalizing data...')
    data.normalize_row()

    result = {}
    for method in method_list:
        perf_vect = []
        for iteration in range(total_iteration):
            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # reuse the cached split for this iteration when available;
            # otherwise draw a fresh one and persist it as a resource.
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not split:
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split, split_dir)

            split_tr, split_te = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            perf_vect.append(experiment_unit_rand_split(exp_id, method, data_tr, data_te, iteration))
        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')
    return result
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                          method_list, training_prec, total_iteration):
    '''
    Run each matrix completion method over several random train/test splits.

    Parameters
    ----------
    exp_name: a human-readable experiment name.
    daily_data_file: path of the data file to read.
    min_occ_user: minimum occurrence threshold for users.
    min_occ_prog: minimum occurrence threshold for programs.
    method_list: a list of matrix completion models.
    training_prec: the proportion of rows used for training.
    total_iteration: number of random splits evaluated per method.

    Returns
    ----------
    result: a dict mapping each method's unique_str() to the list of
        per-iteration results from experiment_unit_rand_split.
    '''
    # define mcpl_log style (PEP 8: prefer def over a named lambda).
    def mcpl_log(msg):
        Logger.Log(msg, Logger.MSG_CATEGORY_EXP)
    #mcpl_log('Data ID: ' + hash(daily_data_file));

    # here we use a regular hash to build a configuration-unique experiment id.
    exp_id = exp_name + '_data' + str(hash(daily_data_file)) + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
        + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration)
    mcpl_log('Experiment ID: ' + exp_id)

    # save experiment splitting as resources.
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    data.normalize_row()

    result = {}
    for method in method_list:  # do for each method
        perf_vect = []
        for iteration in range(total_iteration):  # do for each iteration for each method
            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration: load the cached split if
            # present, otherwise generate and persist one so the iteration
            # is reproducible across runs.
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not split:
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split, split_dir)

            [split_tr, split_te] = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr, data_te, iteration)
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')
    return result