def experiment_future_program(exp_name, previous_data_files, future_data_file,
                              min_occ_user, min_occ_prog, method_list, top_k):
    '''
    Experiment entrance for future programs (top-k precision).

    Training data is read from ``previous_data_files`` and test data from
    ``future_data_file``, both with the same ``min_occ_user`` /
    ``min_occ_prog`` arguments. Only the training data is row-normalized.
    Every method in ``method_list`` is then handed to
    ``experiment_unit_future_program`` together with both data sets.

    Parameters
    ----------
    exp_name:            a human-readable experiment name (prefix of the ID).
    previous_data_files: passed to the reader to build the training data.
    future_data_file:    passed to the reader to build the test data.
    min_occ_user:        forwarded to ``read_file_with_minval``; part of the ID.
    min_occ_prog:        forwarded to ``read_file_with_minval``; part of the ID.
    method_list:         a list of recommendation models; each must provide
                         ``unique_str()``.
    top_k:               forwarded unchanged to ``experiment_unit_future_program``.

    Returns
    ----------
    A dict mapping ``method.unique_str()`` to the result returned by
    ``experiment_unit_future_program`` for that method.
    '''
    def log_exp(message):
        # all messages of this experiment go to the experiment log category.
        return Logger.Log(message, Logger.MSG_CATEGORY_EXP)

    id_parts = [exp_name, '_mu', str(min_occ_user), '_mp', str(min_occ_prog)]
    exp_id = ''.join(id_parts)
    log_exp('Experimental ID: ' + exp_id)

    data_reader = DailyWatchTimeReader()
    train_set = data_reader.read_file_with_minval(
        previous_data_files, min_occ_user, min_occ_prog)
    test_set = data_reader.read_file_with_minval(
        future_data_file, min_occ_user, min_occ_prog)

    log_exp('Normalization data ...')
    train_set.normalize_row()
    # NOTE(review): the original remark here read "there is no need to
    # normalize train data because we evaluate the hits", yet the training
    # data is what gets normalized; presumably it meant the test data --
    # confirm with the author.

    outcome = {}
    for model in method_list:
        log_exp('Method: ' + model.unique_str())
        model_result = experiment_unit_future_program(
            exp_id, model, train_set, test_set, top_k)
        outcome[model.unique_str()] = model_result

    log_exp('Experiment Done [' + exp_id + ']')
    return outcome
def experiment_leave_k_out(exp_name, daily_data_file, min_occ_user, min_occ_prog,
                           method_list, leave_k_out, total_iteration, top_n, binary=False):
    '''
    Run the leave-k-out experiment for every method over several iterations.

    For each (method, iteration) pair a leave-k-out index is loaded from the
    resource store, or generated with ``ds.leave_k_out`` and saved when none
    is stored. The data is split with that index and handed to
    ``experiment_unit_leave_k_out``. The resource name of the index does not
    contain the method, so all methods look up the same resource for a given
    iteration.

    Parameters
    ----------
    @param exp_name:        the experiment name (prefix)
    @param daily_data_file: a data file, or a list of data files, handed to
                            DailyWatchTimeReader.read_file_with_minval.
    @param min_occ_user:    forwarded to the reader; also the strict upper
                            bound for leave_k_out.
    @param min_occ_prog:    forwarded to the reader.
    @param method_list:     models to evaluate; each must provide unique_str().
    @param leave_k_out:     leave k out for each user. The k must be strictly
                            less than min_occ_user.
    @param total_iteration: number of iterations run for each method.
    @param top_n:           forwarded unchanged to experiment_unit_leave_k_out.
    @param binary:          if this is set to true then the binary data is used
                            (non-zero set to 1); otherwise the data is
                            row-normalized.

    Returns
    ----------
    @return a dict mapping method.unique_str() to the list of per-iteration
            results of that method.

    Raises
    ----------
    ValueError: if leave_k_out >= min_occ_user.
    '''
    if leave_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out should be strictly less than min_occ_user.')

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # A list of files is unhashable, so hash its tuple form instead (the same
    # convention experiment_coldstart_map uses). A single file hashes exactly
    # as before, so existing experiment IDs are unchanged.
    # NOTE: on Python 3 str hashes are salted per process unless PYTHONHASHSEED
    # is fixed, so IDs built from hash() are only reproducible across runs
    # when hashing is deterministic.
    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))

    # construct exp_id; the binary variant only differs in its prefix.
    exp_prefix = 'lko_bi_' if binary else 'lko_'
    exp_id = (exp_prefix + exp_name + '_data' + hash_file_str
              + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)
              + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration))
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        lko_log('Normalizing data...')
        data.normalize_row()

    result = {}
    for method in method_list:
        # do for each method
        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.
            lko_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_lvidx_iter' + str(iteration)
            split_dir = exp_id + '/lv_idx'
            leave_k_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not leave_k_out_idx:
                # nothing stored yet: generate the index and persist it.
                leave_k_out_idx = ds.leave_k_out(data, leave_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str,
                                 leave_k_out_idx, split_dir)

            # split the k items off as a separate data set.
            [data_left, data_tr] = data.leave_k_out(leave_k_out_idx)

            iter_result = experiment_unit_leave_k_out(exp_id, method,
                                                      data_tr, data_left, iteration, top_n)
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    return result
Created on Jan 30, 2014 @author: Shiyu C. ([email protected]) ''' from rs.data.daily_watchtime import DailyWatchTimeReader from rs.algorithms.recommendation.TriUHV import TriUHV from rs.data.recdata import FeedbackData if __name__ == '__main__': filename = "../../../datasample/agg_duid_pid_watchtime_genre/20131209_100000" # load data. reader = DailyWatchTimeReader() feedback_data = reader.read_file_with_minval(filename, 1, 1) feedback_data.normalize_row() # build model with 3 latent factors. r = 5 # the L_2 norm regularizer lamb = 0.2 # the stopping delta value delta = 0.01 # the maximium iteration number maxiter = 500 # TriUHV_model = TriUHV(r,lamb,delta,maxiter, verbose = True); # TriUHV_model.train(feedback_data); ''' # test.
Logger.GetInstance().display_level = 10; mcpl_log('Data file: ' + filename); mcpl_log('ROVI daily file: ' + rovi_daily_file); # build ROVI daily mapping mcpl_log('Building ROVI daily mapping'); rovi_mapping = {}; with open(rovi_daily_file) as csvfile: rovi_reader = csv.reader(csvfile, delimiter = '\t', quotechar = '|') for row in rovi_reader: rovi_mapping[row[3]] = row[7]; # load data from file and transform into a sparse matrix. reader = DailyWatchTimeReader(); fbdata = reader.read_file_with_minval(filename, 5, 5); mat = coo_matrix((fbdata.data_val, (fbdata.data_row, fbdata.data_col)), \ shape = (fbdata.num_row, fbdata.num_col)); # memo: if we do multiple days, we might use coo_matrix summation, but we need # to align the program and user. # we have a mapping from program id to row. program_mapping = fbdata.col_mapping; # from which we build a reverse mapping from row id to program # the reverse mapping allows us to find program ID from matrix position. program_inv_mapping = {y: x for x, y in program_mapping.items()}; # check the consistency. if not (len(program_mapping) == len(program_inv_mapping)): raise ValueError('Mapping inverse error!'); program_num = len(program_mapping);
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog,
                          method_list, training_prec, total_iteration):
    '''
    Run the random row-split experiment for every method over several iterations.

    The data is read once and row-normalized before splitting. For each
    (method, iteration) pair a split is loaded from the resource store, or
    generated with ``ds.split`` and saved when none is stored. The two row
    subsets are then handed to ``experiment_unit_rand_split``.

    Parameters
    ----------
    exp_name:        a human-readable experiment name (prefix of the ID).
    daily_data_file: a data file, or a list of data files, handed to
                     DailyWatchTimeReader.read_file_with_minval.
    min_occ_user:    forwarded to the reader; part of the ID.
    min_occ_prog:    forwarded to the reader; part of the ID.
    method_list:     a list of matrix completion models; each must provide
                     ``unique_str()``.
    training_prec:   passed to ``ds.split`` together with the number of rows
                     (presumably the training fraction -- confirm).
    total_iteration: number of iterations run for each method.

    Returns
    ----------
    A dict mapping ``method.unique_str()`` to the list of per-iteration
    results of that method.
    '''
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # here we use a regular hash. A list of files is unhashable, so hash its
    # tuple form instead (the convention experiment_coldstart_map uses); a
    # single file hashes exactly as before, so existing IDs are unchanged.
    # NOTE: on Python 3 str hashes are salted per process unless PYTHONHASHSEED
    # is fixed, so IDs built from hash() are only reproducible across runs
    # when hashing is deterministic.
    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))

    exp_id = (exp_name + '_data' + hash_file_str
              + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)
              + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration))
    mcpl_log('Experiment ID: ' + exp_id)

    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    data.normalize_row()

    result = {}
    for method in method_list:
        # do for each method
        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method
            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration; splits are saved as resources.
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not split:
                # nothing stored yet: generate the split and persist it.
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split, split_dir)

            [split_tr, split_te] = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr, data_te, iteration)
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')
    return result
from rs.data.daily_watchtime import DailyWatchTimeReader from rs.algorithms.recommendation.HierLat import HierLat if __name__ == '__main__': # load data. reader = DailyWatchTimeReader(); #filename = "../../../datasample/agg_duid_pid_watchtime_genre/20131209_100000"; #feedback_data = reader.read_file_with_minval(filename, 1, 1); filename = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209/part-r-00000"; #feedback_data = reader.read_file_with_minval(filename, 25, 300); feedback_data = reader.read_file_with_minval(filename, 35, 300); print feedback_data; print 'Maximum Genre.' print np.max(feedback_data.meta['pggr_gr']) + 1; print 'Normalizing data.' feedback_data.normalize_row(); # build model with 3 latent factors. r = 5; # the L_2 norm regularizer lamb = 0.001; # the stopping delta value delta = 0.01;
@author: jiayu.zhou ''' from rs.data.daily_watchtime import DailyWatchTimeReader if __name__ == '__main__': #daily_data_file = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/test_comb/test"; #daily_data_file_p1 = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/test_comb/test1"; #daily_data_file_p2 = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/test_comb/test2"; daily_data_file = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131210/part"; daily_data_file_p1 = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131210/part-1"; daily_data_file_p2 = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131210/part-2"; min_occ_user = 0; min_occ_prog = 0; reader = DailyWatchTimeReader(); #[occur_duid1, occur_pid1, cnt_duid1, cnt_pid1] = reader.read_file_info(daily_data_file); #[occur_duid2, occur_pid2, cnt_duid2, cnt_pid2] = reader.read_file_info([daily_data_file_p1, daily_data_file_p2]); data1 = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog); print data1; #print data1.get_sparse_matrix().todense(); data2 = reader.read_file_with_minval([daily_data_file_p1, daily_data_file_p2], \ min_occ_user, min_occ_prog); print data2; #print data2.get_sparse_matrix().todense();
Created on Feb 4, 2014 @author: jiayu.zhou ''' from rs.data.daily_watchtime import DailyWatchTimeReader from rs.data.recdata import share_row_data; if __name__ == '__main__': reader = DailyWatchTimeReader(); filename1 = "../../datasample/agg_duid_pid_watchtime_genre/toy_small_day1"; filename2 = "../../datasample/agg_duid_pid_watchtime_genre/toy_small_day2"; fb_data1 = reader.read_file_with_minval(filename1, 0, 0); fb_data2 = reader.read_file_with_minval(filename2, 0, 0); print 'Matrix 1' print fb_data1.row_mapping; print fb_data1.col_mapping; print fb_data1.get_sparse_matrix().todense(); print 'Matrix 2' print fb_data2.row_mapping; print fb_data2.col_mapping; print fb_data2.get_sparse_matrix().todense(); # get the share. [fb_data1_share, fb_data2_share] = share_row_data(fb_data1, fb_data2);
num_prog = 3000 total_iteration = 2 iteration = 1 # iteration out of total_iteration. leave_k_out = 20 lafactor = 5 method = HierLat(latent_factor=lafactor) hash_file_str = str(hash(tuple(daily_data_file))) reader = DailyWatchTimeReader() feedback_data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog, num_user, num_prog) exp_id = 'lko_bi_' + exp_name + '_data' + hash_file_str\ + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \ + '_nu' + str(num_user) + '_np' + str(num_prog) \ + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration) result_resource_str = 'exp' + exp_id + \ '_method' + method.unique_str() + \ '_iter' + str(iteration) sub_folder = exp_id + '/models/' + method.unique_str() # use a sub folder to store the experiment resource. trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
num_user = 10000; num_prog = 3000; total_iteration = 2; iteration = 1; # iteration out of total_iteration. leave_k_out = 20; lafactor = 5; method = HierLat(latent_factor=lafactor); hash_file_str = str(hash(tuple(daily_data_file))); reader = DailyWatchTimeReader(); feedback_data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog, num_user, num_prog); exp_id = 'lko_bi_' + exp_name + '_data' + hash_file_str\ + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \ + '_nu' + str(num_user) + '_np' + str(num_prog) \ + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration); result_resource_str = 'exp' + exp_id + \ '_method' + method.unique_str() + \ '_iter' + str(iteration); sub_folder = exp_id + '/models/' + method.unique_str(); # use a sub folder to store the experiment resource. trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder); [method] = trained_model; learnt_genre = method.V;
if __name__ == '__main__':
    # Sample file used when no path is supplied on the command line.
    # (The original also kept a commented-out full-day alternative:
    #  /Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209/part-r-00000)
    filename = "../../datasample/agg_duid_pid_watchtime_genre/20131209_100000"

    if len(sys.argv) != 1:
        # the first command-line argument overrides the sample file.
        filename = sys.argv[1]
    else:
        print('Use default sample data.')

    # NOTE(review): the nesting of this print could not be recovered from the
    # source layout; confirm whether it should only run when a path is given.
    print('processing file ' + filename)

    # Read the file with both minimum-occurrence arguments set to 1.
    # Earlier exploratory variants (reading with 5/5, printing the pggr_pg /
    # pggr_gr sizes, histograms of the readFileInfo occurrence counts) were
    # already commented out in the original script.
    watch_time_reader = DailyWatchTimeReader()
    dataStruct = watch_time_reader.read_file_with_minval(filename, 1, 1)
    print(dataStruct)

    # show the stored values after row normalization.
    dataStruct.normalize_row()
    print(dataStruct.data_val)
def experiment_coldstart_map(exp_name, daily_data_file,
                             min_occ_user, min_occ_prog, num_user, num_prog,
                             method_list, blind_k_out, total_iteration, max_rank, binary=False):
    '''
    Run the cold-start (blind-k-out) experiment for every method over several
    iterations.

    For each (method, iteration) pair a blind-out index is loaded from the
    resource store, or generated with ``ds.sample_num(data.num_col,
    blind_k_out)`` and saved when none is stored. The data is split with
    ``data.blind_k_out`` and handed to ``experiment_unit_leave_k_out_map``.

    Parameters
    ----------
    @param exp_name:        the experiment name (prefix)
    @param daily_data_file: a data file or a list of files.
    @param min_occ_user:    cold start user criteria; forwarded to the reader
                            and used as the strict upper bound for blind_k_out.
    @param min_occ_prog:    forwarded to the reader (presumably the cold start
                            program criteria -- confirm).
    @param num_user:        the number of users selected in the experiment.
    @param num_prog:        the number of programs selected in the experiment.
    @param method_list:     models to evaluate; each must provide unique_str().
    @param blind_k_out:     the k passed to ds.sample_num together with
                            data.num_col to pick the blinded-out indices. The
                            k must be strictly less than min_occ_user.
    @param total_iteration: number of iterations run for each method.
    @param max_rank:        forwarded unchanged to
                            experiment_unit_leave_k_out_map.
    @param binary:          if this is set to true then the binary data is used
                            (non-zero set to 1); otherwise the data is
                            row-normalized.

    Returns
    ----------
    @return a dict mapping method.unique_str() to the list of per-iteration
            results of that method.

    Raises
    ----------
    ValueError: if blind_k_out >= min_occ_user.
    '''
    # Single-argument print calls give the same output under Python 2 and
    # Python 3; the doubled space reproduces what the comma-separated print
    # statement emitted.
    print('Blind k out: k =  ' + str(blind_k_out))
    print('Min_occ_user:  ' + str(min_occ_user))
    print('Min_occ_prog:  ' + str(min_occ_prog))

    if blind_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out [' + str(blind_k_out)
                         + '] should be strictly less than min_occ_user ['
                         + str(min_occ_user) + '].')

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # a list of files is unhashable, so its tuple form is hashed instead.
    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))

    # construct exp_id; the binary variant only differs in its prefix.
    exp_prefix = 'cst_bi_' if binary else 'cst_'
    exp_id = (exp_prefix + exp_name + '_data' + hash_file_str
              + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)
              + '_nu' + str(num_user) + '_np' + str(num_prog)
              + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration))
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog,
                                        num_user, num_prog)
    lko_log('Data loaded: ' + str(data))

    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        lko_log('Normalizing data...')
        data.normalize_row()

    result = {}
    for method in method_list:
        # do for each method
        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.
            lko_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_blind_idx_iter' + str(iteration)
            split_dir = exp_id + '/blind_idx'
            blind_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not blind_out_idx:
                # nothing stored yet: sample the indices to blind out and persist them.
                blind_out_idx = ds.sample_num(data.num_col, blind_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str,
                                 blind_out_idx, split_dir)

            # NOTE(review): logged whether the index was loaded or generated;
            # the original nesting could not be recovered -- confirm.
            lko_log('Blind index done.')

            # split the k items off as a separate data set.
            [data_tr, data_left] = data.blind_k_out(blind_out_idx)

            lko_log('Start index')
            iter_result = experiment_unit_leave_k_out_map(exp_id, method,
                                                          data_tr, data_left,
                                                          iteration, max_rank)
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    return result
Created on Feb 4, 2014 @author: jiayu.zhou ''' from rs.data.daily_watchtime import DailyWatchTimeReader from rs.data.recdata import share_row_data if __name__ == '__main__': reader = DailyWatchTimeReader() filename1 = "../../datasample/agg_duid_pid_watchtime_genre/toy_small_day1" filename2 = "../../datasample/agg_duid_pid_watchtime_genre/toy_small_day2" fb_data1 = reader.read_file_with_minval(filename1, 0, 0) fb_data2 = reader.read_file_with_minval(filename2, 0, 0) print 'Matrix 1' print fb_data1.row_mapping print fb_data1.col_mapping print fb_data1.get_sparse_matrix().todense() print 'Matrix 2' print fb_data2.row_mapping print fb_data2.col_mapping print fb_data2.get_sparse_matrix().todense() # get the share. [fb_data1_share, fb_data2_share] = share_row_data(fb_data1, fb_data2)
Logger.GetInstance().display_level = 10 mcpl_log('Data file: ' + filename) mcpl_log('ROVI daily file: ' + rovi_daily_file) # build ROVI daily mapping mcpl_log('Building ROVI daily mapping') rovi_mapping = {} with open(rovi_daily_file) as csvfile: rovi_reader = csv.reader(csvfile, delimiter='\t', quotechar='|') for row in rovi_reader: rovi_mapping[row[3]] = row[7] # load data from file and transform into a sparse matrix. reader = DailyWatchTimeReader() fbdata = reader.read_file_with_minval(filename, 5, 5) mat = coo_matrix((fbdata.data_val, (fbdata.data_row, fbdata.data_col)), \ shape = (fbdata.num_row, fbdata.num_col)) # memo: if we do multiple days, we might use coo_matrix summation, but we need # to align the program and user. # we have a mapping from program id to row. program_mapping = fbdata.col_mapping # from which we build a reverse mapping from row id to program # the reverse mapping allows us to find program ID from matrix position. program_inv_mapping = {y: x for x, y in program_mapping.items()} # check the consistency. if not (len(program_mapping) == len(program_inv_mapping)): raise ValueError('Mapping inverse error!')
""" Created on Jan 31, 2014 @author: jiayu.zhou """ from rs.data.daily_watchtime import DailyWatchTimeReader if __name__ == "__main__": filename = "../../datasample/agg_duid_pid_watchtime_genre/20131209_100000" reader = DailyWatchTimeReader() dataStruct = reader.read_file_with_minval(filename, 7, 1) print dataStruct print dataStruct.row_mapping print dataStruct.get_sparse_matrix().todense() print "-----------------" print ">>>subsample 3 rows" [subdata_row, subidx] = dataStruct.subsample_row(3) print subidx print subdata_row.get_sparse_matrix().todense() print subdata_row.row_mapping print "-----------------" print ">>>subsample 50% rows"
This is a testing pipeline for KDD_2014 algorithm. Created Feb 7, 2014 @author: Shiyu C. ([email protected]) ''' from rs.data.daily_watchtime import DailyWatchTimeReader from rs.algorithms.recommendation.CF_ONMTF import CF_ONMTF if __name__ == '__main__': filename = "../../../datasample/agg_duid_pid_watchtime_genre/20131209_100000"; # load data. reader = DailyWatchTimeReader(); feedback_data = reader.read_file_with_minval(filename, 1, 1); feedback_data.normalize_row(); # build model with 3 latent factors. r = 5; # the L_2 norm regularizer lamb = 0.2; # the stopping delta value delta = 0.01; # the maximium iteration number maxiter = 500; CF_ONMTF_model = CF_ONMTF(r,lamb,delta,maxiter, verbose = True); CF_ONMTF_model.train(feedback_data); ''' # test.
'''
Export one daily watch-time file to a Matlab .mat file.

Created on Feb 13, 2014

@author: jiayu.zhou
'''
import scipy.io as sio
from rs.data.daily_watchtime import DailyWatchTimeReader

if __name__ == '__main__':
    daily_data_file = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209/part-r-00000"
    sparse_mat_file = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209_sparse.mat"

    # read the file with both minimum-occurrence arguments set to 1.
    feedback = DailyWatchTimeReader().read_file_with_minval(daily_data_file, 1, 1)
    data_mat = feedback.get_sparse_matrix()

    # Directly saving the sparse matrix structure was disabled in the original:
    #   sio.savemat("/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209.mat", {'data': data_mat})
    # Instead the matrix is exported field by field: its data / row / col
    # attributes plus both dimensions.
    exported_fields = {
        'data': data_mat.data,
        'i': data_mat.row,
        'j': data_mat.col,
        'm': data_mat.shape[0],
        'n': data_mat.shape[1],
    }
    sio.savemat(sparse_mat_file, exported_fields)

    print('Done')
'''
Dump a daily watch-time file as a sparse matrix for Matlab.

Created on Feb 13, 2014

@author: jiayu.zhou
'''
import scipy.io as sio
from rs.data.daily_watchtime import DailyWatchTimeReader

if __name__ == '__main__':
    daily_data_file = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209/part-r-00000"

    watch_time_reader = DailyWatchTimeReader()
    # both minimum-occurrence arguments are 1 for this export.
    loaded = watch_time_reader.read_file_with_minval(daily_data_file, 1, 1)
    data_mat = loaded.get_sparse_matrix()

    ### direct save of the sparse matrix data structure to Matlab (disabled in
    ### the original script):
    # sio.savemat("/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209.mat", {'data': data_mat})
    ###

    # write the values, their row / column index arrays and the matrix dimensions.
    values = data_mat.data
    row_idx = data_mat.row
    col_idx = data_mat.col
    dims = data_mat.shape
    sio.savemat("/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209_sparse.mat",
                {'data': values, 'i': row_idx, 'j': col_idx,
                 'm': dims[0], 'n': dims[1]})

    print('Done')