def naive_baseline(omitMovie=True):
    """Run the plain libFM regression baseline (no active-learning step).

    The sorted rating data is sliced twice with ``make_slice`` -- once with
    ``range(1, 1201)`` for training and once with ``range(1201, 1441)`` for
    testing -- and each slice is exported both as raw data and as libFM input.
    libFM is then run as a regression task with MCMC and the result is scored
    by ``compute_error.computer_error()``.

    :param omitMovie: forwarded unchanged to ``generate_libfm_data`` for both
        the training and the test export.
    """
    train_slice = range(1, 1201)
    test_slice = range(1201, 1441)
    regression_command = (
        "./Generate/libFM -task r -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -dim '1,1, 80' -out Generate/prediction"
    )
    # Alternative SGD configuration kept from the original for reference:
    # subprocess.call("./Generate/libFM -task r -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
    #                 "-method sgd -dim '1,1, 80' -learn_rate 0.001 -iter 160 -out Generate/prediction", shell=True)

    db = MovieDataBase()
    db.generate_complete_rating_data(regenerate=False)
    db.make_sorted_rating_data([5, 1, 0])
    # Keep a copy of the sorted ratings: it is loaded back below before the
    # test slice is cut.
    full_sorted_ratings = db.out_rating_data[:]
    db.synchronize()

    # --- training export ---
    db.make_slice(count_movie_slice=train_slice)
    db.synchronize()
    db.store_data_to_file(fileName='train_original_data')
    db.generate_libfm_data(omitMovie=omitMovie)
    db.store_data_to_file(db.libfm_data, 'train_step1.libfm')

    # --- test export (not shuffled) ---
    db.load_core_rating_data(full_sorted_ratings)
    db.make_slice(count_movie_slice=test_slice)
    db.synchronize()
    db.store_data_to_file(fileName='test_original_data')
    db.generate_libfm_data(omitMovie=omitMovie, shuffle=False)
    db.store_data_to_file(db.libfm_data, 'test_step1.libfm')

    subprocess.call(regression_command, shell=True)
    compute_error.computer_error()
def baseline1_random(active_learning_ratio=0.1):
    """Baseline 1: request ratings for uniformly random test rows, then retrain.

    Random row positions of ``dataProcess.test_addAllNegative_data`` are
    chosen as (user, movie) rating requests. Every request that has a rating
    in ``dataProcess.test_original_data`` is appended to the training data,
    libFM is retrained with SGD and the result is scored by
    ``compute_error.computer_error()``.

    NOTE(review): this file defines ``baseline1_random`` more than once; a
    later definition replaces this one when the module is loaded.

    :param active_learning_ratio: fraction of ``TOTAL_USERS`` used to size the
        request budget, ``(int(TOTAL_USERS * ratio) - 1) * 240``.
    """
    print('\n===================================')
    print('baseline1 with random choosing active learning data started')
    print('===================================')

    data_base = dataProcess.movieDataBase
    candidate_rows = dataProcess.test_addAllNegative_data
    number_of_index = len(candidate_rows)

    # NOTE(review): 240 is hard-coded; presumably the number of test movies
    # (other variants use dataProcess.TEST_MOVIES) -- confirm.
    number_of_active_data = (int(data_base.TOTAL_USERS * active_learning_ratio) - 1) * 240
    # Clamp the budget to what can actually be drawn. The original rejection
    # loop never terminated when the budget exceeded the number of rows, and
    # random.randint(0, -1) raised on empty data.
    number_of_active_data = max(0, min(number_of_active_data, number_of_index))

    # Choose distinct row positions uniformly at random.
    selected_data_positions = random.sample(range(number_of_index), number_of_active_data)
    alternative_user_movie_list = [
        [candidate_rows[index][0], candidate_rows[index][1]] for index in selected_data_positions
    ]
    # movieDataBase.make_alternative_user_movie_matrix(alternative_user_movie_list)

    # Add the active-learning results to train_original_data.
    print('\n===================================')
    print('active learning regression started')
    print('===================================')
    data_base.make_user_movie_rating_matrix(dataProcess.test_original_data)
    rating_matrix = data_base.user_movie_rating_matrix

    # This scheme adds every requested pair that actually has a rating.
    active_learning_train_data = []
    for userId, movieId in alternative_user_movie_list:
        ratings = rating_matrix.get(userId)
        if ratings is None:
            continue
        rating = ratings.get(movieId)
        if rating is not None:
            active_learning_train_data.append([userId, movieId, rating])
    count = len(active_learning_train_data)

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    data_base.store_data_to_file(train_add_active_learning_data, fileName='train_add_active_learning_data')
    data_base.generate_libfm_data(train_add_active_learning_data)
    data_base.store_data_to_file(data_base.libfm_data, fileName='train_step3.libfm')

    # An MCMC variant ("-method mcmc -out Generate/prediction") was used
    # previously and is disabled in the original.
    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 200' -learn_rate 0.001 -iter 30 -out Generate/prediction",
        shell=True,
    )
    # Formatted so the output is identical under Python 2 and Python 3.
    print('number of alternative user-movie requests= %d' % len(alternative_user_movie_list))
    print('number of gained active learning user-movie data= %d' % count)
    compute_error.computer_error()
def baseline2_random_after_classification(active_learning_ratio=0.1):
    """Baseline 2: per test movie, randomly pick users among predicted positives.

    Step 1 runs libFM as a binary classifier; the positions returned by
    ``compute_classification.compute_classification`` mark (user, movie) rows
    of ``dataProcess.test_addAllNegative_data`` as positive. For every test
    movie a fixed number of users is requested: positives first, topped up
    with uniformly random users when there are not enough positives.
    Requests that have a rating in ``dataProcess.test_original_data`` are
    added to the training data, libFM is retrained with SGD and the result is
    scored by ``compute_error.computer_error()``.

    NOTE(review): this file defines this function name more than once; a
    later definition replaces this one when the module is loaded.

    :param active_learning_ratio: fraction of ``TOTAL_USERS`` requested per
        movie; the per-movie budget is ``int(TOTAL_USERS * ratio) - 1``.
    """
    print("\n===================================")
    print("baseline2 random choosing active learning data after classification started")
    print("===================================")

    # The test data is still in movieDataBase.core. Next is step 1.
    print("\n===================================")
    print("step 1 binary classification started")
    print("===================================")
    subprocess.call(
        "./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -iter 1 -out Generate/prediction",
        shell=True,
    )
    # An SGD variant ("-method sgd -learn_rate 0.01") is disabled in the original.
    selected_data_positions = compute_classification.compute_classification(len(dataProcess.test_original_data))

    data_base = dataProcess.movieDataBase
    total_users = data_base.TOTAL_USERS

    # Boolean lookup table indexed directly by userId / movieId (row/column 0
    # is allocated but not addressed by the 1-based loops below).
    positive_user_id_movie_id = numpy.zeros([total_users + 1, data_base.TOTAL_MOVIES + 1], dtype=numpy.bool_)
    for index in selected_data_positions:
        row = dataProcess.test_addAllNegative_data[index]
        positive_user_id_movie_id[row[0], row[1]] = True

    # Per-movie budget. Clamped to [0, total_users]: the original top-up loop
    # never terminated for ratios above 1, and a budget of -1 used as a slice
    # bound dropped the last user instead of selecting none.
    # NOTE(review): the collapsed original is ambiguous about whether the
    # "- 1" truncation applied to both branches; the same budget is used for
    # both here, matching the other variants in this file.
    users_per_movie = min(max(0, int(total_users * active_learning_ratio) - 1), total_users)

    alternative_user_movie_list = []
    for choose_movie_num in range(1, dataProcess.TEST_MOVIES + 1):
        movieId = dataProcess.test_from_movie_num_get_movie_id[choose_movie_num]

        # Users 1..total_users predicted positive for this movie.
        positive_users = [int(u) for u in numpy.flatnonzero(positive_user_id_movie_id[1:, movieId]) + 1]

        if len(positive_users) >= users_per_movie:
            # More positives than budget: draw a random subset. The original
            # kept the first entries of list(set), which is not a random pick.
            chosen_users = random.sample(positive_users, users_per_movie)
        else:
            # Keep all positives and top up with uniformly random users.
            chosen_users = set(positive_users)
            while len(chosen_users) < users_per_movie:
                chosen_users.add(random.randint(1, total_users))

        for choose_user_id in chosen_users:
            alternative_user_movie_list.append([choose_user_id, movieId])

    # Add the active-learning results to train_original_data.
    print("\n===================================")
    print("step active learning regression started")
    print("===================================")
    data_base.make_user_movie_rating_matrix(dataProcess.test_original_data)
    rating_matrix = data_base.user_movie_rating_matrix

    # This scheme adds every requested pair that actually has a rating.
    active_learning_train_data = []
    for userId, movieId in alternative_user_movie_list:
        ratings = rating_matrix.get(userId)
        if ratings is None:
            continue
        rating = ratings.get(movieId)
        if rating is not None:
            active_learning_train_data.append([userId, movieId, rating])
    count = len(active_learning_train_data)

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    data_base.store_data_to_file(train_add_active_learning_data, fileName="train_add_active_learning_data")
    data_base.generate_libfm_data(train_add_active_learning_data)
    data_base.store_data_to_file(data_base.libfm_data, fileName="train_step3.libfm")

    # An MCMC variant ("-method mcmc -out Generate/prediction") is disabled
    # in the original.
    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 200' -learn_rate 0.01 -iter 30 -out Generate/prediction",
        shell=True,
    )
    # Formatted so the output is identical under Python 2 and Python 3.
    print("number of alternative user-movie requests= %d" % len(alternative_user_movie_list))
    print("number of gained active learning user-movie data= %d" % count)
    compute_error.computer_error()
def experiment2_user_qualification_invited_limitation(active_learning_ratio=0.1, MAX_INVITED=50):
    """Experiment 2 with an invitation cap: prefer low-MSE users, limit requests per user.

    Step 1 runs libFM as a binary classifier to mark (user, movie) rows of
    ``dataProcess.test_addAllNegative_data`` as positive. Test movies are then
    visited in random order; for each movie, users are taken in ascending
    order of ``dataProcess.movieDataBase.user_MSE``: first those predicted
    positive for the movie, then any other user, skipping users that already
    received ``MAX_INVITED`` requests. Requests that have a rating in
    ``dataProcess.test_original_data`` are added to the training data, libFM
    is retrained with SGD and the result is scored by
    ``compute_error.computer_error()``.

    :param active_learning_ratio: fraction of ``TOTAL_USERS`` requested per
        movie; the per-movie budget is ``int(TOTAL_USERS * ratio) - 1``.
    :param MAX_INVITED: maximum number of movies a single user may be asked
        to rate.
    """
    print("\n===================================")
    print("experiment2 choosing qualified active learning data after classification started with invitation limit")
    print("===================================")

    # The test data is still in movieDataBase.core. Next is step 1.
    print("\n===================================")
    print("step 1 binary classification started")
    print("===================================")
    subprocess.call(
        "./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -iter 1 -out Generate/prediction",
        shell=True,
    )
    # An SGD variant ("-method sgd -learn_rate 0.01") is disabled in the original.
    selected_data_positions = compute_classification.compute_classification(len(dataProcess.test_original_data))

    print("\n===================================")
    print("choosing users with low rating MSE")
    print("===================================")
    data_base = dataProcess.movieDataBase
    total_users = data_base.TOTAL_USERS

    positive_user_id_movie_id = numpy.zeros([total_users + 1, data_base.TOTAL_MOVIES + 1], dtype=numpy.bool_)
    for index in selected_data_positions:
        row = dataProcess.test_addAllNegative_data[index]
        positive_user_id_movie_id[row[0], row[1]] = True

    # User ids ordered by ascending MSE (ties keep index order).
    # NOTE(review): enumerate() starts at index 0 while user ids elsewhere in
    # this file start at 1; if user_MSE[0] is a placeholder it can still be
    # selected here, as in the original -- confirm.
    users_by_mse = [userId for userId, _ in sorted(enumerate(data_base.user_MSE), key=lambda pair: pair[1])]

    # Per-movie budget, clamped at 0: the original used int(...) - 1 directly
    # as a slice bound, so a value of -1 dropped the last user instead of
    # selecting none.
    users_per_movie = max(0, int(total_users * active_learning_ratio) - 1)
    user_invited_times = [0] * (total_users + 1)
    alternative_user_movie_list = []

    # Notice that the movie order is shuffled here. list(...) keeps this
    # working where range() is not a list.
    movie_choose_list = list(range(1, dataProcess.TEST_MOVIES + 1))
    random.shuffle(movie_choose_list)

    for choose_movie_num in movie_choose_list:
        movieId = dataProcess.test_from_movie_num_get_movie_id[choose_movie_num]

        # Users that may still be invited, in ascending-MSE order.
        eligible_users = [u for u in users_by_mse if user_invited_times[u] < MAX_INVITED]

        # First: qualified users predicted positive for this movie. A list
        # (not a set) keeps the MSE ordering, so truncating to the budget
        # keeps the lowest-MSE users. The original collected them in a set
        # and lost that ordering before truncating.
        chosen_users = [u for u in eligible_users if positive_user_id_movie_id[u][movieId]][:users_per_movie]

        # Then: top up with the remaining lowest-MSE eligible users.
        if len(chosen_users) < users_per_movie:
            already_chosen = set(chosen_users)
            for u in eligible_users:
                if u in already_chosen:
                    continue
                chosen_users.append(u)
                if len(chosen_users) >= users_per_movie:
                    break

        for choose_user_id in chosen_users:
            alternative_user_movie_list.append([choose_user_id, movieId])
            user_invited_times[choose_user_id] += 1

    # Add the active-learning results to train_original_data.
    print("\n===================================")
    print("step active learning regression started")
    print("===================================")
    data_base.make_user_movie_rating_matrix(dataProcess.test_original_data)
    rating_matrix = data_base.user_movie_rating_matrix

    # This scheme adds every requested pair that actually has a rating.
    active_learning_train_data = []
    for userId, movieId in alternative_user_movie_list:
        ratings = rating_matrix.get(userId)
        if ratings is None:
            continue
        rating = ratings.get(movieId)
        if rating is not None:
            active_learning_train_data.append([userId, movieId, rating])
    count = len(active_learning_train_data)

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    data_base.store_data_to_file(train_add_active_learning_data, fileName="train_add_active_learning_data")
    data_base.generate_libfm_data(train_add_active_learning_data)
    data_base.store_data_to_file(data_base.libfm_data, fileName="train_step3.libfm")

    # An MCMC variant ("-method mcmc -out Generate/prediction") is disabled
    # in the original.
    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 200' -learn_rate 0.01 -iter 30 -out Generate/prediction",
        shell=True,
    )
    # Formatted so the output is identical under Python 2 and Python 3.
    print("number of alternative user-movie requests= %d" % len(alternative_user_movie_list))
    print("number of gained active learning user-movie data= %d" % count)
    compute_error.computer_error()
def baseline1_random(active_learning_ratio=0.1):
    """Baseline 1: for every test movie, request ratings from random users.

    For each of the ``dataProcess.TEST_MOVIES`` test movies a fixed number of
    distinct user ids is drawn uniformly from ``1..TOTAL_USERS``. Requests
    that have a rating in ``dataProcess.test_original_data`` are added to the
    training data, libFM is retrained with SGD and the result is scored by
    ``compute_error.computer_error()``.

    NOTE(review): this file defines ``baseline1_random`` more than once; the
    last definition in the file is the one in effect after import.

    :param active_learning_ratio: fraction of ``TOTAL_USERS`` requested per
        movie; the per-movie budget is ``int(TOTAL_USERS * ratio) - 1``.
    """
    print("\n===================================")
    print("baseline1 with random choosing active learning data started")
    print("===================================")

    data_base = dataProcess.movieDataBase
    total_users = data_base.TOTAL_USERS

    # Per-movie budget, clamped to [0, total_users]: the original rejection
    # loop never terminated for ratios above 1, and a budget of -1 used as a
    # slice bound dropped one user instead of selecting none.
    users_per_movie = min(max(0, int(total_users * active_learning_ratio) - 1), total_users)
    all_user_ids = list(range(1, total_users + 1))

    # Choose the active-learning requests randomly.
    alternative_user_movie_list = []
    for choose_movie_num in range(1, dataProcess.TEST_MOVIES + 1):
        movieId = dataProcess.test_from_movie_num_get_movie_id[choose_movie_num]
        for choose_user_id in random.sample(all_user_ids, users_per_movie):
            alternative_user_movie_list.append([choose_user_id, movieId])
    # movieDataBase.make_alternative_user_movie_matrix(alternative_user_movie_list)

    # Add the active-learning results to train_original_data.
    print("\n===================================")
    print("active learning regression started")
    print("===================================")
    data_base.make_user_movie_rating_matrix(dataProcess.test_original_data)
    rating_matrix = data_base.user_movie_rating_matrix

    # This scheme adds every requested pair that actually has a rating.
    active_learning_train_data = []
    for userId, movieId in alternative_user_movie_list:
        ratings = rating_matrix.get(userId)
        if ratings is None:
            continue
        rating = ratings.get(movieId)
        if rating is not None:
            active_learning_train_data.append([userId, movieId, rating])
    count = len(active_learning_train_data)

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    data_base.store_data_to_file(train_add_active_learning_data, fileName="train_add_active_learning_data")
    data_base.generate_libfm_data(train_add_active_learning_data)
    data_base.store_data_to_file(data_base.libfm_data, fileName="train_step3.libfm")

    # An MCMC variant ("-method mcmc -out Generate/prediction") is disabled
    # in the original.
    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 200' -learn_rate 0.001 -iter 30 -out Generate/prediction",
        shell=True,
    )
    # Formatted so the output is identical under Python 2 and Python 3.
    print("number of alternative user-movie requests= %d" % len(alternative_user_movie_list))
    print("number of gained active learning user-movie data= %d" % count)
    compute_error.computer_error()
def experiment1():
    """Run the three-step pipeline: classify, regress, retrain with active learning.

    Data preparation writes four libFM inputs from a fresh ``MovieDataBase``:
    training data without negatives (``train_step2``), training data with
    negatives (``train_step1``), the plain test slice (``test_step3``) and the
    test slice with negatives for all users (``test_step1`` / ``test_step2``).

    Step 1 runs libFM classification and turns the positions returned by
    ``compute_classification`` into (user, movie) requests. Step 2 runs a
    libFM regression. Step 3 appends every request that has a rating in the
    original test data to the training data, retrains with SGD and reports
    the error through ``compute_error.computer_error()``.
    """
    classification_command = (
        "./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -out Generate/prediction"
    )
    step2_regression_command = (
        "./Generate/libFM -task r -train Generate/train_step2.libfm -test Generate/test_step2.libfm "
        "-method sgd -learn_rate 0.001 -iter 70 -out Generate/prediction"
    )
    step3_regression_command = (
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 80' -learn_rate 0.001 -iter 160 -out Generate/prediction"
    )

    db = MovieDataBase()
    db.generate_complete_rating_data(regenerate=False)
    db.make_sorted_rating_data([5, 1, 0])
    # Copy of the sorted ratings; loaded back before the test slice is cut.
    full_sorted_ratings = db.out_rating_data[:]
    db.synchronize()

    # --- training slice: plain export, then export with negatives added ---
    db.make_slice(count_movie_slice=range(1, 1201))
    db.synchronize()
    original_train_rows = db.core_rating_data[:]
    db.store_data_to_file(fileName='train_original_data')
    db.generate_libfm_data(omitMovie=True)
    db.store_data_to_file(db.libfm_data, 'train_step2.libfm')

    db.add_negative_data()
    db.synchronize()
    # Snapshot only; not read again in this function.
    train_rows_with_negatives = db.core_rating_data[:]
    db.store_data_to_file(fileName='train_addNegative_data')
    db.generate_libfm_data(omitMovie=True)
    db.store_data_to_file(db.libfm_data, 'train_step1.libfm')

    # --- test slice: plain export, then export with negatives for all users ---
    db.load_core_rating_data(full_sorted_ratings)
    db.make_slice(count_movie_slice=range(1201, 1441))
    db.synchronize()
    original_test_rows = db.core_rating_data[:]
    db.store_data_to_file(fileName='test_original_data')
    db.generate_libfm_data(shuffle=False)
    db.store_data_to_file(db.libfm_data, fileName='test_step3.libfm')

    db.add_negative_data(addAllUsers=True)
    db.synchronize()
    # Snapshot only; step 1 below reads db.core_rating_data directly.
    test_rows_with_all_negatives = db.core_rating_data[:]
    db.store_data_to_file(fileName='test_addAllNegative_data')
    db.generate_libfm_data(omitMovie=True, shuffle=False)
    # The same libFM data is written under both step names.
    db.store_data_to_file(db.libfm_data, 'test_step1.libfm')
    db.store_data_to_file(db.libfm_data, 'test_step2.libfm')

    # --- step 1: classification (the test data is still in db.core_rating_data) ---
    subprocess.call(classification_command, shell=True)
    # An SGD variant ("-method sgd -learn_rate 0.01") is disabled in the original.
    predicted_positions = compute_classification.compute_classification(len(original_test_rows))
    requested_pairs = []
    for position in predicted_positions:
        row = db.core_rating_data[position]
        requested_pairs.append([row[0], row[1]])
    db.make_alternative_user_movie_matrix(requested_pairs)

    # --- step 2: regression ---
    # An MCMC variant ("-method mcmc -out Generate/prediction") is disabled
    # in the original.
    subprocess.call(step2_regression_command, shell=True)

    # --- step 3: add the active-learning results to the original training data ---
    db.make_user_movie_rating_matrix(original_test_rows)
    # This scheme adds every requested pair that actually has a rating.
    gained_rows = []
    for user_id, movie_id in requested_pairs:
        user_ratings = db.user_movie_rating_matrix.get(user_id)
        if user_ratings is None:
            continue
        known_rating = user_ratings.get(movie_id)
        if known_rating is None:
            continue
        gained_rows.append([user_id, movie_id, known_rating])

    augmented_train_rows = original_train_rows + gained_rows
    db.store_data_to_file(augmented_train_rows, fileName='train_add_active_learning_data')
    db.generate_libfm_data(augmented_train_rows)
    db.store_data_to_file(db.libfm_data, fileName='train_step3.libfm')

    # An MCMC variant ("-method mcmc -out Generate/prediction") is disabled
    # in the original.
    subprocess.call(step3_regression_command, shell=True)
    compute_error.computer_error()
def baseline2_random_after_classification(active_learning_ratio=0.1):
    """Baseline 2: randomly sample requests among rows the classifier marked.

    Step 1 runs libFM as a binary classifier. A random subset of the row
    positions returned by ``compute_classification.compute_classification``
    is taken as (user, movie) requests from
    ``dataProcess.test_addAllNegative_data``. Requests that have a rating in
    ``dataProcess.test_original_data`` are added to the training data, libFM
    is retrained with SGD and the result is scored by
    ``compute_error.computer_error()``.

    :param active_learning_ratio: fraction of ``TOTAL_USERS`` used to size the
        request budget, ``(int(TOTAL_USERS * ratio) - 1) * 240``.
    """
    print('\n===================================')
    print('baseline2 random choosing active learning data after classification started')
    print('===================================')

    # The test data is still in movieDataBase.core. Next is step 1.
    print('\n===================================')
    print('step 1 binary classification started')
    print('===================================')
    subprocess.call(
        "./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -iter 1 -out Generate/prediction",
        shell=True,
    )
    # An SGD variant ("-method sgd -learn_rate 0.01") is disabled in the original.
    selected_data_positions_tmp = compute_classification.compute_classification(len(dataProcess.test_original_data))

    data_base = dataProcess.movieDataBase
    # NOTE(review): 240 is hard-coded; presumably the number of test movies
    # (other variants use dataProcess.TEST_MOVIES) -- confirm.
    number_of_active_data = (int(data_base.TOTAL_USERS * active_learning_ratio) - 1) * 240

    # Sample without replacement from the distinct classified positions.
    # The original looped until it had collected `number_of_active_data`
    # distinct positions, which never terminated when the classifier returned
    # fewer distinct positions than the budget, and random.choice raised on
    # an empty result. The budget is therefore clamped to what is available.
    #
    # Note from the original author: drawing plain indices with
    # random.randint(0, len(selected_data_positions_tmp) - 1) instead of the
    # classified positions "is wrong but has good experiment behavior";
    # the reason was left as an open question.
    candidate_positions = list(set(selected_data_positions_tmp))
    sample_size = max(0, min(number_of_active_data, len(candidate_positions)))
    selected_data_positions = random.sample(candidate_positions, sample_size)

    alternative_user_movie_list = []
    for index in selected_data_positions:
        row = dataProcess.test_addAllNegative_data[index]
        alternative_user_movie_list.append([row[0], row[1]])
    # movieDataBase.make_alternative_user_movie_matrix(alternative_user_movie_list)

    # Add the active-learning results to train_original_data.
    print('\n===================================')
    print('step active learning regression started')
    print('===================================')
    data_base.make_user_movie_rating_matrix(dataProcess.test_original_data)
    rating_matrix = data_base.user_movie_rating_matrix

    # This scheme adds every requested pair that actually has a rating.
    active_learning_train_data = []
    for userId, movieId in alternative_user_movie_list:
        ratings = rating_matrix.get(userId)
        if ratings is None:
            continue
        rating = ratings.get(movieId)
        if rating is not None:
            active_learning_train_data.append([userId, movieId, rating])
    count = len(active_learning_train_data)

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    data_base.store_data_to_file(train_add_active_learning_data, fileName='train_add_active_learning_data')
    data_base.generate_libfm_data(train_add_active_learning_data)
    data_base.store_data_to_file(data_base.libfm_data, fileName='train_step3.libfm')

    # An MCMC variant ("-method mcmc -out Generate/prediction") is disabled
    # in the original.
    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 200' -learn_rate 0.001 -iter 120 -out Generate/prediction",
        shell=True,
    )
    # Formatted so the output is identical under Python 2 and Python 3.
    print('number of alternative user-movie requests= %d' % len(alternative_user_movie_list))
    print('number of gained active learning user-movie data= %d' % count)
    compute_error.computer_error()
def experiment2_user_qualification(active_learning_ratio=0.1):
    """Experiment 2: among classified rows, request those from the lowest-MSE users.

    Step 1 runs libFM as a binary classifier. The (user, movie) pairs at the
    positions returned by ``compute_classification.compute_classification``
    are sorted by ``dataProcess.movieDataBase.user_MSE[userId]`` (ascending)
    and the first ``number_of_active_data`` are kept as requests. Requests
    that have a rating in ``dataProcess.test_original_data`` are added to the
    training data, libFM is retrained with SGD and the result is scored by
    ``compute_error.computer_error()``.

    :param active_learning_ratio: fraction of ``TOTAL_USERS`` used to size the
        request budget, ``(int(TOTAL_USERS * ratio) - 1) * 240``.
    """
    print('\n===================================')
    print('experiment2 choosing qualified active learning data after classification started')
    print('===================================')

    # The test data is still in movieDataBase.core. Next is step 1.
    print('\n===================================')
    print('step 1 binary classification started')
    print('===================================')
    subprocess.call(
        "./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -iter 1 -out Generate/prediction",
        shell=True,
    )
    # An SGD variant ("-method sgd -learn_rate 0.01") is disabled in the original.
    selected_data_positions = compute_classification.compute_classification(len(dataProcess.test_original_data))

    print('\n===================================')
    print('choosing users with low rating MSE')
    print('===================================')
    data_base = dataProcess.movieDataBase
    # NOTE(review): 240 is hard-coded; presumably the number of test movies
    # (other variants use dataProcess.TEST_MOVIES) -- confirm.
    # Clamped at 0: a negative budget used as a slice bound would keep all
    # but the last rows instead of keeping none.
    number_of_active_data = max(0, (int(data_base.TOTAL_USERS * active_learning_ratio) - 1) * 240)

    alternative_user_movie_list = []
    for index in selected_data_positions:
        row = dataProcess.test_addAllNegative_data[index]
        alternative_user_movie_list.append([row[0], row[1]])

    # Stable sort by the requesting user's MSE, then keep the budgeted prefix.
    user_mse = data_base.user_MSE
    alternative_user_movie_list.sort(key=lambda pair: user_mse[pair[0]])
    alternative_user_movie_list = alternative_user_movie_list[:number_of_active_data]
    # movieDataBase.make_alternative_user_movie_matrix(alternative_user_movie_list)

    # Add the active-learning results to train_original_data.
    print('\n===================================')
    print('step active learning regression started')
    print('===================================')
    data_base.make_user_movie_rating_matrix(dataProcess.test_original_data)
    rating_matrix = data_base.user_movie_rating_matrix

    # This scheme adds every requested pair that actually has a rating.
    active_learning_train_data = []
    for userId, movieId in alternative_user_movie_list:
        ratings = rating_matrix.get(userId)
        if ratings is None:
            continue
        rating = ratings.get(movieId)
        if rating is not None:
            active_learning_train_data.append([userId, movieId, rating])
    count = len(active_learning_train_data)

    train_add_active_learning_data = dataProcess.train_original_data + active_learning_train_data
    data_base.store_data_to_file(train_add_active_learning_data, fileName='train_add_active_learning_data')
    data_base.generate_libfm_data(train_add_active_learning_data)
    data_base.store_data_to_file(data_base.libfm_data, fileName='train_step3.libfm')

    # An MCMC variant ("-method mcmc -out Generate/prediction") is disabled
    # in the original.
    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 200' -learn_rate 0.001 -iter 50 -out Generate/prediction",
        shell=True,
    )
    # Formatted so the output is identical under Python 2 and Python 3.
    print('number of alternative user-movie requests= %d' % len(alternative_user_movie_list))
    print('number of gained active learning user-movie data= %d' % count)
    compute_error.computer_error()
def baseline1_random(active_learning_ratio=0.1):
    """Baseline 1 (self-contained): random requests with a per-user invitation cap.

    Prepares all libFM train/test files from a fresh ``MovieDataBase``, then
    selects random rows of the negative-augmented test data as (user, movie)
    requests, never asking one user more than ``MAX_INVITATION`` times.
    Requests that have a rating in the original test data are added to the
    training data, libFM is retrained with SGD and the result is scored by
    ``compute_error.computer_error()``.

    ``MAX_INVITATION`` is read from module scope; it is not defined in this
    function.

    :param active_learning_ratio: fraction of the negative-augmented test rows
        to request.
    """
    movieDataBase = MovieDataBase()
    movieDataBase.generate_complete_rating_data(regenerate=False)
    movieDataBase.make_sorted_rating_data([5, 1, 0])
    # Copy of the sorted ratings; loaded back before the test slice is cut.
    sorted_rating_data = movieDataBase.out_rating_data[:]
    movieDataBase.synchronize()

    # --- training slice: plain export, then export with negatives added ---
    movieDataBase.make_slice(count_movie_slice=range(1, 1201))
    movieDataBase.synchronize()
    train_original_data = movieDataBase.core_rating_data[:]
    movieDataBase.store_data_to_file(fileName='train_original_data')
    movieDataBase.generate_libfm_data(omitMovie=True)
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, 'train_step2.libfm')
    movieDataBase.add_negative_data()
    movieDataBase.synchronize()
    movieDataBase.store_data_to_file(fileName='train_addNegative_data')
    movieDataBase.generate_libfm_data(omitMovie=True)
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, 'train_step1.libfm')

    # --- test slice: plain export, then export with negatives for all users ---
    movieDataBase.load_core_rating_data(sorted_rating_data)
    movieDataBase.make_slice(count_movie_slice=range(1201, 1441))
    movieDataBase.synchronize()
    test_original_data = movieDataBase.core_rating_data[:]
    movieDataBase.store_data_to_file(fileName='test_original_data')
    movieDataBase.generate_libfm_data(shuffle=False)
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, fileName='test_step3.libfm')
    movieDataBase.add_negative_data(addAllUsers=True)
    movieDataBase.synchronize()
    test_addAllNegative_data = movieDataBase.core_rating_data[:]
    movieDataBase.store_data_to_file(fileName='test_addAllNegative_data')
    movieDataBase.generate_libfm_data(omitMovie=True, shuffle=False)
    # The same libFM data is written under both step names.
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, 'test_step1.libfm')
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, 'test_step2.libfm')

    print('\n===================================')
    print('baseline1 with random choosing active learning data started')
    print('===================================')

    # Choose the active-learning requests randomly.
    number_of_index = len(test_addAllNegative_data)
    target_requests = number_of_index * active_learning_ratio
    print(target_requests)

    # Visit every row position once, in random order, and accept it while the
    # row's user is still under the invitation cap. The original drew indices
    # with replacement, which had two problems: a re-drawn, already-selected
    # index still incremented the user's counter (double counting), and the
    # loop could not terminate once all users had reached the cap before the
    # target was met.
    candidate_rows = movieDataBase.core_rating_data
    invited_user_movies = [0] * (movieDataBase.TOTAL_USERS + 1)
    shuffled_positions = list(range(number_of_index))
    random.shuffle(shuffled_positions)

    alternative_user_movie_list = []
    for new_index in shuffled_positions:
        if len(alternative_user_movie_list) >= target_requests:
            break
        userId = candidate_rows[new_index][0]
        if invited_user_movies[userId] < MAX_INVITATION:
            invited_user_movies[userId] += 1
            alternative_user_movie_list.append([userId, candidate_rows[new_index][1]])
    movieDataBase.make_alternative_user_movie_matrix(alternative_user_movie_list)

    # Add the active-learning results to train_original_data.
    print('\n===================================')
    print('active learning regression started')
    print('===================================')
    movieDataBase.make_user_movie_rating_matrix(test_original_data)
    rating_matrix = movieDataBase.user_movie_rating_matrix

    # This scheme adds every requested pair that actually has a rating.
    active_learning_train_data = []
    for userId, movieId in alternative_user_movie_list:
        ratings = rating_matrix.get(userId)
        if ratings is None:
            continue
        rating = ratings.get(movieId)
        if rating is not None:
            active_learning_train_data.append([userId, movieId, rating])
    count = len(active_learning_train_data)

    train_add_active_learning_data = train_original_data + active_learning_train_data
    movieDataBase.store_data_to_file(train_add_active_learning_data, fileName='train_add_active_learning_data')
    movieDataBase.generate_libfm_data(train_add_active_learning_data)
    movieDataBase.store_data_to_file(movieDataBase.libfm_data, fileName='train_step3.libfm')

    # An MCMC variant ("-method mcmc -out Generate/prediction") is disabled
    # in the original.
    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step3.libfm -test Generate/test_step3.libfm "
        "-method sgd -dim '1,1, 200' -learn_rate 0.001 -iter 10 -out Generate/prediction",
        shell=True,
    )
    # Formatted so the output is identical under Python 2 and Python 3.
    print('number of alternative user-movie requests= %d' % len(alternative_user_movie_list))
    print('number of gained active learning user-movie data= %d' % count)
    compute_error.computer_error()
def experiment3_user_similarity(active_learning_ratio=0.1):
    """Experiment 3 (first half): export classification and regression results to MATLAB.

    Step 1 runs libFM as a binary classifier and records the (user, movie)
    rows it marks in a boolean matrix of shape ``[TOTAL_USERS, TEST_MOVIES]``.
    Step 2 runs a libFM regression and reads the predictions back from
    ``Generate/prediction``. The training rating matrix, the predicted rating
    matrix and the boolean matrix are then saved to ``step2.mat``.

    The function stops there: the next stage is done in MATLAB.

    :param active_learning_ratio: currently unused; accepted for parity with
        the other experiment entry points.
    """
    print('\n===================================')
    print('experiment3 choosing special active learning users after classification started')
    print('===================================')

    # The test data is still in movieDataBase.core. Next is step 1.
    print('\n===================================')
    print('step 1 binary classification started')
    print('===================================')
    subprocess.call(
        "./Generate/libFM -task c -train Generate/train_step1.libfm -test Generate/test_step1.libfm "
        "-method mcmc -iter 1 -out Generate/prediction",
        shell=True,
    )
    # An SGD variant ("-method sgd -learn_rate 0.01") is disabled in the original.
    selected_data_positions = compute_classification.compute_classification(len(dataProcess.test_original_data))

    data_base = dataProcess.movieDataBase
    test_rows = dataProcess.test_addAllNegative_data

    # Zero-based matrix: row = userId - 1, column = test movie number - 1.
    positive_user_id_movie_id = numpy.zeros([data_base.TOTAL_USERS, dataProcess.TEST_MOVIES], dtype=numpy.bool_)
    for index in selected_data_positions:
        userId = test_rows[index][0]
        movieId = test_rows[index][1]
        movie_num = dataProcess.test_from_movie_id_get_movie_num[movieId]
        positive_user_id_movie_id[userId - 1][movie_num - 1] = True

    print('\n===================================')
    print('step2: regression started')
    print('===================================')
    subprocess.call(
        "./Generate/libFM -task r -train Generate/train_step2.libfm -test Generate/test_step2.libfm "
        "-method mcmc -iter 1 -out Generate/prediction",
        shell=True,
    )

    # One prediction per line, in the same order as the test rows. The
    # context manager closes the file even if parsing fails.
    with open('Generate/prediction') as prediction_file:
        step2_pred_rating = [float(line.strip()) for line in prediction_file]

    # Indexing (rather than zip) keeps the original failure mode: an
    # IndexError if libFM wrote fewer predictions than there are test rows.
    step2_pred_data = [[row[0], row[1], step2_pred_rating[i]] for i, row in enumerate(test_rows)]

    data_base.compute_numpy_matrix(step2_pred_data)
    pred_rating_numpy_matrix = data_base.numpy_rating_matrix

    dictionary = dict()
    dictionary['train_rating'] = dataProcess.train_rating_numpy_matrix
    dictionary['predict_rating'] = pred_rating_numpy_matrix
    dictionary['positive_rating'] = positive_user_id_movie_id
    scipy.io.savemat('step2.mat', dictionary)

    # Next we have to use MATLAB.
    # TODO: the similarity-based user selection and the final "add active
    # learning data, retrain with SGD, compute error" stage are not
    # implemented here. The original kept a draft of that stage after an
    # unconditional `return`; it was unreachable and referenced names never
    # defined in this function (movieDataBase, test_addAllNegative_data,
    # test_original_data, train_original_data), so it has been removed.
    return