def test_dataframe_separation(self):
    """
    Check that SVDmodel splits the dataframe into train, test and valid
    dataframes with the right proportions.

    Two things are verified:
    * the three parts together contain exactly as many rows as the
      loaded dataframe;
    * the observed (train, test, valid) proportions, compared with
      (0.8, 0.1, 0.1) through the ``rmse`` helper, give an error
      below 0.1.
    """
    path = parent_path + '/ml-1m/ratings.dat'
    df = dfFunctions.load_dataframe(path)
    model = re.SVDmodel(df, 'User', 'item', 'Rating')
    total_rows = len(df)
    sum_of_sizes = len(model.train) + len(model.test) + len(model.valid)
    right_proportions = np.array([0.8, 0.1, 0.1])
    proportions = np.array([len(model.train) / total_rows,
                            len(model.test) / total_rows,
                            len(model.valid) / total_rows])
    error = rmse(proportions, right_proportions)
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(total_rows, sum_of_sizes)
    # The message used to start with the literal text "/n"; a newline
    # escape was intended.
    self.assertTrue(
        error < 0.1,
        '\nThe right proportions are (train,test,valid) = {0}, '
        'but the model is separating the dataframe with the '
        'proportions {1}'.format(right_proportions, proportions))
def test_upperbound(self):
    """
    Run 5000 training steps on the model built with the 'nsvd' and
    'mean' options and check that the value returned by
    ``valid_prediction`` (the error on the valid dataset) is at most 1.0.
    """
    path = parent_path + '/ml-1m/ratings.dat'
    df = dfFunctions.load_dataframe(path)
    model = re.SVDmodel(df, 'User', 'item', 'Rating', 'nsvd', 'mean')
    dimension = 12
    regularizer_constant = 0.05
    learning_rate = 0.001
    momentum_factor = 0.9
    batch_size = 1000
    num_steps = 5000
    # Blank line so the training output does not share a line with the
    # test runner's own output.
    print('\n')
    model.training(dimension,
                   regularizer_constant,
                   learning_rate,
                   momentum_factor,
                   batch_size,
                   num_steps)
    prediction = model.valid_prediction()
    # The message previously contained the literal text "/n" where
    # newline escapes were intended.
    self.assertTrue(
        prediction <= 1.0,
        '\nwith num_steps = {0}\n, the mean square error of the valid '
        'dataset should be less than 1 and not {1}'
        .format(num_steps, prediction))
def test_batch(self):
    """
    Draw 200 batches from BatchGenerator and verify that every batch
    holds three sequences of exactly ``batch_size`` entries, and that
    consecutive batches do not always start with the same observation.
    """
    ratings_file = parent_path + '/ml-1m/ratings.dat'
    frame = dfFunctions.load_dataframe(ratings_file)
    batch_size = 100
    num_of_tests = 200
    generator = dfFunctions.BatchGenerator(frame,
                                           batch_size,
                                           'User',
                                           'item',
                                           'Rating')
    previous_head = None
    repeated_heads = 0
    for _ in range(num_of_tests):
        batch = generator.get_batch()
        # The first entry of each of the three sequences identifies the
        # observation the batch starts with.
        head = (batch[0][0], batch[1][0], batch[2][0])
        repeated_heads += 1 if head == previous_head else 0
        previous_head = head
        for position in range(3):
            self.assertTrue(len(batch[position]) == batch_size)
    # Passes as long as at least one batch started differently from the
    # batch drawn just before it.
    self.assertTrue(repeated_heads < num_of_tests)
def test_load_dataframe(self):
    """
    Check that load_dataframe returns a pandas DataFrame for the
    MovieLens ml-1m ratings file.

    Only ml-1m is exercised. The ml-10m and ml-20m files were flagged
    "Should not be used" and were never loaded here, so their unused
    path variables and commented-out loads have been removed.
    """
    path1 = parent_path + '/ml-1m/ratings.dat'
    df1 = dfFunctions.load_dataframe(path1)
    # assertIsInstance replaces the exact type comparison and prints the
    # offending object on failure.
    self.assertIsInstance(df1, pd.DataFrame)
def test_rated_items(self):
    """
    Check that ItemFinder.dic holds, for every user in the dataframe, an
    array whose length equals ``finder.size`` and whose dtype is int32.

    On failure the message lists how many users break the rule and who
    they are.
    """
    path = parent_path + '/ml-1m/ratings.dat'
    df = dfFunctions.load_dataframe(path)
    finder = dfFunctions.ItemFinder(df, 'User', 'item', 'Rating', 'mean')
    all_users = df['User'].unique()
    problem_users = []
    for user in all_users:
        r_items = finder.dic[user]
        well_formed = (len(r_items) == finder.size
                       and r_items.dtype == 'int32')
        if not well_formed:
            problem_users.append(user)
    # The message used to report the number of *valid* users as the
    # number of bad arrays; report the number of offending users.
    self.assertEqual(
        len(problem_users), 0,
        '\nThere are {0} arrays in dic such that '
        'len(dic[User]) != finder.size or with wrong types. '
        'And these users are {1}'
        .format(len(problem_users), problem_users))
def test_dataframe_intersection(self):
    """
    Check that the train, test and valid dataframes produced by SVDmodel
    have no intersection between them.

    ``count_intersection`` is called with (train, test, valid) and its
    result is read under the keys '1-2', '1-3' and '2-3'; each of those
    counts must be zero.
    """
    path = parent_path + '/ml-1m/ratings.dat'
    df = dfFunctions.load_dataframe(path)
    model = re.SVDmodel(df, 'User', 'item', 'Rating')
    dic_intersection = dfFunctions.count_intersection(model.train,
                                                      model.test,
                                                      model.valid)
    # Keys follow the argument positions: 1 = train, 2 = test, 3 = valid.
    pairs = (('1-2', 'train', 'test'),
             ('1-3', 'train', 'valid'),
             ('2-3', 'test', 'valid'))
    for key, first, second in pairs:
        # The messages previously began with the literal text "/n"
        # instead of a newline escape.
        self.assertEqual(
            dic_intersection[key], 0,
            '\nThe intersection between the {0} and {1} dataframe '
            'is {2}'.format(first, second, dic_intersection[key]))
def test_prediction(self):
    """
    Run 5000 training steps and check that, over the first 10 rows of
    the valid dataframe, the mean predicted rating differs from the mean
    actual rating by at most 0.9.

    The model is built with the 'nsvd' and 'mean' options.
    """
    path = parent_path + '/ml-1m/ratings.dat'
    df = dfFunctions.load_dataframe(path)
    model = re.SVDmodel(df, 'User', 'item', 'Rating', 'nsvd', 'mean')
    dimension = 12
    regularizer_constant = 0.05
    learning_rate = 0.001
    momentum_factor = 0.9
    batch_size = 1000
    num_steps = 5000
    # This used to print the literal text "/n"; a blank line was intended.
    print('\n')
    model.training(dimension,
                   regularizer_constant,
                   learning_rate,
                   momentum_factor,
                   batch_size,
                   num_steps)
    user_example = np.array(model.valid['User'])[0:10]
    movies_example = np.array(model.valid['item'])[0:10]
    actual_ratings = np.mean(np.array(model.valid['Rating'])[0:10])
    predicted_ratings = np.mean(model.prediction(user_example,
                                                 movies_example))
    difference = np.absolute(actual_ratings - predicted_ratings)
    # The message previously contained the literal text "/n" where
    # newline escapes were intended.
    self.assertTrue(
        difference <= 0.9,
        '\nwith num_steps = {0}\n, the difference should be less than '
        '0.9 and not {1}'.format(num_steps, difference))
def initialize():
    """
    Parse the command-line options, train an SVDmodel on the selected
    ratings file and return the trained model.

    Options (read from ``sys.argv`` through argparse):
        -p/--path       ratings file to load.
        -e/--example    MovieLens dataset: '1', '10' or '20'. '10' and '20'
                        select the matching file under ``parent_path``,
                        unless an explicit --path was given.
        -b/--batch      batch size.
        -s/--steps      number of training steps.
        -d/--dimension  embedding vector size.
        -r/--reg        regularizer constant for the loss function.
        -l/--learning   learning rate.
        -m/--momentum   momentum factor.
        -i/--info       'True' to show training information; anything
                        else disables it.
        -M/--model      'svd' or 'nsvd'.
        -S/--nsvd_size  'max', 'mean' or 'min' (passed on only when the
                        model is not 'svd').

    Returns:
        The trained ``re.SVDmodel`` instance.
    """
    filename = os.path.join(
        root_dir,
        'machine_learning/prediction/new_algorithm/ml-1m/ratings.dat')
    default_path = os.path.abspath(os.path.realpath(filename))

    parser = argparse.ArgumentParser()
    # %(default)s lets argparse print the real default instead of a
    # hand-written one that can drift from the code.
    parser.add_argument('-p', '--path', type=str, default=default_path,
                        help=('ratings path\n'
                              '(default=%(default)s)'))
    parser.add_argument('-e', '--example', type=str, default='1',
                        help=('movielens dataset\n'
                              'examples (only 1, 10 or 20) (default=1)'))
    parser.add_argument('-b', '--batch', type=int, default=700,
                        help='batch size (default=700)')
    # The help text used to advertise default=7000 while the actual
    # default is 2000.
    parser.add_argument('-s', '--steps', type=int, default=2000,
                        help='number of training steps (default=2000)')
    parser.add_argument('-d', '--dimension', type=int, default=12,
                        help='embedding vector size (default=12)')
    parser.add_argument('-r', '--reg', type=float, default=0.0003,
                        help=('regularizer constant for\n'
                              'the loss function (default=0.0003)'))
    parser.add_argument('-l', '--learning', type=float, default=0.001,
                        help='learning rate (default=0.001)')
    parser.add_argument('-m', '--momentum', type=float, default=0.926,
                        help='momentum factor (default=0.926)')
    parser.add_argument('-i', '--info', type=str, default='True',
                        help=('Training information.\n'
                              'Only True or False (default=True)'))
    parser.add_argument('-M', '--model', type=str, default='svd',
                        help='models: either svd or nsvd (default=svd)')
    parser.add_argument('-S', '--nsvd_size', type=str, default='mean',
                        help=('size of the vectors of the nsvd model:\n'
                              'either max, mean or min (default=mean)'))
    args = parser.parse_args()

    # The example option used to assign a local path that was never read,
    # so -e 10 / -e 20 silently loaded the default file. It now selects
    # the dataset, while an explicitly supplied --path still wins.
    example_files = {'10': '/movielens/ml-10m/ratings.dat',
                     '20': '/movielens/ml-20m/ratings.csv'}
    data_path = args.path
    example_file = example_files.get(args.example)
    if example_file is not None:
        if args.path == default_path:
            data_path = parent_path + example_file
    elif args.example != '1':
        print('Wrong parameter passed to the example option.\n'
              'Running default=1\n')

    df = dfFunctions.load_dataframe(data_path)
    if args.model == 'svd':
        model = re.SVDmodel(df, 'User', 'item', 'Rating')
    else:
        model = re.SVDmodel(df,
                            'User',
                            'item',
                            'Rating',
                            args.model,
                            args.nsvd_size)

    # Only the exact string 'True' enables the training information.
    info = args.info == 'True'

    model.training(args.dimension,
                   args.reg,
                   args.learning,
                   args.momentum,
                   args.batch,
                   args.steps,
                   info)
    prediction = model.valid_prediction()
    print('\nThe mean square error of the whole valid dataset is ',
          prediction)
    return model