def __init__(self, *args, **kwargs):
    super(TestPrediction, self).__init__(*args, **kwargs)
    self.dataset = MovielensDataset(
        ratings_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                          r'\ratings.csv',
        movies_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                         r'\movies.csv')
    self.optimized_dataset = DatasetOptimizer(self.dataset)
    self.pearson_similarity = OptimizedPearsonSimilarity(self.optimized_dataset, 3)
def test_get_movie_record(self):
    dataset = MovielensDataset(
        ratings_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                          r'\ratings.csv',
        movies_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                         r'\movies.csv')
    movie_operator = DatasetMovieOperator(DatasetOptimizer(dataset))
    # A non-existent id and a negative id should return empty records;
    # a known id (3) should not.
    self.assertTrue(movie_operator.get_movie_record(9999999999).empty)
    self.assertFalse(movie_operator.get_movie_record(3).empty)
    self.assertTrue(movie_operator.get_movie_record(-1).empty)
def __init__(self, *args, **kwargs):
    super(TestMutualInformation, self).__init__(*args, **kwargs)
    self.dataset = MovielensDataset(
        ratings_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                          r'\ratings.csv',
        movies_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                         r'\movies.csv')
    self.optimized_dataset = DatasetOptimizer(self.dataset)
class TestTimebinSimilarity(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super(TestTimebinSimilarity, self).__init__(*args, **kwargs)
        self.dataset = MovielensDataset(
            ratings_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                              r'\ratings.csv',
            movies_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                             r'\movies.csv')
        self.optimized_dataset = DatasetOptimizer(self.dataset)
        self.dataset_user_operator = DatasetUserOperator(
            self.optimized_dataset.get_ratings())

    def test_static_timebin_similarity(self):
        # print(self.optimized_dataset.get_ratings())
        ratings = self.optimized_dataset.get_ratings()
        # print(ratings.reset_index())
        static_timebin_similarity = TimebinSimilarity(self.optimized_dataset)
        # Timebin for user 448 covering positions 0..80 of the rating history.
        timebin = Timebin(self.dataset_user_operator, 448, 0, 80)
        print(static_timebin_similarity.get_timebin_neighbours(timebin, 3))
        # Placeholder assertion: this is a smoke test that only checks the
        # calls above complete without raising.
        self.assertEqual(False, False)
def test_movie_based_neighbourhood(self):
    dataset = MovielensDataset(
        ratings_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                          r'\ratings.csv',
        movies_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                         r'\movies.csv')
    pearson_similarity = OptimizedPearsonSimilarity(
        DatasetOptimizer(dataset), 3)
    knn = KNearestNeighbours(pearson_similarity, 20)
    self.assertTrue(
        len(knn.get_common_movie_based_k_nearest_neighbours(448, 3)) > 0)
def __init__(self, optimized_dataset: DatasetOptimizer, k=20,
             min_neighbour_timebin_size=5, max_neighbour_timebin_size=50,
             neighbour_timebin_size_increment=5, min_common_between_users=3):
    self.__timebin_similarity = TimebinSimilarity(
        optimized_dataset, min_neighbour_timebin_size,
        max_neighbour_timebin_size, neighbour_timebin_size_increment,
        min_common_between_users)
    super().__init__(self.__timebin_similarity, k)
    self.__optimized_dataset = optimized_dataset
    self.__dataset_user_operator = DatasetUserOperator(
        optimized_dataset.get_ratings())
    self.__k = k
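# Usage sketch for the constructor above. The excerpt omits the enclosing
# class statement, so TimebinKNearestNeighbours is a hypothetical placeholder
# name, and the dataset paths simply mirror the test fixtures in this file;
# treat this as an assumption-laden illustration rather than repository code.
dataset = MovielensDataset(
    ratings_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                      r'\ratings.csv',
    movies_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                     r'\movies.csv')
predictor = TimebinKNearestNeighbours(  # hypothetical name for the class defined above
    DatasetOptimizer(dataset),
    k=20,
    min_neighbour_timebin_size=5,
    max_neighbour_timebin_size=50,
    neighbour_timebin_size_increment=5,
    min_common_between_users=3)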
def test_clean_optimizer(self):
    dataset_optimizer = DatasetOptimizer(self.dataset)
    # DatasetOptimizer's double-underscore attributes are name-mangled, so
    # from outside the class they must be read as _DatasetOptimizer__<name>;
    # the stray second argument to assertTrue (a failure message) is dropped.
    self.assertTrue(dataset_optimizer._DatasetOptimizer__movies.empty)
    self.assertTrue(dataset_optimizer._DatasetOptimizer__ratings.empty)
    self.assertTrue(dataset_optimizer._DatasetOptimizer__movie_ratings.empty)
    dataset_optimizer.get_movies()
    dataset_optimizer.clean()
    self.assertTrue(dataset_optimizer._DatasetOptimizer__movies.empty)
    dataset_optimizer.get_ratings()
    dataset_optimizer.clean()
    self.assertTrue(dataset_optimizer._DatasetOptimizer__ratings.empty)
    dataset_optimizer.get_movie_ratings()
    dataset_optimizer.clean()
    self.assertTrue(dataset_optimizer._DatasetOptimizer__movie_ratings.empty)
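# A minimal, self-contained illustration of the name mangling relied on above
# (the Example class here is hypothetical and not part of the project): inside
# a class body, __hidden is rewritten to _Example__hidden, which is the only
# spelling visible from outside the class.
class Example:
    def __init__(self):
        self.__hidden = []


example = Example()
assert example._Example__hidden == []       # mangled name is reachable
assert not hasattr(example, '__hidden')     # unmangled name is not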
def assert_valid_ratings(self, optimized_dataset: DatasetOptimizer):
    ratings = optimized_dataset.get_ratings()
    self.assertGreater(len(ratings), 0)
def assert_valid_movies(self, optimized_dataset: DatasetOptimizer):
    self.assertGreater(len(optimized_dataset.get_movies()), 0)
def test_load_ratings(self):
    movielens_dataset = self.dataset
    self.assertRaises(InvalidDatasetOptimizerInput, DatasetOptimizer, None)
    self.assert_valid_ratings(DatasetOptimizer(movielens_dataset))
    self.assert_valid_movies(DatasetOptimizer(movielens_dataset))
class TestTimebinPrediction(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super(TestTimebinPrediction, self).__init__(*args, **kwargs)
        self.dataset = MovielensDataset(
            ratings_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                              r'\ratings.csv',
            movies_file_path=r'C:\Users\Yukawa\PycharmProjects\ProjectAlpha\data\movie_datasets\ml-latest-small'
                             r'\movies.csv')
        self.optimized_dataset = DatasetOptimizer(self.dataset)
        self.dataset_user_operator = DatasetUserOperator(
            self.optimized_dataset.get_ratings())

    def test_prediction(self):
        static_timebin_prediction = StaticTimebinPrediction(
            self.optimized_dataset)
        static_timebin_prediction.predict(448, 3)

    @staticmethod
    def __calculate_first_derivatives(data):
        # Forward differences of the RMSE values in (timebin_size, rmse) pairs.
        first_derivatives = [0] * len(data)
        for i in range(1, len(data)):
            first_derivatives[i] = data[i][1] - data[i - 1][1]
        return first_derivatives

    @staticmethod
    def __calculate_second_derivatives(first_derivatives):
        # Differences of the first derivatives (discrete second derivative).
        second_derivatives = [0] * len(first_derivatives)
        for i in range(1, len(first_derivatives)):
            second_derivatives[i] = first_derivatives[i] - first_derivatives[i - 1]
        return second_derivatives

    def test_dynamic_performance_using_derivatives(self):
        data = [(3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0),
                (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 2.25),
                (16, 0.25), (17, 0), (18, 0), (19, 0), (20, 0.25), (21, 0),
                (22, 0), (23, 0.25), (24, 0), (25, 0.25), (26, 0), (27, 0),
                (28, 0.25), (29, 0.25), (30, 0), (31, 1.0), (32, 2.25),
                (33, 0), (34, 0.001), (35, 0.25), (36, 0.25), (37, 2.25),
                (38, 0.25), (39, 0), (40, 2.25), (41, 0), (42, 2.25)]
        print(data)
        first_derivatives = [0] * len(data)
        for i in range(1, len(data)):
            first_derivatives[i] = data[i][1] - data[i - 1][1]
        print(first_derivatives)
        second_derivatives = [0] * len(data)
        for i in range(1, len(data)):
            second_derivatives[i] = first_derivatives[i] - first_derivatives[i - 1]
        print(second_derivatives)
        for i in range(len(data)):
            print(f"{data[i]} {first_derivatives[i]} {second_derivatives[i]}")

    def test_best_dynamic_size(self):
        data = [(3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0),
                (10, 0), (11, 0), (12, 0), (13, 0), (14, 0), (15, 2.25),
                (16, 0.25), (17, 0), (18, 0), (19, 0), (20, 0.25), (21, 0),
                (22, 0), (23, 0.25), (24, 0), (25, 0.25), (26, 0), (27, 0),
                (28, 0.25), (29, 0.25), (30, 0), (31, 1.0), (32, 2.25),
                (33, 0), (34, 0.001), (35, 0.25), (36, 0.25), (37, 2.25),
                (38, 0.25), (39, 0), (40, 2.25), (41, 0), (42, 2.25)]
        # Keep only (timebin_size, rmse) pairs whose RMSE is meaningfully
        # non-zero but still below 1.
        non_zero_rmse_pairs = list()
        for i in range(len(data)):
            if 0.1 < data[i][1] < 1:
                non_zero_rmse_pairs.append(data[i])
        print(non_zero_rmse_pairs)
        # Take the inverse-RMSE weighted average of the timebin sizes.
        dynamic_timebin_size = 43
        product_sum = 0
        weights_sum = 0
        for i in range(len(non_zero_rmse_pairs)):
            weight = 1 / non_zero_rmse_pairs[i][1]
            product_sum += non_zero_rmse_pairs[i][0] * weight
            weights_sum += weight
        dynamic_timebin_size = product_sum / weights_sum
        # Use the closest observed timebin size only when it is smaller than
        # the weighted average.
        closest_timebin_size = -1
        closest_diff = float('inf')
        for i in range(len(non_zero_rmse_pairs)):
            diff = abs(non_zero_rmse_pairs[i][0] - dynamic_timebin_size)
            if diff < closest_diff:
                closest_timebin_size = non_zero_rmse_pairs[i][0]
                closest_diff = diff
        dynamic_timebin_size = closest_timebin_size \
            if closest_timebin_size < dynamic_timebin_size else dynamic_timebin_size
        print(dynamic_timebin_size)

    def test_dynamic_timebin_size(self):
        user_id = 448
        movie_id = 3
        n_ratings = len(
            self.dataset_user_operator.get_user_rating_history(user_id))
        rmse_per_timebinsize = list()
        for timebin_size in range(3, 43, 1):
            static_timebin_prediction = StaticTimebinPrediction(
                self.optimized_dataset, global_timebin_size=timebin_size)
            predictions = list()
            for random_movie_id in self.dataset_user_operator.get_user_random_movie_list_from_history(
                    user_id, n_movies=10):
                actual = self.dataset_user_operator.get_user_rating_value(
                    user_id, random_movie_id)
                prediction = static_timebin_prediction.predict(
                    user_id, random_movie_id)
                if prediction != 0:
                    print(f"Movie_id: {random_movie_id} \tPrediction: {prediction} \tActual: {actual}")
                    predictions.append((prediction, actual))
            rmse = Accuracy.rmse(predictions)
            rmse_per_timebinsize.append((timebin_size, rmse))
        print(rmse_per_timebinsize)
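# Accuracy.rmse above is called with a list of (prediction, actual) pairs, but
# its implementation is not part of this excerpt. The helper below is only a
# sketch of the standard root-mean-square-error formula over such pairs; the
# name rmse_from_pairs is hypothetical, not a repository API.
import math


def rmse_from_pairs(pairs):
    """Root-mean-square error over (prediction, actual) tuples."""
    if not pairs:
        raise ValueError("RMSE is undefined for an empty list of pairs")
    squared_errors = [(prediction - actual) ** 2 for prediction, actual in pairs]
    return math.sqrt(sum(squared_errors) / len(squared_errors))


# Example: one exact prediction and one off by a full point give RMSE sqrt(0.5).
assert abs(rmse_from_pairs([(4.0, 4.0), (3.0, 4.0)]) - math.sqrt(0.5)) < 1e-9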