def compute_ranking_matrix(self, X, Y):
    """
    This function computes the ranking matrix, with no changes made to the data.

    Args:
        - X: pandas DataFrame, with "user" and "item" columns
        - Y: pandas Series, with the corresponding ratings
    """
    # Useful variables
    unique_items = utils.unique_movies(X)
    unique_users = utils.unique_users(X)
    size = utils.total_ratings(X)

    # Set the new shape for the kNN model
    self.unique_items = unique_items
    self.unique_users = unique_users

    # Create the ranking matrix
    M_bool = np.full((self.unique_users, self.unique_items), False)
    M_values = np.full((self.unique_users, self.unique_items), 0.0)
    for index in range(size):
        # -1 because hashing created users and items numbered from 1
        user = utils.get_user_at(index, X) - 1
        item = utils.get_item_at(index, X) - 1
        rating = Y[index]
        M_bool[user][item] = True
        M_values[user][item] = float(rating)

    # For item-based models, work on the transposed matrices
    self.matrix_bool = M_bool if self.user_based else np.transpose(M_bool)
    self.matrix_values = M_values if self.user_based else np.transpose(M_values)
    self.size = size
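For large datasets the Python loop above can be slow. Below is a minimal sketch of an equivalent vectorized construction with NumPy fancy indexing; `build_ranking_matrix` is a hypothetical helper, assuming X exposes 1-indexed integer "user" and "item" columns (as in the hashed data) and Y is an array-like of ratings:

import numpy as np

def build_ranking_matrix(X, Y, n_users, n_items):
    # Shift 1-indexed ids down to 0-indexed row/column positions
    users = X["user"].to_numpy() - 1
    items = X["item"].to_numpy() - 1
    M_bool = np.full((n_users, n_items), False)
    M_values = np.zeros((n_users, n_items))
    # Fancy indexing fills every observed cell in one shot
    M_bool[users, items] = True
    M_values[users, items] = np.asarray(Y, dtype=float)
    return M_bool, M_values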
def score(self, X, y):
    """
    This function returns the score obtained at the end of training.

    Args:
        - self: MF object, should have called the fit method earlier
        - X: pandas DataFrame, with pairs of item-user with ratings
        - y: pandas Series, contains the ratings associated with the pairs in X
    Output:
        - score: float, the score on the training data (-RMSE)
    """
    size = utils.total_ratings(X)
    prediction = self.predict(X)
    err = np.sum((y - prediction)**2)
    score = -np.sqrt(err / size)
    return score
def score(self, X, y):
    """
    This function returns the score obtained at the end of training.

    Args:
        - self: MF object, should have called the fit method earlier
        - X: pandas DataFrame, with pairs of item-user with ratings
        - y: pandas Series, contains the ratings associated with the pairs in X
    Output:
        - score: float, the score on the training data (-RMSE)
    """
    size = utils.total_ratings(X)
    prediction = self.predict(X)
    err = 0
    real_size = 0
    for i in range(size):
        if prediction[i] == prediction[i]:  # NaN != NaN, so this skips NaN predictions
            err += (prediction[i] - y[i])**2
            real_size += 1
    score = -np.sqrt(err / real_size)
    return score
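The elementwise check above works because NaN is the only value not equal to itself, but the whole loop can be replaced by a vectorized mask. A minimal sketch, assuming prediction and y are NumPy float arrays of equal length:

import numpy as np

def masked_neg_rmse(prediction, y):
    # Keep only the positions where a prediction was actually produced
    valid = ~np.isnan(prediction)
    err = np.sum((prediction[valid] - y[valid])**2)
    return -np.sqrt(err / np.count_nonzero(valid))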
def fit(self, X, y):
    """
    This method fits the algorithm to the data.

    Args:
        - self: MF object, to be fitted
        - X: pandas DataFrame, with hashed available pairs of item-user
        - y: pandas Series, with the ratings corresponding to the pairs of
          item-user in X
    """
    # Useful variables
    nb_users = self.unique_users
    nb_items = self.unique_items
    size = utils.total_ratings(X)

    # Initialize the latent factor matrices
    U = np.random.rand(nb_users, self.n_factors)
    V = np.random.rand(nb_items, self.n_factors)

    # Initialize the bias vectors
    Bu = np.zeros(nb_users)
    Bi = np.zeros(nb_items)

    # Initialize errors
    err = np.zeros(size)

    # Aliases
    lr = self.learning_rate
    lrb = self.learning_rate_bias
    l = self.lambd

    # Compute the global mean
    global_mean = np.mean(y)

    # Initialize the visited users and items
    known_u = np.full(nb_users, False, dtype=bool)
    known_i = np.full(nb_items, False, dtype=bool)

    # Run SGD to update the factors
    for epoch in range(self.n_epochs):
        if self.verbose and (epoch == 0 or (epoch + 1) % 10 == 0):
            print("Starting epoch {}".format(epoch + 1))
        # Randomly shuffle the training data
        random_indices = np.random.permutation(size)
        for index in random_indices:
            # -1 because hashing created users and items with integer
            # values starting at 1
            user = utils.get_user_at(index, X) - 1
            item = utils.get_item_at(index, X) - 1
            known_u[user] = True
            known_i[item] = True
            # After experimenting with several ways to compute the dot
            # product, we found the following to be the fastest
            dot_prod = np.dot(V[item, ], U[user, ])
            biases = Bu[user] + Bi[item]
            err[index] = y[index] - (global_mean + biases + dot_prod)
            # Update the biases
            Bu[user] += lrb * err[index]
            Bi[item] += lrb * err[index]
            # Simultaneous update of the factors
            old_u = np.copy(U[user, ])
            for f in range(self.n_factors):
                U[user, f] += lr * (err[index] * V[item, f] - l * U[user, f])
                V[item, f] += lr * (err[index] * old_u[f] - l * V[item, f])

    # Store the results
    self.latent_users_ = U
    self.latent_items_ = V
    self.bias_user_ = Bu
    self.bias_item_ = Bi
    # Store the training mean
    self.global_mean_ = global_mean
    # Store the visited items and users
    self.known_users_ = known_u
    self.known_items_ = known_i
    # Compute the score: -RMSE
    self.score_ = -np.sqrt(np.sum(err**2) / size)
    return self
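For reference, the inner loop implements the standard biased matrix-factorization model: it predicts a rating as global_mean + Bu[user] + Bi[item] + dot(V[item], U[user]) and takes one regularized gradient step per observed rating. A minimal self-contained sketch of a single such update with made-up toy values (the names mirror the aliases in fit; this is illustrative, not the training loop itself):

import numpy as np

rng = np.random.default_rng(0)
n_factors, lr, lrb, l = 4, 0.01, 0.01, 0.1
global_mean, rating = 3.5, 5.0

u_f = rng.random(n_factors)   # one row of U (this user's factors)
v_f = rng.random(n_factors)   # one row of V (this item's factors)
bu, bi = 0.0, 0.0             # biases for this user and item

# Prediction error for a single observed rating
e = rating - (global_mean + bu + bi + np.dot(v_f, u_f))

# Bias updates, then a simultaneous factor update (hence the copy of u_f)
bu += lrb * e
bi += lrb * e
old_u = u_f.copy()
u_f += lr * (e * v_f - l * u_f)
v_f += lr * (e * old_u - l * v_f)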
# initialize useful variables
full_data = False
small_data = False
random_state = 0

# read in the data
if small_data:
    data = preprocessing.load_data("ratings_small.csv")
elif full_data:
    data = preprocessing.load_data("ratings.csv")
else:
    data = preprocessing.sample_data("ratings.csv", random_state=random_state)

# compute the total number of ratings
tot_ratings = utils.total_ratings(data)
# compute the number of users
tot_users = utils.unique_users(data)
# compute the number of items
tot_items = utils.unique_movies(data)

# hash the data
hashed_data = utils.hash_data(data)

# split into train and test sets
if full_data:
    train, test = preprocessing.custom_full_train_test_split(
        hashed_data, random_state=random_state)
elif small_data:
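utils.hash_data is used here without its definition; judging from the "-1 because hashing created users and items with integer values starting at 1" comment in fit, it remaps raw user and item ids to consecutive integers starting at 1. A minimal sketch of that remapping with pandas.factorize; hash_data_sketch and the "user"/"item" column names are assumptions, not the project's actual implementation:

import pandas as pd

def hash_data_sketch(data):
    # factorize returns 0-based codes; +1 matches the 1-based ids
    # that fit() later shifts back down with -1
    out = data.copy()
    out["user"] = pd.factorize(out["user"])[0] + 1
    out["item"] = pd.factorize(out["item"])[0] + 1
    return out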
def fit(self, X, y):
    """
    This method fits the algorithm to the data.

    Args:
        - self: MF object, to be fitted
        - X: pandas DataFrame, with hashed available pairs of item-user.
          The hashing is necessary to make sure items and users are
          numbered from 0 to unique_users/unique_items, for practical
          purposes
        - y: pandas Series, with the ratings corresponding to the pairs
          of item-user in X
    """
    # useful variables
    nb_users = self.unique_users
    nb_items = self.unique_items
    size = utils.total_ratings(X)

    # initialize the latent factor matrices
    U = np.random.rand(nb_users, self.n_factors)
    V = np.random.rand(nb_items, self.n_factors)

    # initialize the bias vectors
    Bu = np.zeros(nb_users)
    Bi = np.zeros(nb_items)

    # initialize errors
    err = np.zeros(size)

    # aliases
    lr = self.learning_rate
    lrb = self.learning_rate_bias
    l = self.lambd

    # compute the global mean
    global_mean = np.mean(y)

    # initialize the visited users and items
    known_u = np.full(nb_users, False, dtype=bool)
    known_i = np.full(nb_items, False, dtype=bool)

    # run SGD to update the factors
    for epoch in range(self.n_epochs):
        if self.verbose and (epoch == 0 or (epoch + 1) % 10 == 0):
            print("Starting epoch {}".format(epoch + 1))
        # randomly shuffle the training data
        random_indices = np.random.permutation(size)
        for index in random_indices:
            # retrieve the user and item at the given index
            user = int(utils.get_user_at(index, X))
            item = int(utils.get_item_at(index, X))
            # mark the user and item as visited
            known_u[user] = True
            known_i[item] = True
            # compute the factor dot product
            dot_prod = np.dot(V[item, ], U[user, ])
            # compute the bias contribution to the rating
            biases = Bu[user] + Bi[item]
            # compute the error for this rating
            err[index] = y[index] - (global_mean + biases + dot_prod)
            # update the biases
            Bu[user] += lrb * err[index]
            Bi[item] += lrb * err[index]
            # simultaneous update of the factors
            old_u = np.copy(U[user, ])
            for f in range(self.n_factors):
                U[user, f] += lr * (err[index] * V[item, f] - l * U[user, f])
                V[item, f] += lr * (err[index] * old_u[f] - l * V[item, f])

    # store the learned results
    self.latent_users_ = U
    self.latent_items_ = V
    self.bias_user_ = Bu
    self.bias_item_ = Bi
    # store the training mean
    self.global_mean_ = global_mean
    # store visited items and users
    self.known_users_ = known_u
    self.known_items_ = known_i
    return self
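The stored known_users_/known_items_ masks suggest how a companion predict method could use the fitted attributes: score a known pair with the learned factors and biases, and return NaN for users or items never seen in training, which the NaN-aware score variant above would then skip. A minimal sketch of that per-pair prediction; predict_one is hypothetical (not the project's actual predict) and assumes the same 0-based indices this version of fit uses:

import numpy as np

def predict_one(model, user, item):
    # Unseen users/items have no trained factors; return NaN so that
    # a NaN-aware scorer can exclude them (cf. the score variant above)
    if not (model.known_users_[user] and model.known_items_[item]):
        return np.nan
    return (model.global_mean_
            + model.bias_user_[user]
            + model.bias_item_[item]
            + np.dot(model.latent_items_[item], model.latent_users_[user]))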