Example #1
    def compute_ranking_matrix(self, X, Y):
        """
        This function compute the ranking matrix with no changes made.
        Args:
            - X: pandas dataframe, created as "user", "item"
            - Y:pandas dataframe, with the corresponding "ratings"

        """

        # Useful variables
        unique_items = utils.unique_movies(X)
        unique_users = utils.unique_users(X)
        size = utils.total_ratings(X)

        # Setting new shape for knn model
        self.unique_items = unique_items
        self.unique_users = unique_users

        # Creating the ranking matrix
        M_bool = np.full((self.unique_users, self.unique_items), False)
        M_values = np.full((self.unique_users, self.unique_items), 0.0)

        for index in range(size):
            # -1 because hashing numbers users and items starting at 1
            user = utils.get_user_at(index, X) - 1
            item = utils.get_item_at(index, X) - 1
            rating = Y[index]
            M_bool[user, item] = True
            M_values[user, item] = float(rating)

        self.matrix_bool = (M_bool if self.user_based
                            else np.transpose(M_bool))
        self.matrix_values = (M_values if self.user_based
                              else np.transpose(M_values))
        self.size = size
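
For context, here is a minimal standalone sketch of the same construction with the utils helpers replaced by plain NumPy indexing; the toy users/items/ratings arrays and the 1-indexed IDs are assumptions for illustration, not part of the original code.

import numpy as np

# toy data: three ratings over 2 users and 3 items, IDs starting at 1
users = np.array([1, 1, 2])
items = np.array([1, 3, 2])
ratings = np.array([4.0, 3.5, 5.0])

n_users, n_items = users.max(), items.max()

# same structure as compute_ranking_matrix: a boolean mask of observed
# entries plus a dense matrix holding the rating values
M_bool = np.full((n_users, n_items), False)
M_values = np.full((n_users, n_items), 0.0)
M_bool[users - 1, items - 1] = True
M_values[users - 1, items - 1] = ratings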
Example #2
 def score(self, X, y):
     """
     This function returns the score of the fitted model on the given data.
     Args:
         - self: MF object; the fit method must have been called beforehand
         - X: pandas DataFrame, with the rated user-item pairs
         - y: pandas Series, with the ratings associated with the pairs in X
     Output:
         - score: float, the negative RMSE on the given data
     """
     size = utils.total_ratings(X)
     prediction = self.predict(X)
     err = np.sum((y - prediction)**2)
     score = -np.sqrt(err / size)
     return score
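
The returned score is the negative root-mean-squared error over the $n$ rated pairs, presumably negated so that higher is better, as scikit-learn-style model selection expects:

$$ \mathrm{score} = -\sqrt{\frac{1}{n} \sum_{i=1}^{n} \left(y_i - \hat{y}_i\right)^2}, \qquad \hat{y}_i = \texttt{self.predict(X)}_i $$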
Example #3
 def score(self, X, y):
     """
     This function returns the score of the fitted model on the given data,
     skipping pairs for which the prediction is NaN.
     Args:
         - self: MF object; the fit method must have been called beforehand
         - X: pandas DataFrame, with the rated user-item pairs
         - y: pandas Series, with the ratings associated with the pairs in X
     Output:
         - score: float, the negative RMSE on the predicted pairs
     """
     size = utils.total_ratings(X)
     prediction = self.predict(X)
     err = 0
     real_size = 0
     for i in range(size):
         # NaN != NaN, so this comparison filters out NaN predictions
         if prediction[i] == prediction[i]:
             err += (prediction[i] - y[i])**2
             real_size += 1
     score = -np.sqrt(err / real_size)
     return score
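
The elementwise prediction[i] == prediction[i] test can also be written with NumPy masking; a minimal vectorized sketch of the same NaN-skipping score, where the toy arrays are assumptions for illustration:

import numpy as np

y = np.array([4.0, 3.0, 5.0])
prediction = np.array([3.8, np.nan, 4.5])  # NaN, e.g. for an unseen pair

# keep only the entries where the model produced a prediction
mask = ~np.isnan(prediction)
score = -np.sqrt(np.mean((y[mask] - prediction[mask]) ** 2))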
예제 #4
0
    def fit(self, X, y):
        """
        This method fits the algorithm to the data.
        Args:
            - self: MF object, to be fitted
            - X: pandas DataFrame, with the hashed available user-item pairs
            - y: pandas Series, with the ratings corresponding to the
            user-item pairs in X
        """

        # Useful variables
        nb_users = self.unique_users
        nb_items = self.unique_items
        size = utils.total_ratings(X)

        # Initialize the latent factor matrices
        U = np.random.rand(nb_users, self.n_factors)
        V = np.random.rand(nb_items, self.n_factors)

        # initialize the bias vectors
        Bu = np.zeros(nb_users)
        Bi = np.zeros(nb_items)

        # Initialize errors
        err = np.zeros(size)

        # Aliases
        lr = self.learning_rate
        lrb = self.learning_rate_bias
        lam = self.lambd  # regularization strength

        # Compute global mean
        global_mean = np.mean(y)

        # Initialize the visited users and items
        known_u = np.full(nb_users, False, dtype=bool)
        known_i = np.full(nb_items, False, dtype=bool)

        # Run SGD to update the factors
        for epoch in range(self.n_epochs):

            if self.verbose and (epoch == 0 or (epoch + 1) % 10 == 0):
                print("Starting epoch {}".format(epoch + 1))

            # randomly shuffling training data
            random_indices = np.random.permutation(size)

            for index in random_indices:

                # -1 because hashing created users and items with integer
                # values starting at 1
                user = utils.get_user_at(index, X) - 1
                item = utils.get_item_at(index, X) - 1

                known_u[user] = True
                known_i[item] = True

                # after experimenting with several ways to compute the dot
                # product, we found the following to be the fastest
                dot_prod = np.dot(V[item], U[user])

                biases = Bu[user] + Bi[item]

                err[index] = y[index] - (global_mean + biases + dot_prod)

                # Update the biases
                Bu[user] += lrb * err[index]
                Bi[item] += lrb * err[index]

                # simultaneous update
                old_u = np.copy(U[user])
                for f in range(self.n_factors):
                    U[user, f] += lr * (err[index] * V[item, f] - lam * U[user, f])
                    V[item, f] += lr * (err[index] * old_u[f] - lam * V[item, f])

        # store the results
        self.latent_users_ = U
        self.latent_items_ = V
        self.bias_user_ = Bu
        self.bias_item_ = Bi

        # store the training mean
        self.global_mean_ = global_mean

        # store visited items and users
        self.known_users_ = known_u
        self.known_items_ = known_i

        # compute the score: -RMSE
        self.score_ = -np.sqrt(np.sum(err**2) / size)

        return self
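
In equations, each SGD step above implements the standard biased matrix-factorization update. With $\mu$ the global mean, $\gamma$ = learning_rate, $\gamma_b$ = learning_rate_bias and $\lambda$ = lambd, a sampled rating $r_{ui}$ yields

$$ e_{ui} = r_{ui} - \left(\mu + b_u + b_i + \mathbf{u}_u \cdot \mathbf{v}_i\right) $$
$$ b_u \leftarrow b_u + \gamma_b e_{ui}, \qquad b_i \leftarrow b_i + \gamma_b e_{ui} $$
$$ \mathbf{u}_u \leftarrow \mathbf{u}_u + \gamma \left(e_{ui} \mathbf{v}_i - \lambda \mathbf{u}_u\right), \qquad \mathbf{v}_i \leftarrow \mathbf{v}_i + \gamma \left(e_{ui} \mathbf{u}_u^{\mathrm{old}} - \lambda \mathbf{v}_i\right) $$

where $\mathbf{u}_u^{\mathrm{old}}$ is the copy taken before the update, so both factor vectors are updated simultaneously.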
Example #5
# initialize useful variables
full_data = False
small_data = False

random_state = 0

# read in the data
if small_data:
    data = preprocessing.load_data("ratings_small.csv")
elif full_data:
    data = preprocessing.load_data("ratings.csv")
else:
    data = preprocessing.sample_data("ratings.csv", random_state=random_state)

# compute the total number of ratings
tot_ratings = utils.total_ratings(data)

# compute the number of users
tot_users = utils.unique_users(data)

# compute the number of items
tot_items = utils.unique_movies(data)

# hash the data
hashed_data = utils.hash_data(data)

# split into train and test sets
if full_data:
    train, test = preprocessing.custom_full_train_test_split(
        hashed_data, random_state=random_state)
elif small_data:
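
The snippet is truncated here in the source, so the small_data branch (and any fallback) is missing. As a rough sketch of what such a split typically does, using sklearn.model_selection.train_test_split as a stand-in for the project's custom helpers (an assumption, not the original code):

from sklearn.model_selection import train_test_split

# stand-in for the custom split helpers: hold out 20% of the hashed
# (user, item, rating) rows, reproducibly
train, test = train_test_split(hashed_data, test_size=0.2,
                               random_state=random_state)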
Example #6
    def fit(self, X, y):
        """
        This method fits the algorithm to the data.
        Args:
            - self: MF object, to be fitted
            - X: pandas DataFrame, with the hashed available user-item
            pairs. The hashing ensures users and items are numbered
            contiguously from 0, which is needed for array indexing
            - y: pandas Series, with the ratings corresponding to the
            user-item pairs in X
        """

        # useful variables
        nb_users = self.unique_users
        nb_items = self.unique_items
        size = utils.total_ratings(X)

        # initialize the latent factor matrices
        U = np.random.rand(nb_users, self.n_factors)
        V = np.random.rand(nb_items, self.n_factors)

        # initialize the bias vectors
        Bu = np.zeros(nb_users)
        Bi = np.zeros(nb_items)

        # initialize errors
        err = np.zeros(size)

        # aliases
        lr = self.learning_rate
        lrb = self.learning_rate_bias
        lam = self.lambd  # regularization strength

        # compute global mean
        global_mean = np.mean(y)

        # initialize the visited users and items
        known_u = np.full(nb_users, False, dtype=bool)
        known_i = np.full(nb_items, False, dtype=bool)

        # run SGD to update the factors
        for epoch in range(self.n_epochs):

            if self.verbose and (epoch == 0 or (epoch + 1) % 10 == 0):
                print("Starting epoch {}".format(epoch + 1))

            # randomly shuffling training data
            random_indices = np.random.permutation(size)

            for index in random_indices:

                # retrieve user and item at the given index
                user = int(utils.get_user_at(index, X))
                item = int(utils.get_item_at(index, X))

                # mark user and items as visited
                known_u[user] = True
                known_i[item] = True

                # compute factor dot products
                dot_prod = np.dot(V[item], U[user])

                # compute bias contribution to rating
                biases = Bu[user] + Bi[item]

                # compute error for this rating
                err[index] = y[index] - (global_mean + biases + dot_prod)

                # update the biases
                Bu[user] += lrb * err[index]
                Bi[item] += lrb * err[index]

                # simultaneous update of the factors
                old_u = np.copy(U[user])
                for f in range(self.n_factors):
                    U[user, f] += lr * (err[index] * V[item, f] - lam * U[user, f])
                    V[item, f] += lr * (err[index] * old_u[f] - lam * V[item, f])

        # store the learned results
        self.latent_users_ = U
        self.latent_items_ = V
        self.bias_user_ = Bu
        self.bias_item_ = Bi

        # store the training mean
        self.global_mean_ = global_mean

        # store visited items and users
        self.known_users_ = known_u
        self.known_items_ = known_i

        return self
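
fit stores everything needed at prediction time; below is a minimal sketch of the corresponding prediction rule, falling back to the global mean when a user or item was never visited during training. The method name predict_one and the exact fallback policy are assumptions suggested by the stored known_users_/known_items_ flags, not code from the original.

    def predict_one(self, user, item):
        # hypothetical helper: predict one hashed (user, item) pair
        if self.known_users_[user] and self.known_items_[item]:
            return (self.global_mean_ + self.bias_user_[user] +
                    self.bias_item_[item] +
                    np.dot(self.latent_users_[user], self.latent_items_[item]))
        # unknown user or item: fall back to the training mean
        return self.global_mean_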