class TagMovieRatingTensor(object):
    """
    Build a (tag, movie, rating) tensor from the tag data set and keep its
    CP decomposition together with the labels of every tensor axis.

    Axis 0 is indexed by distinct tag ids, axis 1 by distinct movie ids
    (both in order of first appearance in the tag data) and axis 2 by the
    rating values ``0 .. max_ratings``.
    """

    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = self.conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.max_ratings = 5
        self.ordered_ratings = [0, 1, 2, 3, 4, 5]
        self.ordered_movie_names = []
        self.ordered_tag_names = []
        # Section headers, in the same order as the tensor axes.
        self.print_list = [
            "\n\nFor Tags:", "\n\nFor Movies:", "\n\nFor Ratings:"
        ]
        self.util = Util()
        self.tensor = self.fetchTagMovieRatingTensor()
        self.factors = self.util.CPDecomposition(self.tensor, 5)

    @staticmethod
    def _index_first_seen(values):
        """
        Number the distinct entries of ``values`` by first appearance
        :param values: iterable of hashable ids
        :return: (dict id -> position, list of distinct ids in that order)
        """
        positions = {}
        distinct = []
        for value in values:
            if value not in positions:
                positions[value] = len(distinct)
                distinct.append(value)
        return positions, distinct

    def fetchTagMovieRatingTensor(self):
        """
        Create tag movie rating tensor

        For every (tag, movie) row of the tag data, the cells for ratings
        ``0 .. int(average rating of the movie)`` are set to 1.  The
        ordered tag and movie name lists are rebuilt on every call, so
        calling this method again does not duplicate the axis labels.
        :return: tensor
        """
        mltags_df = self.data_extractor.get_mltags_data()
        tag_ids = mltags_df["tagid"]
        movie_ids = mltags_df["movieid"]

        tag_index, distinct_tags = self._index_first_seen(tag_ids)
        movie_index, distinct_movies = self._index_first_seen(movie_ids)

        # Rebuild (rather than extend) the labels so that they always line
        # up with the rows of the tensor returned by this call.
        self.ordered_tag_names = [
            self.util.get_tag_name_for_id(tag_id) for tag_id in distinct_tags
        ]
        self.ordered_movie_names = [
            self.util.get_movie_name_for_id(movie_id)
            for movie_id in distinct_movies
        ]

        tensor = np.zeros(
            (len(distinct_tags), len(distinct_movies), self.max_ratings + 1))

        # Number of leading rating cells to fill, looked up once per movie
        # instead of once per row.
        filled_cells = {}
        for tagid, movieid in zip(tag_ids, movie_ids):
            if movieid not in filled_cells:
                average = self.util.get_average_ratings_for_movie(movieid)
                # max(..., 0) keeps a negative average from turning into a
                # "all but the last" slice; an average above max_ratings
                # simply fills the whole rating axis.
                filled_cells[movieid] = max(int(average) + 1, 0)
            tensor[tag_index[tagid], movie_index[movieid],
                   :filled_cells[movieid]] = 1

        return tensor

    def print_latent_semantics(self, r):
        """
        Pretty print latent semantics of every factor matrix
        :param r: forwarded unchanged to ``util.get_latent_semantics``
        """
        for i, factor in enumerate(self.factors):
            print(self.print_list[i])
            latent_semantics = self.util.get_latent_semantics(
                r, factor.transpose())
            self.util.print_latent_semantics(latent_semantics,
                                             self.get_factor_names(i))

    def get_factor_names(self, i):
        """
        Obtain factor names
        :param i: axis number (0 = tags, 1 = movies, 2 = ratings)
        :return: factor names, or None for any other axis number
        """
        names_by_axis = {
            0: self.ordered_tag_names,
            1: self.ordered_movie_names,
            2: self.ordered_ratings,
        }
        return names_by_axis.get(i)

    def get_partitions(self, no_of_partitions):
        """
        Partition factor matrices
        :param no_of_partitions: forwarded to ``util.partition_factor_matrix``
        :return: list of groupings, one entry per factor matrix
        """
        return [
            self.util.partition_factor_matrix(factor, no_of_partitions,
                                              self.get_factor_names(i))
            for i, factor in enumerate(self.factors)
        ]

    def print_partitioned_entities(self, no_of_partitions):
        """
        Pretty print groupings
        :param no_of_partitions: forwarded to ``get_partitions``
        """
        for i, groupings in enumerate(self.get_partitions(no_of_partitions)):
            print(self.print_list[i])
            self.util.print_partitioned_entities(groupings)
# Example no. 2
# 0
class UserMovieRecommendation(object):
    """
    Recommend movies to one user from the movies that user has watched.

    A movie-movie similarity matrix is derived with the requested model
    ("SVD", "PCA", "LDA", "TD" or "PageRank"); "Combination" merges the
    recommendation lists of all of those models.
    """

    # Number of latent features / topics / CP rank used by every model.
    _NUM_LATENT = 10
    # Target size of a recommendation list.
    _NUM_RECOMMENDATIONS = 5
    # Models merged by get_combined_recommendation, in evaluation order.
    _COMBINED_MODELS = ("SVD", "LDA", "PCA", "PageRank", "TD")

    def __init__(self, user_id):
        self.util = Util()
        self.genre_data = self.util.genre_data
        self.user_id = user_id
        self.watched_movies = self.util.get_all_movies_for_user(self.user_id)
        # Cache of model name -> recommendation list, filled lazily by
        # get_combined_recommendation.
        self.model_movies_dict = {}

    def get_movie_movie_matrix(self, model):
        """
        Finds a movie x latent-feature matrix for the given model and
        returns the movie_movie_similarity matrix built from it
        :param model: one of "LDA", "SVD", "PCA", "TD", "PageRank"
        :return: (movies, movie_movie_similarity matrix); row/column i of
                 the matrix belongs to movies[i]
        :raises ValueError: if the model name is not one of the above
        """
        latent = self._NUM_LATENT
        if model == "LDA":
            tag_df = self.genre_data.groupby(
                ['movieid'])['tag_string'].apply(list).reset_index()
            movies = tag_df.movieid.tolist()
            (U, _) = self.util.LDA(
                list(tag_df.tag_string),
                num_topics=latent,
                num_features=len(self.genre_data.tag_string.unique()))
            movie_latent_matrix = self.util.get_doc_topic_matrix(
                U, num_docs=len(movies), num_topics=latent)
        elif model in ("SVD", "PCA"):
            movie_tag_frame = self.util.get_movie_tag_matrix()
            movie_tag_matrix = movie_tag_frame.values
            movies = list(movie_tag_frame.index.values)
            if model == "SVD":
                (U, _, _) = self.util.SVD(movie_tag_matrix)
                movie_latent_matrix = U[:, :latent]
            else:
                (U, _, _) = self.util.PCA(movie_tag_matrix)
                # Here the leading columns of U are used as a tag x latent
                # matrix and the movies are projected onto it.
                movie_latent_matrix = numpy.dot(movie_tag_matrix,
                                                U[:, :latent])
        elif model == "TD":
            tensor = self.fetch_movie_genre_tag_tensor()
            factors = self.util.CPDecomposition(tensor, latent)
            # Same ordering as the movie axis of the tensor.
            movies = numpy.sort(self.genre_data["movieid"].unique())
            movie_latent_matrix = factors[0]
        elif model == "PageRank":
            movie_tag_frame = self.util.get_movie_tag_matrix()
            movies = list(movie_tag_frame.index.values)
            movie_latent_matrix = movie_tag_frame.values
        else:
            raise ValueError("Unknown model: %r" % (model,))

        movie_movie_matrix = numpy.dot(movie_latent_matrix,
                                       movie_latent_matrix.transpose())

        return movies, movie_movie_matrix

    def compute_pagerank(self):
        """
        Function to prepare data for pageRank and calling pageRank method
        :return: list of (movie,weight) tuple
        """
        (movies, movie_movie_matrix) = self.get_movie_movie_matrix("PageRank")

        return self.util.compute_pagerank(self.watched_movies,
                                          movie_movie_matrix, movies)

    def get_recommendation(self, model):
        """
        Function to recommend movies for this user based on the given model

        Terminates the process with exit status 1 when the user has not
        watched any movie.
        :param model: "PageRank", "Combination", "SVD", "PCA", "LDA" or "TD"
        :return: list of movies for the user as a recommendation; an empty
                 list for any other model name
        """
        if len(self.watched_movies) == 0:
            # sys.exit instead of the site-provided exit(), which is meant
            # for interactive use and is not always available.
            import sys
            print("THIS USER HAS NOT WATCHED ANY MOVIE.\nAborting...")
            sys.exit(1)
        if model == "Combination":
            return self.get_combined_recommendation()
        if model == "PageRank":
            return self._recommend_by_pagerank()
        if model in ("SVD", "PCA", "LDA", "TD"):
            return self._recommend_by_similarity(model)

        return []

    def _recommend_by_pagerank(self):
        """Pick the first unwatched movies from the PageRank ordering."""
        # NOTE(review): assumes watched_movies is a flat collection of
        # hashable movie ids - confirm against Util.get_all_movies_for_user.
        watched = set(self.watched_movies)
        recommended_movies = []
        for movie, _weight in self.compute_pagerank():
            if len(recommended_movies) == self._NUM_RECOMMENDATIONS:
                break
            if movie not in watched:
                recommended_movies.append(movie)

        return recommended_movies

    def _recommend_by_similarity(self, model):
        """Pick, per watched movie, its most similar unseen movies."""
        (movies, movie_movie_matrix) = self.get_movie_movie_matrix(model)
        watched = set(self.watched_movies)
        row_by_movie = {}
        for i, movie in enumerate(movies):
            if movie in watched:
                row_by_movie[movie] = movie_movie_matrix[i]
        distribution_list = self.util.get_distribution_count(
            self.watched_movies, self._NUM_RECOMMENDATIONS)

        recommended_movies = []
        for movie, num_of_movies_to_pick in zip(self.watched_movies,
                                                distribution_list):
            movie_row = row_by_movie.get(movie)
            if movie_row is None:
                # The watched movie has no row in this model's matrix, so
                # nothing can be derived from it.
                continue
            scores = dict(zip(movies, movie_row))
            # Never recommend something already watched or already picked;
            # pop() tolerates ids that are absent from the matrix.
            for seen in watched.union(recommended_movies):
                scores.pop(seen, None)
            best_first = sorted(scores.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
            recommended_movies.extend(
                m for (m, _) in best_first[0:num_of_movies_to_pick])
            if len(recommended_movies) >= self._NUM_RECOMMENDATIONS:
                break

        return recommended_movies

    def fetch_movie_genre_tag_tensor(self):
        """
        Create Movie Genre Tag tensor

        The movie and genre axes cover all of ``genre_data``; the tag axis
        only covers tags attached to movies the user has watched.  Each
        axis is ordered by sorted value.
        :return: tensor
        """
        def positions(series):
            # Sorted distinct values -> position along the tensor axis.
            return {
                value: position
                for position, value in enumerate(numpy.sort(series.unique()))
            }

        movie_dict = positions(self.genre_data["movieid"])
        genre_dict = positions(self.genre_data["genre"])
        user_df = self.genre_data[self.genre_data['movieid'].isin(
            self.watched_movies)]
        tag_dict = positions(user_df["tag_string"])

        tensor = numpy.zeros((len(movie_dict), len(genre_dict),
                              len(tag_dict)))

        rows = zip(self.genre_data["movieid"], self.genre_data["genre"],
                   self.genre_data["tag_string"])
        for movie, genre, tag in rows:
            # Rows carrying a tag outside the user's tag set are ignored.
            if genre not in genre_dict or tag not in tag_dict:
                continue
            tensor[movie_dict[movie], genre_dict[genre], tag_dict[tag]] = 1

        return tensor

    def get_combined_recommendation(self):
        """
        Function to combine recommendations from all models based on frequency of appearance and order

        Every list a movie appears in contributes 1 plus a 0.2 bonus per
        remaining position (so earlier entries score higher).  Models are
        evaluated in a fixed order so that ties are broken the same way on
        every run.
        :return: list of recommended movies
        """
        for model in self._COMBINED_MODELS:
            if model not in self.model_movies_dict:
                self.model_movies_dict[model] = self.get_recommendation(model)

        movie_scores = Counter()
        for movie_list in self.model_movies_dict.values():
            list_size = len(movie_list)
            for rank, movie in enumerate(movie_list):
                movie_scores[movie] += 1 + (list_size - rank) * 0.2

        best_first = sorted(movie_scores.items(),
                            key=operator.itemgetter(1),
                            reverse=True)

        return [m for (m, _) in best_first[0:self._NUM_RECOMMENDATIONS]]
class ActorMovieYearTensor(object):
    """
    Build a (year, movie, actor) tensor from the movie and movie-actor data
    and keep its CP decomposition together with the labels of every axis.

    Each axis is indexed by the distinct values of the corresponding
    column, in order of first appearance in the merged data.
    """

    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = self.conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.ordered_years = []
        self.ordered_movie_names = []
        self.ordered_actor_names = []
        # Section headers, in the same order as the tensor axes.
        self.print_list = ["\n\nFor Years:", "\n\nFor Movies:", "\n\nFor Actors:"]
        self.util = Util()
        self.tensor = self.fetchActorMovieYearTensor()
        self.factors = self.util.CPDecomposition(self.tensor, 5)

    @staticmethod
    def _index_first_seen(values):
        """
        Number the distinct entries of ``values`` by first appearance
        :param values: iterable of hashable ids
        :return: (dict id -> position, list of distinct ids in that order)
        """
        positions = {}
        distinct = []
        for value in values:
            if value not in positions:
                positions[value] = len(distinct)
                distinct.append(value)
        return positions, distinct

    def fetchActorMovieYearTensor(self):
        """
        Create actor movie year tensor

        A cell is 1 when the merged data contains a row with that year,
        movie and actor.  The ordered year / movie name / actor name lists
        are rebuilt on every call, so calling this method again does not
        duplicate the axis labels.
        :return: tensor
        """
        movies_df = self.data_extractor.get_mlmovies_data()
        actor_df = self.data_extractor.get_movie_actor_data()

        movie_actor_df = actor_df.merge(movies_df, how="left", on="movieid")
        years = movie_actor_df["year"]
        movie_ids = movie_actor_df["movieid"]
        actor_ids = movie_actor_df["actorid"]

        # NOTE(review): the left merge can leave a missing year for a movie
        # absent from movies_df; such rows are not handled specially here.
        year_index, distinct_years = self._index_first_seen(years)
        movie_index, distinct_movies = self._index_first_seen(movie_ids)
        actor_index, distinct_actors = self._index_first_seen(actor_ids)

        # Rebuild (rather than extend) the labels so that they always line
        # up with the rows of the tensor returned by this call.
        self.ordered_years = distinct_years
        self.ordered_movie_names = [
            self.util.get_movie_name_for_id(movie_id) for movie_id in distinct_movies
        ]
        self.ordered_actor_names = [
            self.util.get_actor_name_for_id(actor_id) for actor_id in distinct_actors
        ]

        tensor = np.zeros((len(distinct_years), len(distinct_movies), len(distinct_actors)))

        for year, movieid, actorid in zip(years, movie_ids, actor_ids):
            tensor[year_index[year], movie_index[movieid], actor_index[actorid]] = 1

        return tensor

    def print_latent_semantics(self, r):
        """
        Pretty print latent semantics of every factor matrix
        :param r: forwarded unchanged to ``util.get_latent_semantics``
        """
        for i, factor in enumerate(self.factors):
            print(self.print_list[i])
            latent_semantics = self.util.get_latent_semantics(r, factor.transpose())
            self.util.print_latent_semantics(latent_semantics, self.get_factor_names(i))

    def get_factor_names(self, i):
        """
        Obtain factor names
        :param i: axis number (0 = years, 1 = movies, 2 = actors)
        :return: factor names, or None for any other axis number
        """
        names_by_axis = {
            0: self.ordered_years,
            1: self.ordered_movie_names,
            2: self.ordered_actor_names,
        }
        return names_by_axis.get(i)

    def get_partitions(self, no_of_partitions):
        """
        Partition factor matrices
        :param no_of_partitions: forwarded to ``util.partition_factor_matrix``
        :return: list of groupings, one entry per factor matrix
        """
        return [
            self.util.partition_factor_matrix(factor, no_of_partitions, self.get_factor_names(i))
            for i, factor in enumerate(self.factors)
        ]

    def print_partitioned_entities(self, no_of_partitions):
        """
        Pretty print groupings
        :param no_of_partitions: forwarded to ``get_partitions``
        """
        for i, groupings in enumerate(self.get_partitions(no_of_partitions)):
            print(self.print_list[i])
            self.util.print_partitioned_entities(groupings)