Exemplo n.º 1
0
class LdaGenreTag(GenreTag):
    """Runs LDA over the tags of the movies that belong to one genre."""

    def __init__(self):
        super().__init__()
        # Data-set location read from the "filePath" config section.
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.util = Util()

    def get_lda_data(self, genre):
        """
        Does LDA on movie-tag counts and outputs movies in terms of latent semantics as U
        and tags in terms of latent semantics as Vh.

        Every latent semantic in Vh is also printed, preceded by a blank
        separator.

        :param genre: value matched against the "genre" column of the genre data
        :return: tuple (U, Vh) exactly as returned by ``self.util.LDA``
        """
        data_frame = self.get_genre_data().reset_index()
        genre_data_frame = data_frame[data_frame["genre"] == genre]

        # One "document" per movie: the list of tags attached to that movie,
        # ordered by movieid.
        tag_df = genre_data_frame.groupby(
            ['movieid'])['tag'].apply(list).reset_index()
        tag_df = tag_df.sort_values('movieid')
        movie_tag_lists = list(tag_df.iloc[:, 1])

        (U, Vh) = self.util.LDA(movie_tag_lists,
                                num_topics=4,
                                num_features=1000)

        for latent in Vh:
            print("\n")
            print(latent)

        # The documented contract is to hand the decomposition back to the
        # caller; previously the method fell off the end and returned None.
        return U, Vh
Exemplo n.º 2
0
class UserMovieRecommendation(object):
    """
    Recommends movies to a single user.

    Supported models are "SVD", "PCA", "LDA", "TD" (tensor decomposition),
    "PageRank" and "Combination" (a weighted vote over the other five).
    """

    # Number of latent dimensions / topics kept by every model.
    _NUM_LATENT = 10
    # Number of movies returned by a recommendation.
    _NUM_RECOMMENDATIONS = 5

    def __init__(self, user_id):
        self.util = Util()
        self.genre_data = self.util.genre_data
        self.user_id = user_id
        self.watched_movies = self.util.get_all_movies_for_user(self.user_id)
        # model name -> recommended movie list, filled lazily by
        # get_combined_recommendation.
        self.model_movies_dict = {}

    def get_movie_movie_matrix(self, model):
        """
        Finds movie_tag matrix and returns movie_movie_similarity matrix
        :param model: one of "LDA", "SVD", "PCA", "TD", "PageRank"
        :return: tuple (movies, movie_movie_similarity matrix); row/column i
                 of the matrix corresponds to movies[i]
        :raises ValueError: if model is not one of the supported names
        """
        latent = self._NUM_LATENT
        if model == "LDA":
            tag_df = self.genre_data.groupby(
                ['movieid'])['tag_string'].apply(list).reset_index()
            movies = tag_df.movieid.tolist()
            movies_tags_list = list(tag_df.tag_string)
            (U, Vh) = self.util.LDA(
                movies_tags_list,
                num_topics=latent,
                num_features=len(self.genre_data.tag_string.unique()))
            movie_latent_matrix = self.util.get_doc_topic_matrix(
                U, num_docs=len(movies), num_topics=latent)
        elif model in ("SVD", "PCA"):
            movie_tag_frame = self.util.get_movie_tag_matrix()
            movie_tag_matrix = movie_tag_frame.values
            movies = list(movie_tag_frame.index.values)
            if model == "SVD":
                (U, s, Vh) = self.util.SVD(movie_tag_matrix)
                movie_latent_matrix = U[:, :latent]
            else:
                (U, s, Vh) = self.util.PCA(movie_tag_matrix)
                # For PCA the leading columns of U are used as tag-side
                # factors and the movies are projected onto them.
                tag_latent_matrix = U[:, :latent]
                movie_latent_matrix = numpy.dot(movie_tag_matrix,
                                                tag_latent_matrix)
        elif model == "TD":
            tensor = self.fetch_movie_genre_tag_tensor()
            factors = self.util.CPDecomposition(tensor, latent)
            # Same ordering as the movie axis of the tensor.
            movies = numpy.sort(self.genre_data["movieid"].unique())
            movie_latent_matrix = factors[0]
        elif model == "PageRank":
            movie_tag_frame = self.util.get_movie_tag_matrix()
            movies = list(movie_tag_frame.index.values)
            # No dimensionality reduction: the raw movie-tag matrix is used.
            movie_latent_matrix = movie_tag_frame.values
        else:
            # Previously this fell through and failed with an opaque
            # AttributeError on None.transpose().
            raise ValueError("Unsupported model: {!r}".format(model))

        movie_movie_matrix = numpy.dot(movie_latent_matrix,
                                       movie_latent_matrix.transpose())

        return movies, movie_movie_matrix

    def compute_pagerank(self):
        """
        Function to prepare data for pageRank and calling pageRank method
        :return: list of (movie,weight) tuple
        """
        (movies, movie_movie_matrix) = self.get_movie_movie_matrix("PageRank")

        return self.util.compute_pagerank(self.watched_movies,
                                          movie_movie_matrix, movies)

    def get_recommendation(self, model):
        """
        Function to recommend movies for this user based on the given model.

        Prints a message and terminates the process with exit status 1 when
        the user has no watched movies. An unrecognised model yields an
        empty list.

        :param model: "PageRank", "Combination", "SVD", "PCA", "LDA" or "TD"
        :return: list of movies for the given user as a recommendation
        """
        # len() rather than truthiness: watched_movies may be an array-like.
        if len(self.watched_movies) == 0:
            print("THIS USER HAS NOT WATCHED ANY MOVIE.\nAborting...")
            raise SystemExit(1)

        if model == "Combination":
            return self.get_combined_recommendation()

        limit = self._NUM_RECOMMENDATIONS
        watched = set(self.watched_movies)  # O(1) membership tests
        recommended_movies = []

        if model == "PageRank":
            for movie_p, _weight in self.compute_pagerank():
                if len(recommended_movies) == limit:
                    break
                if movie_p not in watched:
                    recommended_movies.append(movie_p)
        elif model in ("SVD", "PCA", "LDA", "TD"):
            (movies, movie_movie_matrix) = self.get_movie_movie_matrix(model)
            watched_rows = {
                movie: movie_movie_matrix[i]
                for i, movie in enumerate(movies) if movie in watched
            }
            # distribution_list[i] is the number of movies to pick for the
            # i-th watched movie.
            distribution_list = self.util.get_distribution_count(
                self.watched_movies, limit)
            for index, movie in enumerate(self.watched_movies):
                movie_row = watched_rows.get(movie)
                if movie_row is None:
                    # The watched movie has no row in this model's matrix;
                    # skip it instead of failing with a KeyError.
                    continue
                excluded = watched.union(recommended_movies)
                candidates = {
                    other: score
                    for other, score in zip(movies, movie_row)
                    if other not in excluded
                }
                ranked = sorted(candidates.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
                for (m, _score) in ranked[:distribution_list[index]]:
                    recommended_movies.append(m)
                if len(recommended_movies) >= limit:
                    break

        return recommended_movies

    def fetch_movie_genre_tag_tensor(self):
        """
        Create Movie Genre Tag tensor.

        Axis 0 is indexed by the sorted unique movie ids, axis 1 by the
        sorted unique genres, and axis 2 by the sorted unique tags of the
        movies this user has watched. A cell is 1 when the (movie, genre,
        tag) combination occurs in the genre data, otherwise 0.

        :return: tensor
        """
        genre_data = self.genre_data

        # sorted() works for whatever array type unique() returns.
        movie_index = {
            movie: position for position, movie in enumerate(
                sorted(genre_data["movieid"].unique()))
        }
        genre_index = {
            genre: position for position, genre in enumerate(
                sorted(genre_data["genre"].unique()))
        }
        user_df = genre_data[genre_data['movieid'].isin(self.watched_movies)]
        tag_index = {
            tag: position for position, tag in enumerate(
                sorted(user_df["tag_string"].unique()))
        }

        tensor = numpy.zeros(
            (len(movie_index), len(genre_index), len(tag_index)))

        rows = zip(genre_data["movieid"], genre_data["genre"],
                   genre_data["tag_string"])
        for movie, genre, tag in rows:
            # Tags the user never encountered have no slot on the tag axis.
            if genre not in genre_index or tag not in tag_index:
                continue
            tensor[movie_index[movie], genre_index[genre],
                   tag_index[tag]] = 1

        return tensor

    def get_combined_recommendation(self):
        """
        Function to combine recommendations from all models based on frequency of appearance and order.

        Every appearance of a movie in a model's list adds 1 plus 0.2 for
        each position remaining in that list (earlier positions score
        higher). Results of models not yet present in
        ``self.model_movies_dict`` are computed and stored there.

        :return: list of recommended movies
        """
        model_list = ["SVD", "LDA", "PCA", "PageRank", "TD"]
        # Iterate in the fixed list order (not set order) so that the dict's
        # insertion order, and therefore tie-breaking, is deterministic.
        for model in model_list:
            if model not in self.model_movies_dict:
                self.model_movies_dict[model] = self.get_recommendation(model)

        movie_scores = Counter()
        for movie_list in self.model_movies_dict.values():
            list_length = len(movie_list)
            for position, movie in enumerate(movie_list):
                movie_scores[movie] += 1 + (list_length - position) * 0.2

        ranked = sorted(movie_scores.items(),
                        key=operator.itemgetter(1),
                        reverse=True)

        return [movie for (movie, _score) in
                ranked[:self._NUM_RECOMMENDATIONS]]
Exemplo n.º 3
0
class SimilarActorsFromDiffMoviesLda(object):
    """Finds actors of movies that are LDA-similar to a given movie."""

    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.util = Util()
        # Only used here to look up the actors of a movie by movie name.
        self.sim_act_diff_mov_tf = SimilarActorsFromDiffMovies()

    def most_similar_actors_lda(self, moviename):
        """
        Function to find related actors from related movies(movie_movie_similarity_matrix using lda)
        corresponding to the given movie.

        Movies are visited in decreasing order of similarity to the given
        movie; their actors are collected until at least 10 have been
        gathered or the similarity drops to zero. Actors of the given movie
        itself are removed from the result.

        :param moviename: name of the movie to start from
        :return: list of actor names, or None (after printing a message)
                 when the movie id is not among the tagged movies
        """
        data_frame = self.data_extractor.get_mlmovies_data()
        tag_data_frame = self.data_extractor.get_genome_tags_data()
        movie_data_frame = self.data_extractor.get_mltags_data()
        movie_tag_data_frame = movie_data_frame.merge(tag_data_frame, how="left", left_on="tagid", right_on="tagId")
        movie_tag_data_frame = movie_tag_data_frame.merge(data_frame, how="left", left_on="movieid", right_on="movieid")

        # One "document" per movie: the list of its tags, ordered by movieid.
        tag_df = movie_tag_data_frame.groupby(['movieid'])['tag'].apply(list).reset_index()
        tag_df = tag_df.sort_values('movieid')
        movies = tag_df.movieid.tolist()
        movie_tag_lists = list(tag_df.iloc[:, 1])

        input_movieid = self.util.get_movie_id(moviename)

        (U, Vh) = self.util.LDA(movie_tag_lists, num_topics=5, num_features=1000)

        movie_topic_matrix = self.util.get_doc_topic_matrix(U, num_docs=len(movies), num_topics=5)
        movie_movie_matrix = numpy.dot(movie_topic_matrix, movie_topic_matrix.transpose())

        index_movie = next(
            (position for position, movie in enumerate(movies) if movie == input_movieid),
            None)
        if index_movie is None:
            print("Movie Id not found.")
            return None

        # Similarity (absolute value) of every other movie to the given one.
        movie_row = movie_movie_matrix[index_movie].tolist()
        similarity = {
            movie: abs(value)
            for movie, value in zip(movies, movie_row)
            if movie != input_movieid
        }
        ranked_movies = sorted(similarity.items(), key=operator.itemgetter(1), reverse=True)

        actors = []
        for (movie, val) in ranked_movies:
            if val <= 0:
                # Sorted descending, so nothing after this is similar at all.
                break
            actors = actors + self.sim_act_diff_mov_tf.get_actors_of_movie(self.util.get_movie_name_for_id(movie))
            if len(actors) >= 10:
                break

        actors_of_given_movie = self.sim_act_diff_mov_tf.get_actors_of_movie(moviename)
        other_actors = [x for x in actors if x not in actors_of_given_movie]

        return [self.util.get_actor_name_for_id(actorid) for actorid in other_actors]
Exemplo n.º 4
0
class LdaActorTag(object):
    """Ranks actors by similarity to a given actor using LDA over tags."""

    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.util = Util()

    def get_related_actors_lda(self, actorid):
        """
        Function to find similarity between actors using actor-actor similarity vector in tag space using lda.

        The actor-actor matrix is also written to
        "actor_actor_matrix_with_svd_latent_values.csv" in the current
        working directory, and the result is printed before it is returned.

        :param actorid: id of the actor to find related actors for
        :return: list of at most 10 (actor_name, similarity) tuples sorted by
                 decreasing similarity, or None (after printing a message)
                 when the actor id is unknown
        """
        mov_act = self.data_extractor.get_movie_actor_data()
        ml_tag = self.data_extractor.get_mltags_data()
        genome_tag = self.data_extractor.get_genome_tags_data()
        actor_info = self.data_extractor.get_imdb_actor_info_data()
        actor_movie_info = mov_act.merge(actor_info,
                                         how="left",
                                         left_on="actorid",
                                         right_on="id")
        tag_data_frame = ml_tag.merge(genome_tag,
                                      how="left",
                                      left_on="tagid",
                                      right_on="tagId")
        merged_data_frame = tag_data_frame.merge(actor_movie_info,
                                                 how="left",
                                                 on="movieid")
        merged_data_frame = merged_data_frame.fillna('')

        # One "document" per actor: the tags of the movies the actor is
        # linked to, ordered by actorid.
        tag_df = merged_data_frame.groupby(
            ['actorid'])['tag'].apply(list).reset_index()
        tag_df = tag_df.sort_values('actorid')
        actorid_list = tag_df.actorid.tolist()
        actor_tag_lists = list(tag_df.iloc[:, 1])

        (U, Vh) = self.util.LDA(actor_tag_lists,
                                num_topics=5,
                                num_features=100000)

        actor_topic_matrix = self.util.get_doc_topic_matrix(
            U, num_docs=len(actorid_list), num_topics=5)
        actor_actor_matrix = numpy.dot(actor_topic_matrix,
                                       actor_topic_matrix.transpose())

        # The CSV dump is kept as an output artefact, but the in-memory
        # matrix is used below instead of re-parsing the file that was just
        # written.
        numpy.savetxt("actor_actor_matrix_with_svd_latent_values.csv",
                      actor_actor_matrix,
                      delimiter=",")
        matrix = numpy.asarray(actor_actor_matrix)

        # NOTE(review): matrix rows follow actorid_list (built above), while
        # the labels come from util.get_sorted_actor_ids(); this assumes both
        # list the same actors in the same order - TODO confirm.
        actorids = self.util.get_sorted_actor_ids()

        index_actor = next(
            (position for position, candidate in enumerate(actorids)
             if candidate == actorid),
            None)
        if index_actor is None:
            print("Actor Id not found.")
            return None

        actor_names = [
            self.util.get_actor_name_for_id(int(actor_id))
            for actor_id in actorids
        ]

        actor_row = matrix[index_actor].tolist()
        similarity_by_name = dict(zip(actor_names, actor_row))
        # Drop the queried actor so it is not reported as related to itself.
        del similarity_by_name[self.util.get_actor_name_for_id(int(actorid))]

        ranked_actors = sorted(similarity_by_name.items(),
                               key=operator.itemgetter(1),
                               reverse=True)
        print(ranked_actors[0:10])
        return ranked_actors[0:10]