class LdaGenreTag(GenreTag):
    """Runs LDA over the tags attached to the movies of a single genre."""

    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.util = Util()

    def get_lda_data(self, genre):
        """
        Does LDA on movie-tag counts and outputs movies in terms of latent
        semantics as U and tags in terms of latent semantics as Vh.

        Every latent semantic in Vh is also printed to stdout.

        :param genre: genre whose movies are fed to LDA
        :return: the (U, Vh) pair handed back by ``self.util.LDA``
        """
        data_frame = self.get_genre_data().reset_index()
        genre_data_frame = data_frame[data_frame["genre"] == genre]

        # One "document" per movie: the list of tags attached to that movie,
        # ordered by movie id.
        tag_df = genre_data_frame.groupby(
            ['movieid'])['tag'].apply(list).reset_index()
        tag_df = tag_df.sort_values('movieid')
        tag_documents = list(tag_df.iloc[:, 1])

        (U, Vh) = self.util.LDA(tag_documents, num_topics=4,
                                num_features=1000)
        for latent in Vh:
            print("\n")
            print(latent)

        # The docstring always promised this pair; previously the method
        # ended without returning anything.
        return U, Vh
class UserMovieRecommendation(object):
    """Recommends movies to one user from several latent-semantic models."""

    # Maximum number of movies any recommendation method returns.
    NUM_RECOMMENDATIONS = 5
    # Number of latent semantics kept by the LDA / SVD / PCA / TD models.
    NUM_LATENT_SEMANTICS = 10

    def __init__(self, user_id):
        self.util = Util()
        self.genre_data = self.util.genre_data
        self.user_id = user_id
        self.watched_movies = self.util.get_all_movies_for_user(self.user_id)
        # Cache of per-model recommendations used by the "Combination" model.
        self.model_movies_dict = {}

    def get_movie_movie_matrix(self, model):
        """
        Finds movie_tag matrix and returns movie_movie_similarity matrix

        :param model: one of "LDA", "SVD", "PCA", "TD" or "PageRank"
        :return: (movies, movie_movie_matrix) where ``movies`` labels the
                 rows/columns of the similarity matrix
        :raises ValueError: if ``model`` is not one of the supported names
        """
        num_latent = self.NUM_LATENT_SEMANTICS

        if model == "LDA":
            tag_df = self.genre_data.groupby(
                ['movieid'])['tag_string'].apply(list).reset_index()
            movies = tag_df.movieid.tolist()
            movies_tags_list = list(tag_df.tag_string)
            (U, _) = self.util.LDA(
                movies_tags_list, num_topics=num_latent,
                num_features=len(self.genre_data.tag_string.unique()))
            movie_latent_matrix = self.util.get_doc_topic_matrix(
                U, num_docs=len(movies), num_topics=num_latent)
        elif model in ("SVD", "PCA"):
            movie_tag_frame = self.util.get_movie_tag_matrix()
            movie_tag_matrix = movie_tag_frame.values
            movies = list(movie_tag_frame.index.values)
            if model == "SVD":
                (U, _, _) = self.util.SVD(movie_tag_matrix)
                movie_latent_matrix = U[:, :num_latent]
            else:
                (U, _, _) = self.util.PCA(movie_tag_matrix)
                tag_latent_matrix = U[:, :num_latent]
                # Project the movies onto the leading latent semantics.
                movie_latent_matrix = numpy.dot(movie_tag_matrix,
                                                tag_latent_matrix)
        elif model == "TD":
            tensor = self.fetch_movie_genre_tag_tensor()
            factors = self.util.CPDecomposition(tensor, num_latent)
            # Same sorted order that fetch_movie_genre_tag_tensor uses for
            # the movie axis of the tensor.
            movies = numpy.sort(self.genre_data["movieid"].unique())
            movie_latent_matrix = factors[0]
        elif model == "PageRank":
            movie_tag_frame = self.util.get_movie_tag_matrix()
            movies = list(movie_tag_frame.index.values)
            movie_latent_matrix = movie_tag_frame.values
        else:
            # Previously an unknown model fell through and crashed with an
            # AttributeError on ``None.transpose()``.
            raise ValueError("Unsupported model: %r" % (model,))

        movie_movie_matrix = numpy.dot(movie_latent_matrix,
                                       movie_latent_matrix.transpose())
        return movies, movie_movie_matrix

    def compute_pagerank(self):
        """
        Function to prepare data for pageRank and calling pageRank method

        :return: list of (movie,weight) tuple
        """
        (movies, movie_movie_matrix) = self.get_movie_movie_matrix("PageRank")
        seed_movies = self.watched_movies
        return self.util.compute_pagerank(seed_movies, movie_movie_matrix,
                                          movies)

    def get_recommendation(self, model):
        """
        Function to recommend movies for this user based on the given model

        :param model: "PageRank", "Combination", "SVD", "PCA", "LDA" or "TD"
        :return: list of movies for the user as a recommendation; an empty
                 list for any other model name
        :raises SystemExit: with code 1 when the user has no watched movies
        """
        if len(self.watched_movies) == 0:
            print("THIS USER HAS NOT WATCHED ANY MOVIE.\nAborting...")
            # The bare exit() helper is injected by the site module for
            # interactive use; raising SystemExit has the same effect and is
            # always available.
            raise SystemExit(1)

        if model == "Combination":
            return self.get_combined_recommendation()
        if model == "PageRank":
            return self._recommend_from_pagerank()
        if model in ("SVD", "PCA", "LDA", "TD"):
            return self._recommend_from_similarity(model)
        return []

    def _recommend_from_pagerank(self):
        """Pick the first unwatched movies from the PageRank output."""
        limit = self.NUM_RECOMMENDATIONS
        recommended_movies = []
        for movie, _weight in self.compute_pagerank():
            if len(recommended_movies) == limit:
                break
            if movie not in self.watched_movies:
                recommended_movies.append(movie)
        return recommended_movies

    def _recommend_from_similarity(self, model):
        """Pick the movies most similar to each watched movie under model."""
        limit = self.NUM_RECOMMENDATIONS
        (movies, movie_movie_matrix) = self.get_movie_movie_matrix(model)

        # Similarity row of every watched movie the model knows about.
        watched_rows = {
            movie: row
            for movie, row in zip(movies, movie_movie_matrix)
            if movie in self.watched_movies
        }
        # presumably how many picks each watched movie contributes, aligned
        # with self.watched_movies by position -- verify against Util
        distribution_list = self.util.get_distribution_count(
            self.watched_movies, limit)

        recommended_movies = []
        for index, movie in enumerate(self.watched_movies):
            if len(recommended_movies) >= limit:
                break
            if movie not in watched_rows:
                # A watched movie missing from the model's movie list used
                # to raise KeyError; it simply contributes no candidates.
                continue

            candidates = dict(zip(movies, watched_rows[movie]))
            # Never recommend something already watched or already picked.
            for excluded in self.watched_movies:
                candidates.pop(excluded, None)
            for excluded in recommended_movies:
                candidates.pop(excluded, None)

            ranked = sorted(candidates.items(), key=operator.itemgetter(1),
                            reverse=True)
            for candidate, _score in ranked[0:distribution_list[index]]:
                recommended_movies.append(candidate)
                if len(recommended_movies) == limit:
                    break
        return recommended_movies

    @staticmethod
    def _index_sorted_unique(column):
        """Map each distinct value of column to its rank in sorted order."""
        return {value: position
                for position, value in enumerate(sorted(column.unique()))}

    def fetch_movie_genre_tag_tensor(self):
        """
        Create Movie Genre Tag tensor

        The tag axis only covers tags attached to movies the user watched;
        rows of ``genre_data`` carrying any other tag are ignored.

        :return: tensor of zeros and ones indexed as [movie, genre, tag]
        """
        movie_index = self._index_sorted_unique(self.genre_data["movieid"])
        genre_index = self._index_sorted_unique(self.genre_data["genre"])

        user_df = self.genre_data[self.genre_data['movieid'].isin(
            self.watched_movies)]
        tag_index = self._index_sorted_unique(user_df["tag_string"])

        tensor = numpy.zeros(
            (len(movie_index), len(genre_index), len(tag_index)))
        rows = zip(self.genre_data["movieid"], self.genre_data["genre"],
                   self.genre_data["tag_string"])
        for movie, genre, tag in rows:
            # Dict membership replaces a linear scan of a numpy array per row.
            if genre not in genre_index or tag not in tag_index:
                continue
            tensor[movie_index[movie], genre_index[genre],
                   tag_index[tag]] = 1
        return tensor

    def get_combined_recommendation(self):
        """
        Function to combine recommendations from all models based on
        frequency of appearance and order

        Recommendations already stored in ``self.model_movies_dict`` are
        reused; missing models are computed and cached there.

        :return: list of recommended movies
        """
        model_list = ["SVD", "LDA", "PCA", "PageRank", "TD"]
        models_absent = list(set(model_list) - set(self.model_movies_dict))
        for model in models_absent:
            self.model_movies_dict[model] = self.get_recommendation(model)

        # Each appearance scores 1, plus a bonus that shrinks with the
        # movie's position in that model's list.
        movie_scores = Counter()
        for movie_list in self.model_movies_dict.values():
            list_length = len(movie_list)
            for position, movie in enumerate(movie_list):
                movie_scores[movie] += 1 + (list_length - position) * 0.2

        ranked = sorted(movie_scores.items(), key=operator.itemgetter(1),
                        reverse=True)
        return [movie
                for movie, _score in ranked[0:self.NUM_RECOMMENDATIONS]]
class SimilarActorsFromDiffMoviesLda(object):
    """Finds actors of the movies most similar (via LDA) to a given movie."""

    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.util = Util()
        self.sim_act_diff_mov_tf = SimilarActorsFromDiffMovies()

    def most_similar_actors_lda(self, moviename):
        """
        Function to find related actors from related movies
        (movie_movie_similarity_matrix using lda) corresponding to the
        given movie

        :param moviename: name of the movie to start from
        :return: list of actor names taken from the most similar movies,
                 excluding actors of the given movie; None (after printing
                 a message) when the movie id is not among the tagged movies
        """
        data_frame = self.data_extractor.get_mlmovies_data()
        tag_data_frame = self.data_extractor.get_genome_tags_data()
        movie_data_frame = self.data_extractor.get_mltags_data()
        movie_tag_data_frame = movie_data_frame.merge(
            tag_data_frame, how="left", left_on="tagid", right_on="tagId")
        movie_tag_data_frame = movie_tag_data_frame.merge(
            data_frame, how="left", left_on="movieid", right_on="movieid")

        # One "document" per movie: the list of its tags, ordered by movie id.
        tag_df = movie_tag_data_frame.groupby(
            ['movieid'])['tag'].apply(list).reset_index()
        tag_df = tag_df.sort_values('movieid')
        movies = tag_df.movieid.tolist()
        tag_documents = list(tag_df.iloc[:, 1])

        input_movieid = self.util.get_movie_id(moviename)

        (U, _) = self.util.LDA(tag_documents, num_topics=5,
                               num_features=1000)
        movie_topic_matrix = self.util.get_doc_topic_matrix(
            U, num_docs=len(movies), num_topics=5)
        movie_movie_matrix = numpy.dot(movie_topic_matrix,
                                       movie_topic_matrix.transpose())

        index_movie = next(
            (position for position, movie in enumerate(movies)
             if movie == input_movieid),
            None)
        if index_movie is None:
            print("Movie Id not found.")
            return None

        # Rank every other movie by the magnitude of its similarity to the
        # input movie.
        movie_row = movie_movie_matrix[index_movie].tolist()
        similarity = {movie: abs(score)
                      for movie, score in zip(movies, movie_row)}
        del similarity[input_movieid]
        ranked_movies = sorted(similarity.items(),
                               key=operator.itemgetter(1), reverse=True)

        actors = []
        for (movie, val) in ranked_movies:
            if val <= 0:
                break
            actors.extend(self.sim_act_diff_mov_tf.get_actors_of_movie(
                self.util.get_movie_name_for_id(movie)))
            # Stop once at least ten candidate actors have been gathered.
            if len(actors) >= 10:
                break

        actors_of_given_movie = self.sim_act_diff_mov_tf.get_actors_of_movie(
            moviename)
        return [self.util.get_actor_name_for_id(actorid)
                for actorid in actors
                if actorid not in actors_of_given_movie]
class LdaActorTag(object):
    """Finds actors related to a given actor through LDA over their tags."""

    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.util = Util()

    def get_related_actors_lda(self, actorid):
        """
        Function to find similarity between actors using actor-actor
        similarity vector in tag space using lda

        The actor-actor matrix is also written to
        ``actor_actor_matrix_with_svd_latent_values.csv`` in the current
        working directory, and the top ten results are printed.

        :param actorid: id of the actor to find related actors for
        :return: list of up to ten (actor name, similarity) tuples sorted by
                 decreasing similarity; None (after printing a message) when
                 the actor id is not known
        """
        mov_act = self.data_extractor.get_movie_actor_data()
        ml_tag = self.data_extractor.get_mltags_data()
        genome_tag = self.data_extractor.get_genome_tags_data()
        actor_info = self.data_extractor.get_imdb_actor_info_data()

        actor_movie_info = mov_act.merge(
            actor_info, how="left", left_on="actorid", right_on="id")
        tag_data_frame = ml_tag.merge(
            genome_tag, how="left", left_on="tagid", right_on="tagId")
        merged_data_frame = tag_data_frame.merge(
            actor_movie_info, how="left", on="movieid")
        merged_data_frame = merged_data_frame.fillna('')

        # One "document" per actor: the list of tags of the movies they
        # appear in, ordered by actor id.
        tag_df = merged_data_frame.groupby(
            ['actorid'])['tag'].apply(list).reset_index()
        tag_df = tag_df.sort_values('actorid')
        actorid_list = tag_df.actorid.tolist()
        tag_documents = list(tag_df.iloc[:, 1])

        (U, _) = self.util.LDA(tag_documents, num_topics=5,
                               num_features=100000)
        actor_topic_matrix = self.util.get_doc_topic_matrix(
            U, num_docs=len(actorid_list), num_topics=5)
        actor_actor_matrix = numpy.dot(actor_topic_matrix,
                                       actor_topic_matrix.transpose())

        # The CSV is still written in case other tooling reads it, but the
        # matrix is no longer parsed back from disk just to be used here.
        numpy.savetxt("actor_actor_matrix_with_svd_latent_values.csv",
                      actor_actor_matrix, delimiter=",")
        matrix = numpy.asarray(actor_actor_matrix)

        # NOTE(review): the matrix rows follow ``actorid_list`` (built from
        # the merged frame above) while lookups below use
        # ``get_sorted_actor_ids()``; this assumes both list the same actors
        # in the same order -- confirm against Util.
        actorids = self.util.get_sorted_actor_ids()
        index_actor = next(
            (position for position, candidate in enumerate(actorids)
             if candidate == actorid),
            None)
        if index_actor is None:
            print("Actor Id not found.")
            return None

        actor_names = [self.util.get_actor_name_for_id(int(actor_id))
                       for actor_id in actorids]
        actor_row = matrix[index_actor].tolist()
        actor_actor_dict = dict(zip(actor_names, actor_row))
        # Drop the queried actor so they are not reported as their own match.
        del actor_actor_dict[self.util.get_actor_name_for_id(int(actorid))]

        ranked_actors = sorted(actor_actor_dict.items(),
                               key=operator.itemgetter(1), reverse=True)
        print(ranked_actors[0:10])
        return ranked_actors[0:10]