def buildDF():
    """Build a time-weighted TF-IDF representation of every tagged movie.

    Returns:
        tuple: (dfList, tagList, movieList) where dfList is a list of
        {tag: tf-idf} dicts (one per tagged movie), tagList is the sorted
        tag vocabulary, and movieList holds the movie ids in the same
        order as dfList.
    """
    # BUG FIX: the original called movie_tag_map.keys()._len_() (single
    # underscores), which raises AttributeError; use the len() builtin.
    # NOTE(review): movieCount is read before createDictionaries1() runs —
    # this assumes the maps are already populated at that point; confirm.
    movieCount = len(movie_tag_map)
    createDictionaries1()
    tagList = sorted(list(tag_movie_map.keys()))
    dfList = []
    movieList = []
    all_movie_sorted = sorted(list(movie_tag_map.keys()))
    for movie in all_movie_sorted:
        tagsInMovie = movie_tag_map[movie]
        tf_idf_map = dict()
        # Skip untagged movies: they have no TF signal and len(tagsInMovie)
        # would be a zero divisor below.
        if tagsInMovie:
            movieList.append(movie)
            for tag in tagList:
                moviesInTagCount = len(tag_movie_map[tag])
                tf_numerator = 0
                # TF is time-weighted: each tagging of this movie contributes
                # its normalized timestamp (newer taggings count more).
                for temp_movie, datetime in tag_movie_map[tag]:
                    if movie == temp_movie:
                        tf_numerator += formatter.normalizer(
                            min_date, max_date, datetime)
                tf = tf_numerator / len(tagsInMovie)
                tf_idf = tf * math.log2(movieCount / moviesInTagCount)
                tf_idf_map[tag] = tf_idf
            dfList.append(tf_idf_map)
    return dfList, tagList, movieList
def load_genre_matrix(given_genre):
    """Return a movie x tag time-weighted TF-IDF DataFrame for one genre.

    Args:
        given_genre: key into genre_movie_map selecting the movies to score.

    Returns:
        pd.DataFrame: one row per tagged movie of the genre (indexed by
        movie id), one column per tag in the global vocabulary.
    """
    # NOTE(review): movieCount is read before createDictionaries1() runs —
    # this assumes the maps are already populated at that point; confirm.
    movieCount = len(movie_tag_map)
    createDictionaries1()
    tagList = sorted(list(tag_movie_map.keys()))
    movieList = []
    rows = []
    for movie in genre_movie_map[given_genre]:
        tagsInMovie = movie_tag_map[movie]
        tf_idf_map = dict()
        # Skip untagged movies (no TF signal; avoids a zero divisor).
        if tagsInMovie:
            movieList.append(movie)
            for tag in tagList:
                moviesInTagCount = len(tag_movie_map[tag])
                tf_numerator = 0
                # TF is time-weighted: newer taggings of this movie count more.
                for temp_movie, datetime in tag_movie_map[tag]:
                    if movie == temp_movie:
                        tf_numerator += formatter.normalizer(
                            min_date, max_date, datetime)
                tf = tf_numerator / len(tagsInMovie)
                tf_idf = tf * math.log2(movieCount / moviesInTagCount)
                tf_idf_map[tag] = tf_idf
            rows.append(tf_idf_map)
    # FIX: DataFrame.append() was removed in pandas 2.0 (and was O(n^2)
    # even before); accumulate plain dicts and build the frame once.
    df = pd.DataFrame(rows, columns=tagList)
    df.index = movieList
    return df
def load_genre_actor_matrix(given_genre):
    """Return a movie x actor rank-weighted TF-IDF DataFrame for one genre.

    Args:
        given_genre: key into genre_movie_map selecting the movies to score.

    Returns:
        pd.DataFrame: one row per movie of the genre (indexed by movie id),
        one column per actor in the global vocabulary; actors not in a
        movie keep weight 0.0.
    """
    # Dropped the original `global` declarations: max_rank/min_rank/etc. are
    # only read here, and `global` is needed solely for assignment.
    createDictionaries1()
    actorList = sorted(list(actor_movie_rank_map.keys()))
    movieCount = len(movie_tag_map)
    movieList = []
    rows = []
    for movieInGenre in genre_movie_map[given_genre]:
        movieList.append(movieInGenre)
        actorsInMovieList = movie_actor_rank_map[movieInGenre]
        actorCountOfMovie = len(actorsInMovieList)
        # Start every actor at 0.0 so each row spans the full vocabulary.
        tf_idf_map = dict.fromkeys(actorList, 0.0)
        for actor, rank in actorsInMovieList:
            movieCountOfActor = len(actor_movie_rank_map[actor])
            # Lower (better) billing rank -> larger weight via the inverse.
            tf_numerator = (1 / formatter.normalizer(min_rank, max_rank, rank))
            tf_idf = (tf_numerator / actorCountOfMovie) * math.log2(
                movieCount / movieCountOfActor)
            tf_idf_map[actor] = tf_idf
        rows.append(tf_idf_map)
    # FIX: DataFrame.append() was removed in pandas 2.0 (and was O(n^2)
    # even before); accumulate plain dicts and build the frame once.
    df = pd.DataFrame(rows, columns=actorList)
    df.index = movieList
    return df
def actor_tagVector():
    """Compute a time/rank-weighted TF-IDF tag vector for every actor.

    First pass scans movie_actor_df to track the global min/max rank and to
    fill actor_movie_rank_map / movie_actor_rank_map. Second pass builds,
    for each actor, a list of (tag_id, tf-idf) pairs stored in the module
    map actor_weight_vector_tf_idf.

    Returns:
        dict: actor_weight_vector_tf_idf, mapping actorID -> list of
        (tag_id, weight) tuples.
    """
    global max_rank
    global min_rank
    for row in movie_actor_df.itertuples():
        if row.actor_movie_rank < min_rank:
            min_rank = row.actor_movie_rank
        if row.actor_movie_rank > max_rank:
            max_rank = row.actor_movie_rank
        actor_movie_rank_map[row.actorid].add(
            (row.movieid, row.actor_movie_rank))
        movie_actor_rank_map[row.movieid].add(
            (row.actorid, row.actor_movie_rank))
    total_actor_count = len(actor_movie_rank_map)
    # PERF: the number of distinct actors touching a tag does not depend on
    # the actor being scored, so cache it once per tag instead of rebuilding
    # the actor set for every (actor, movie, tag) occurrence.
    tag_actor_count = {}
    for actorID, movies_list in actor_movie_rank_map.items():
        # Total taggings across this actor's movies: the TF denominator.
        tag_counter = 0
        tag_weight_tuple_tf_idf = defaultdict(float)
        for movie in movies_list:
            tag_counter += len(movie_tag_map[movie[0]])
        for movieID, rank in movies_list:
            if movieID in movie_tag_map:
                for tag_id, timestamp in movie_tag_map[movieID]:
                    actor_count = tag_actor_count.get(tag_id)
                    if actor_count is None:
                        actorsOfTag = set()
                        for mov in tag_movie_map[tag_id]:
                            actorsOfTag.update(
                                k for (k, v) in movie_actor_rank_map[mov[0]])
                        actor_count = len(actorsOfTag)
                        tag_actor_count[tag_id] = actor_count
                    # Newer tagging and better (lower-normalized) rank both
                    # increase the weight.
                    tf = (formatter.normalizer(min_date, max_date, timestamp) /
                          formatter.normalizer(min_rank, max_rank, rank)) / tag_counter
                    tag_weight_tuple_tf_idf[tag_id] += tf * math.log2(
                        total_actor_count / actor_count)
        # (Removed a dead per-actor TF-only accumulator that was written but
        # never read in the original.)
        actor_weight_vector_tf_idf[actorID] = [
            (k, v) for k, v in tag_weight_tuple_tf_idf.items()
        ]
    return actor_weight_vector_tf_idf
def runAllMethods(userid):
    """Ensemble movie recommender: averages five similarity signals.

    Methods 1-3 use the PCA / SVD / CP semantic matrices (inverse euclidean
    distance to the watched movies, weighted by finalWeights); method 4 uses
    an LDA decomposition; method 5 uses weighted personalized PageRank over
    a movie-movie similarity graph built from the movie-tag matrix.

    Relies on module state: indx, finalWeights, moviesWatched, DataHandler,
    ppr, pairwise, formatter, and the load*Semantics helpers.

    Returns:
        tuple: (indices sorted by descending mean similarity,
                the similarities sorted descending).
    """
    global sem_matrix_list
    global q_vectorList
    functions = [loadPCASemantics, loadSVDSemantics, loadCPSemantics]
    allSimilarities = []
    for i in range(1, 6):
        similarity = list()
        if (i <= 3):
            similarity_semantic_matrix = functions[i - 1]()
            # Min-max normalize each column; the 0.00001 epsilon guards
            # against a zero range.
            similarity_semantic_matrix = ((similarity_semantic_matrix - similarity_semantic_matrix.min(axis=0) + 0.00001) \
                / (similarity_semantic_matrix.max(axis=0) - similarity_semantic_matrix.min(axis=0) + 0.00001))
            # Rows at positions `indx` are the query (watched) movies; the
            # rest form the candidate matrix.
            vector = np.take(similarity_semantic_matrix, indx, axis=0)
            q_vector = vector.astype(np.float32)
            aug_sim_matx = np.delete(similarity_semantic_matrix, indx,
                                     axis=0).astype(np.float32)
            sem_matrix_list.append(aug_sim_matx)
            q_vectorList.append(q_vector)
            distance = []
            for v in q_vector:
                distance.append(euclideanMatrixVector(aug_sim_matx, v))
            distance = np.array(distance)
            distance = (distance - distance.min() + 0.00001) / (
                (distance.max() - distance.min() + 0.00001))
            # NOTE(review): precedence makes this (1/distance) + 0.00001;
            # if the epsilon was meant to guard the division it should be
            # 1. / (distance + 0.00001) — confirm intent.
            similarity = 1. / distance + 0.00001
            similarity = list(
                similarity.T.dot(finalWeights).astype(np.float32))
        if i == 4:
            movie_movie_similarity_subset_new = runLDADecomposition(
                userid)  #update
            similarity = list(
                movie_movie_similarity_subset_new.T.dot(finalWeights).astype(
                    np.float32))
        if i == 5:
            # Seed the personalized PageRank with the watched movies,
            # weighted by finalWeights.
            movieRatedSeed = list(
                zip(moviesWatched, finalWeights))  #DataHandler.userMovieOrders(userId)
            P = DataHandler.load_movie_tag_df(
            )  #DataHandler.load_movie_tag_df()
            moviesList = sorted(list(DataHandler.movie_actor_rank_map.keys()))
            euclidean_distance = pairwise.euclidean_distances(P)
            # Epsilon keeps the inverse finite on the zero diagonal.
            epsilon = np.matrix(np.zeros(euclidean_distance.shape) + 0.000001)
            movie_movie_similarity = 1 / (epsilon + euclidean_distance)
            movie_movie_similarity = pd.DataFrame(movie_movie_similarity)
            prData = ppr.personalizedPageRankWeighted(movie_movie_similarity,
                                                      movieRatedSeed, 0.9)
            # Rank only movies the user has not watched yet.
            moviesNotWATCHED = list(set(moviesList) - set(moviesWatched))
            moviesNotWATCHED_indices = [
                moviesList.index(i) for i in moviesNotWATCHED
            ]
            similarity = list(prData.loc[moviesNotWATCHED_indices, ][0])
        # Rescale to a comparable range before averaging across methods.
        # NOTE(review): the flattened source leaves the indentation of this
        # rescaling ambiguous — it may have applied only to the i == 5
        # branch; confirm against version history.
        similarity = [
            formatter.normalizer(min(similarity), max(similarity), value)
            for value in similarity
        ]
        allSimilarities.append(similarity)
    similarities = np.array(allSimilarities).mean(axis=0)
    return np.argsort(similarities)[::-1], np.sort(similarities)[::-1]