def genre_spaceActors_LDA_tf(genre):
    """Print the four LDA latent semantics of a genre in terms of actors.

    Loads the genre/actor matrix for ``genre``, runs
    ``decompositions.LDADecomposition`` on it with 4 topics and, for each
    topic, prints the ``(actor name, weight)`` pairs sorted by descending
    weight.

    Args:
        genre: Genre key; must be present in ``DataHandler.genre_movie_map``.

    Returns:
        None. Output is printed. If the genre is unknown a message is
        printed and the function returns early.
    """
    num_topics = 4

    DataHandler.vectors()
    DataHandler.createDictionaries1()
    # The returned dictionaries are not needed here; the call is kept in
    # case it has side effects on DataHandler -- TODO confirm and drop.
    DataHandler.get_dicts()
    DataHandler.create_actor_actorid_map()
    actor_actorid_map = DataHandler.actor_actorid_map

    df = DataHandler.load_genre_actor_matrix_tf(genre)
    # Read after the load, as before: it is not visible here which call
    # populates genre_movie_map.
    if genre not in DataHandler.genre_movie_map:
        print("genre " + genre + " not in data")
        return

    ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
        df, num_topics, constants.genreActorSpacePasses)

    topic_terms = defaultdict(set)
    term_count = len(actor_actorid_map)
    for topic in range(num_topics):
        # get_topic_terms returns the top n (default = 10) terms of the
        # topic, so ask for as many terms as there are actors.
        for term_id, weight in ldaModel.get_topic_terms(topic, topn=term_count):
            term = id_Term_map.get(term_id)
            topic_terms[topic].add((actor_actorid_map.get(term), weight))

    for topic in range(num_topics):
        # Index the defaultdict (rather than .get) so a topic that collected
        # no terms prints an empty list instead of raising on sorted(None).
        ranked = sorted(topic_terms[topic], key=itemgetter(1), reverse=True)
        print('Semantic ' + str(topic + 1) + ' ' + str(ranked))
        print('\n')
def top10_Actors_LDA(givenActor):
    """Print the actors most similar to ``givenActor`` according to LDA.

    The similar actors and their scores come from
    ``DataHandler.similarActors_LDA``; this function only validates the seed
    and prints the result.

    Args:
        givenActor: Actor id used as the key of
            ``DataHandler.actor_actorid_map``.

    Returns:
        None. Output is printed. An unknown actor id prints a message and
        returns early instead of raising ``KeyError``.
    """
    DataHandler.create_actor_actorid_map()
    # Guard the seed up front, mirroring top10_Actors_LDA_tf: the map is
    # indexed with givenActor below, so an unknown id could never succeed.
    if givenActor not in DataHandler.actor_actorid_map:
        print('Invalid seed actor id : ' + str(givenActor))
        return

    top10SimilarActors_similarity = DataHandler.similarActors_LDA(givenActor)
    actor_names = DataHandler.actor_actorid_map
    print('Actors similar to ' + str(actor_names[givenActor]))
    for actor, sim in top10SimilarActors_similarity:
        print(actor_names[actor] + ' ' + str(sim))
def task1c_tfidf(actor_id):
    """Print the 10 actors closest to ``actor_id`` in the raw actor-tag space.

    Every other actor's row of ``DataHandler.actor_tag_df()`` is compared
    with the given actor's row using ``metrics.l2Norm``; the ten smallest
    scores are printed with the actor names.

    Args:
        actor_id: Index label of the actor in the actor-tag dataframe.

    Returns:
        None. Output is printed. If the actor is not in the dataframe index
        a message is printed and the function returns early.
    """
    DataHandler.vectors()
    actor_tag_frame = DataHandler.actor_tag_df()
    actor_ids = list(actor_tag_frame.index)
    if actor_id not in actor_ids:
        print("Invalid actor id or no tags present for actor. Returning")
        return

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in both old and new pandas versions.
    actor_vectors = actor_tag_frame.values.tolist()
    target_vector = actor_vectors[actor_ids.index(actor_id)]

    DataHandler.create_actor_actorid_map()
    actor_names = DataHandler.actor_actorid_map

    scored = []
    for other_id, other_vector in zip(actor_ids, actor_vectors):
        if other_id == actor_id:
            continue  # do not compare the actor with itself
        score = metrics.l2Norm(target_vector, other_vector)
        scored.append((score, actor_names.get(other_id)))

    # Ascending sort: the smallest l2Norm scores are printed first.
    scored.sort(key=operator.itemgetter(0))

    print("Top 10 actors similar to " + str(actor_names.get(actor_id))
          + " are: ")
    for score, name in scored[:10]:
        # str() keeps the report printable when an id has no name entry.
        print(str(name) + " : " + str(score))
    return
def task1c_pca(actor_id):
    """Print the 10 actors closest to ``actor_id`` in a 5-component PCA space.

    The actor-tag matrix is projected onto the components returned by
    ``decompositions.PCADecomposition`` and actors are compared in that
    space with ``metrics.l2Norm``; the ten smallest scores are printed.

    Args:
        actor_id: Index label of the actor in the actor-tag dataframe.

    Returns:
        None. Output is printed. If the actor is not in the dataframe index
        a message is printed and the function returns early.
    """
    DataHandler.vectors()
    # Fetch the dataframe once; it was previously requested twice to build
    # the same matrix.
    actor_tag_frame = DataHandler.actor_tag_df()
    actor_ids = list(actor_tag_frame.index)
    if actor_id not in actor_ids:
        print("Invalid actor id or no tags present for actor. Returning")
        return

    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on old
    # and new versions. np.matrix copies, and the copy is taken before the
    # decomposition runs, as in the original statement order.
    actor_tag_matrix = np.matrix(actor_tag_frame.values)

    components = decompositions.PCADecomposition(actor_tag_frame, 5)
    # using transpose since according to page 158, p inverse = p transpose
    projection = np.matrix(components).transpose()
    actors_in_semantics = (actor_tag_matrix * projection).tolist()
    target_vector = actors_in_semantics[actor_ids.index(actor_id)]

    DataHandler.create_actor_actorid_map()
    actor_names = DataHandler.actor_actorid_map

    scored = []
    for other_id, other_vector in zip(actor_ids, actors_in_semantics):
        if other_id == actor_id:
            continue  # do not compare the actor with itself
        score = metrics.l2Norm(target_vector, other_vector)
        scored.append((score, actor_names.get(other_id)))

    # Ascending sort: the smallest l2Norm scores are printed first.
    scored.sort(key=operator.itemgetter(0))

    print("Top 10 actors similar to " + str(actor_names.get(actor_id))
          + " are: ")
    for score, name in scored[:10]:
        # str() keeps the report printable when an id has no name entry.
        print(str(name) + " : " + str(score))
    return
def actor_task1c_SVD(actor_id):
    """Print the 10 actors closest to ``actor_id`` in a 5-component SVD space.

    Rows of the first matrix returned by ``decompositions.SVDDecomposition``
    are compared with ``metrics.l2Norm``; the ten smallest scores are
    printed together with the actor names.

    Args:
        actor_id: Index label of the actor in the actor-tag dataframe.

    Returns:
        None. Output is printed. If the actor is not in the dataframe index
        a message is printed and the function returns early.
    """
    DataHandler.vectors()
    tag_frame = DataHandler.actor_tag_df()
    actor_ids = list(tag_frame.index)

    if actor_id not in actor_ids:
        print("Invalid actor id or no tags present for actor. Returning")
        return

    # Only the first factor is used; the other two are ignored.
    left_factor, _sigma, _vt = decompositions.SVDDecomposition(tag_frame, 5)
    target_row = left_factor[actor_ids.index(actor_id)]

    DataHandler.create_actor_actorid_map()

    scored = []
    for position in range(len(left_factor)):
        other_id = actor_ids[position]
        if other_id == actor_id:
            continue  # skip the actor itself
        other_name = DataHandler.actor_actorid_map.get(other_id)
        distance = metrics.l2Norm(target_row, left_factor[position])
        scored.append((distance, other_name))

    # Ascending, stable sort on the score only.
    scored.sort(key=operator.itemgetter(0))

    seed_name = DataHandler.actor_actorid_map.get(actor_id)
    print("Top 10 Actors similar to " + str(seed_name) + " are:")
    for distance, other_name in scored[:10]:
        print(other_name + " : " + str(distance))
    return
def top10_Actors_LDA_tf(givenActor):
    """Print the actors most similar to ``givenActor`` (LDA, tf variant).

    The seed is validated against ``DataHandler.actor_movie_rank_map``; the
    similar actors and scores come from ``DataHandler.similarActors_LDA_tf``.

    Args:
        givenActor: Actor id of the seed actor.

    Returns:
        None. Output is printed. An unknown seed prints a message and
        returns early.
    """
    DataHandler.createDictionaries1()
    known_actors = DataHandler.actor_movie_rank_map
    if givenActor not in known_actors:
        print('Invalid seed actor id : ' + str(givenActor))
        return

    DataHandler.create_actor_actorid_map()
    ranked = DataHandler.similarActors_LDA_tf(givenActor)

    seed_name = DataHandler.actor_actorid_map[givenActor]
    print('Actors similar to ' + str(seed_name))
    for other_actor, similarity in ranked:
        other_name = DataHandler.actor_actorid_map[other_actor]
        print(other_name + ' ' + str(similarity))
    return
def task1dImplementation_SVD(movie_id):
    """Print the 10 actors closest to a movie in the actors' SVD space.

    The actor-tag dataframe is decomposed with
    ``decompositions.SVDDecomposition`` (5 components). The movie's tag row
    is projected with the transpose of the third returned factor and
    compared, using ``metrics.l2Norm``, with every actor row of the first
    factor. Actors listed for the movie in ``DataHandler.movie_actor_map``
    are skipped.

    Args:
        movie_id: Index label of the movie in the movie-tag dataframe.

    Returns:
        None. Output is printed. If the movie is not in the movie-tag
        dataframe a message is printed and the function returns early.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map
    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()
    movie_ids = list(movie_tag_df.index)
    actor_ids = list(actor_tag_df.index)

    if movie_id not in movie_ids:
        # str() so an id missing from the name map is reported instead of
        # raising TypeError on str + None.
        print("Movie " + str(movieid_name_map.get(movie_id))
              + " not present in mltags data. Quitting")
        return

    actorU, _actorSigma, actorV = decompositions.SVDDecomposition(
        actor_tag_df, 5)
    tags_to_actor_semantics = np.matrix(actorV).transpose()

    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on old
    # and new versions.
    movie_tag_matrix = np.matrix(movie_tag_df.values)
    movie_in_tags = movie_tag_matrix[movie_ids.index(movie_id)]
    movie_in_actor_semantics = (
        movie_in_tags * tags_to_actor_semantics).tolist()[0]

    actor_rows = np.matrix(actorU).tolist()

    DataHandler.create_actor_actorid_map()
    actor_names = DataHandler.actor_actorid_map
    actors_for_movie = DataHandler.movie_actor_map.get(movie_id)
    if actors_for_movie is None:
        # No cast recorded for the movie: nothing to exclude.
        actors_for_movie = ()

    scored = []
    for actor_id, actor_row in zip(actor_ids, actor_rows):
        if actor_id in actors_for_movie:
            continue  # the movie's own actors are not reported
        score = metrics.l2Norm(actor_row, movie_in_actor_semantics)
        scored.append((score, actor_names.get(actor_id)))

    # Ascending sort: the smallest l2Norm scores are printed first.
    scored.sort(key=operator.itemgetter(0))

    print("10 Actors similar to movie " + str(movieid_name_map.get(movie_id))
          + " are: ")
    for score, name in scored[:10]:
        # str() keeps the report printable when an id has no name entry.
        print(str(name) + " : " + str(score))
    return
def task1d_pca(movie_id):
    """Print the 10 actors closest to a movie in the actors' PCA space.

    Both the movie's tag row and every actor's tag row are projected onto
    the 5 components returned by ``decompositions.PCADecomposition`` for the
    actor-tag dataframe, then compared with ``metrics.l2Norm``. Actors
    listed for the movie in ``DataHandler.movie_actor_map`` are skipped.

    Args:
        movie_id: Index label of the movie in the movie-tag dataframe.

    Returns:
        None. Output is printed. If the movie is not in the movie-tag
        dataframe a message is printed and the function returns early.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map
    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()

    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on old
    # and new versions. np.matrix copies, and the copies are taken before
    # the decomposition runs, as in the original statement order.
    actor_tag_matrix = np.matrix(actor_tag_df.values)
    movie_tag_matrix = np.matrix(movie_tag_df.values)
    actor_ids = list(actor_tag_df.index)
    movie_ids = list(movie_tag_df.index)

    if movie_id not in movie_ids:
        # str() so an id missing from the name map is reported instead of
        # raising TypeError on str + None.
        print("Movie " + str(movieid_name_map.get(movie_id))
              + " not present in mltags data. Quitting")
        return

    actor_semantics = decompositions.PCADecomposition(actor_tag_df, 5)
    projection = np.matrix(actor_semantics).transpose()

    movie_in_tags = movie_tag_matrix[movie_ids.index(movie_id)]
    movie_in_actor_semantics = (movie_in_tags * projection).tolist()[0]
    actors_in_actor_semantics = (actor_tag_matrix * projection).tolist()

    # Called once; the original invoked it twice back to back.
    DataHandler.create_actor_actorid_map()
    actor_names = DataHandler.actor_actorid_map
    actors_for_movie = DataHandler.movie_actor_map.get(movie_id)
    if actors_for_movie is None:
        # No cast recorded for the movie: nothing to exclude.
        actors_for_movie = ()

    scored = []
    for actor_id, actor_vector in zip(actor_ids, actors_in_actor_semantics):
        if actor_id in actors_for_movie:
            continue  # the movie's own actors are not reported
        score = metrics.l2Norm(actor_vector, movie_in_actor_semantics)
        scored.append((score, actor_names.get(actor_id)))

    # Ascending sort: the smallest l2Norm scores are printed first.
    scored.sort(key=operator.itemgetter(0))

    print("Top 10 actors similar to movie: "
          + str(movieid_name_map.get(movie_id)) + " are: ")
    for score, name in scored[:10]:
        # str() keeps the report printable when an id has no name entry.
        print(str(name) + " : " + str(score))
    return
def PersnalizedPageRank_top10_SimilarCoActors(seed):
    """Print co-actors ranked by personalized PageRank from ``seed``.

    Runs ``ppr.personalizedPageRank`` on the first value returned by
    ``DataHandler.coactor_siilarity_matrix()`` with ``constants.ALPHA``,
    keeps the 15 highest-scored rows and prints every one of them whose
    name is not one of the seed actors' names.

    Args:
        seed: Iterable of actor ids used as the PageRank seed set.

    Returns:
        None. Output is printed.
    """
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    similarity_frame, _unused = DataHandler.coactor_siilarity_matrix()
    id_to_name = DataHandler.actor_actorid_map

    scores = ppr.personalizedPageRank(similarity_frame, seed, constants.ALPHA)

    # One-column frame with the actor names, in the order of the matrix index.
    name_frame = pd.DataFrame(
        pd.Series(list(similarity_frame.index)), columns=['Actor'])
    name_frame['Actor'] = name_frame['Actor'].map(
        lambda actor_id: id_to_name.get(actor_id))

    combined = pd.concat([scores, name_frame], axis=1)
    # Column 0 holds the score; highest first, 15 rows kept before the seed
    # names are filtered out below.
    top_rows = combined.sort_values(by=0, ascending=False).head(15)

    seed_names = [id_to_name.get(actor_id) for actor_id in seed]
    print('Co Actors similar to the following seed actors: '+str(seed_names))

    for row_label in top_rows.index:
        actor_name = top_rows.loc[row_label, 'Actor']
        if actor_name in seed_names:
            continue
        print(actor_name + ' ' + str(top_rows.loc[row_label, 0]))
def task1d_tfidf(movie_id):
    """Print the 10 actors closest to a movie in the raw tag space.

    The movie's row of the movie-tag dataframe is compared, using
    ``metrics.l2Norm``, with every actor's row of the actor-tag dataframe.
    Actors listed for the movie in ``DataHandler.movie_actor_map`` are
    skipped.

    Args:
        movie_id: Index label of the movie in the movie-tag dataframe.

    Returns:
        None. Output is printed. If the movie is not in the movie-tag
        dataframe a message is printed and the function returns early.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    actor_tag_frame = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()
    movieid_name_map = DataHandler.movieid_name_map
    actor_ids = list(actor_tag_frame.index)
    movie_ids = list(movie_tag_df.index)

    if movie_id not in movie_ids:
        # str() so an id missing from the name map is reported instead of
        # raising TypeError on str + None.
        print("Movie " + str(movieid_name_map.get(movie_id))
              + " not present in mltags data. Quitting")
        return

    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on old
    # and new versions.
    actor_vectors = actor_tag_frame.values.tolist()
    movie_in_tags = movie_tag_df.values[movie_ids.index(movie_id)].tolist()

    actors_for_movie = DataHandler.movie_actor_map.get(movie_id)
    if actors_for_movie is None:
        # No cast recorded for the movie: nothing to exclude.
        actors_for_movie = ()

    DataHandler.create_actor_actorid_map()
    actor_names = DataHandler.actor_actorid_map

    scored = []
    for actor_id, actor_in_tags in zip(actor_ids, actor_vectors):
        if actor_id in actors_for_movie:
            continue  # the movie's own actors are not reported
        score = metrics.l2Norm(movie_in_tags, actor_in_tags)
        scored.append((score, actor_names.get(actor_id)))

    # Ascending sort: the smallest l2Norm scores are printed first.
    scored.sort(key=operator.itemgetter(0))

    print("Top 10 actors similar to " + str(movieid_name_map.get(movie_id))
          + " are: ")
    for score, name in scored[:10]:
        # str() keeps the report printable when an id has no name entry.
        print(str(name) + " : " + str(score))
    return
def similarMovieActor_LDA(givenMovie):
    """Print the 10 actors closest to a movie in the actors' LDA topic space.

    An LDA model with 5 topics is built from the actor-tag dataframe via
    ``decompositions.LDADecomposition``. The movie and each candidate actor
    are represented with ``DataHandler.representDocInLDATopics`` and scored
    with ``metrics.simlarity_kullback_leibler``; the ten lowest scores are
    printed. Actors listed for the movie in ``DataHandler.movie_actor_map``
    are skipped.

    Args:
        givenMovie: Index label of the movie in the movie-tag dataframe.

    Returns:
        None. Output is printed. If the movie is not in the movie-tag
        dataframe a message is printed and the function returns early.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    actor_tag_dff = DataHandler.actor_tag_df()
    movie_tag_dff = DataHandler.load_movie_tag_df()
    movieid_name_map = DataHandler.movieid_name_map
    actor_ids = list(actor_tag_dff.index)
    movie_ids = list(movie_tag_dff.index)

    if givenMovie not in movie_ids:
        # str() so an id missing from the name map is reported instead of
        # raising TypeError on str + None.
        print("Movie " + str(movieid_name_map.get(givenMovie))
              + " not present in mltags data. Quitting")
        return

    actors_for_movie = DataHandler.movie_actor_map.get(givenMovie)
    if actors_for_movie is None:
        # No cast recorded for the movie: nothing to exclude.
        actors_for_movie = ()

    ldaModel, _doc_term_matrix, _id_term_map = decompositions.LDADecomposition(
        actor_tag_dff, 5, constants.actorTagsSpacePasses)

    # The movie's topic representation does not depend on the candidate
    # actor, so compute it once instead of once per loop iteration.
    movie_topics = DataHandler.representDocInLDATopics(
        movie_tag_dff, givenMovie, ldaModel)

    scores = {}
    for other_actor in actor_ids:
        if other_actor in actors_for_movie:
            continue  # the movie's own actors are not reported
        actor_topics = DataHandler.representDocInLDATopics(
            actor_tag_dff, other_actor, ldaModel)
        scores[other_actor] = metrics.simlarity_kullback_leibler(
            movie_topics, actor_topics)

    # Ascending by score; exactly ten rows, matching the other task1d
    # methods (the previous slice [0:11] produced eleven).
    top10 = sorted(scores.items(), key=itemgetter(1))[:10]
    for actor, score in top10:
        print(DataHandler.actor_actorid_map.get(actor), score)
    return
# -*- coding: utf-8 -*- from computations import decompositions from data import DataHandler from collections import defaultdict from operator import itemgetter from util import constants import numpy as np import operator from computations import metrics DataHandler.vectors() DataHandler.create_actor_actorid_map() def task1d(movie_id, method): if (method == "SVD"): task1dImplementation_SVD(movie_id) elif (method == "PCA"): task1d_pca(movie_id) elif (method == "LDA"): similarMovieActor_LDA(movie_id) elif (method == "TFIDF"): task1d_tfidf(movie_id) else: print("Invalid method. Please use SVD or PCA or LDA or TFIDF") return def task1dImplementation_SVD(movie_id): DataHandler.vectors() DataHandler.createDictionaries1()