def vectActMovTag():
    """Build an actor x movie x tag tensor as nested dictionaries.

    Every cell ``vect[actorId][movieId][tagId]`` is initialised to 0.  A cell
    is overwritten with the movie's year (taken from ``di.getAllMovieYrs``)
    when the movie id is among the ids returned by ``di.getActorMovieIds``
    for that actor and the tag id is among the comma-separated ids returned
    by ``di.getMovieTagIds`` for that movie.

    The zero fill and the conditional assignment both use the
    actor -> movie -> tag index order, so every cell that is read back has
    been initialised.

    Returns:
        tuple: ``(vect, actors, movies, years)`` where ``vect`` is the nested
        ``defaultdict`` and the other items are the unmodified results of
        ``di.getAllActors()``, ``di.getAllMovies()`` and ``di.getAllYears()``.

    Raises:
        KeyError: if a matching movie has no entry in ``di.getAllMovieYrs()``.
    """
    actors = di.getAllActors()
    tags = di.getAllTags()
    movies = di.getAllMovies()
    years = di.getAllYears()

    # movie id -> year
    movYears = {row[0]: row[1] for row in di.getAllMovieYrs()}

    # actor id -> set of movie ids; a set keeps the membership test in the
    # innermost loop cheap.
    actMoviesDb = {
        act[0]: {row[0] for row in di.getActorMovieIds(act[0])}
        for act in actors
    }

    vect = defaultdict(lambda: defaultdict(dict))
    for mov in movies:
        movId = mov[0]
        # The tag ids of a movie arrive as one comma-separated string.
        movTags = set(di.getMovieTagIds(movId)[0][0].split(","))
        for act in actors:
            actId = act[0]
            inMovie = movId in actMoviesDb[actId]
            for tag in tags:
                tagId = tag[0]
                if inMovie and tagId in movTags:
                    # The cell carries the movie's year, not a 0/1 flag.
                    vect[actId][movId][tagId] = movYears[movId]
                else:
                    vect[actId][movId][tagId] = 0
    return (vect, actors, movies, years)
def movieTagSpace(movieId):
    """Rank actors outside a movie's cast against the cast's tag vector.

    The tag space is made of the tags returned by ``di.getMovieTags`` for
    ``movieId``.  Each actor is described by the TF-IDF weights that
    ``idf.tfIdfActorTag`` reports for those tags (0 where a tag is absent).
    The cast's vectors are summed component-wise, and every actor that is not
    in the cast is scored by its Euclidean distance to that sum.

    Args:
        movieId: id of the movie whose cast defines the reference vector.

    Returns:
        The last ten entries of ``utils.sortByValue`` applied to the
        ``{actor name: distance}`` mapping.

    NOTE(review): whether the last ten entries are the nearest or the
    farthest actors depends on the sort order of ``utils.sortByValue`` --
    confirm against that helper.
    """
    tagIds = di.getMovieTags(movieId)
    tagLen = len(tagIds)
    actorlist = di.getAllActors()
    actorNames = di.getAllActorNames()
    idfActVector = idf.idfActorTag()
    movieActors = [row[0] for row in di.getMovieActorIds(movieId)]

    def tagWeights(actorId):
        # One TF-IDF lookup per actor, projected onto the movie's tags.
        weights = idf.tfIdfActorTag(actorId, idfActVector)
        return [weights[tag[0]] if tag[0] in weights else 0 for tag in tagIds]

    # Vectors of the cast first, then of everybody else (same order as the
    # actor list), so the helper is called in the original sequence.
    castRows = [tagWeights(actorId) for actorId in movieActors]
    outsiders = [
        (pos, tagWeights(actor[0]))
        for pos, actor in enumerate(actorlist)
        if actor[0] not in movieActors
    ]

    # Component-wise sum of the cast vectors; all zeros for an empty cast.
    actVect = [sum(row[col] for row in castRows) for col in range(tagLen)]

    dist = {}
    for pos, row in outsiders:
        dist[actorNames[pos][0]] = distance.euclidean(row, actVect)
    return utils.sortByValue(dist)[-10:]
def getCoactorMatrix():
    """Return the actor-by-actor matrix of shared movie counts.

    Entry ``[i][j]`` is the number of rows that ``di.getActorMovieIds``
    returns for both actor ``i`` and actor ``j`` (actors in the order of
    ``di.getAllActors()``).  The diagonal therefore holds each actor's own
    number of distinct movie rows.

    Returns:
        numpy.ndarray: square float matrix with one row/column per actor.
    """
    actorList = di.getAllActors()
    actLen = len(actorList)

    # Build each actor's movie set once instead of on every pair comparison.
    movieSets = [set(di.getActorMovieIds(actor[0])) for actor in actorList]

    sim = np.zeros((actLen, actLen))
    for i in range(actLen):
        # The intersection size is symmetric, so only the upper triangle is
        # computed and then mirrored.
        for j in range(i, actLen):
            shared = len(movieSets[i] & movieSets[j])
            sim[i][j] = shared
            sim[j][i] = shared
    return sim
def getActorTagMatrix():
    """Return the actor-by-tag matrix of TF-IDF weights.

    Rows follow the order of ``di.getAllActors()`` and columns the order of
    ``di.getAllTags()``.  A cell holds the weight that
    ``idf.tfIdfActorTag`` reports for that actor and tag, or 0 when the tag
    is missing from the actor's vector.

    The actor-name lookup that used to be issued here was never read, so it
    is no longer performed.

    Returns:
        numpy.ndarray: float matrix of shape ``(number of actors, number of tags)``.
    """
    tagIds = di.getAllTags()
    actorlist = di.getAllActors()
    actorTags = np.zeros((len(actorlist), len(tagIds)))

    # The IDF vector is shared by every actor, so it is computed once.
    idfActVector = idf.idfActorTag()
    for row, actor in enumerate(actorlist):
        actVect = idf.tfIdfActorTag(actor[0], idfActVector)
        for col, tag in enumerate(tagIds):
            if tag[0] in actVect:
                actorTags[row][col] = actVect[tag[0]]
    return actorTags
def idfActorTag():
    """Compute the inverse document frequency of every tag over all actors.

    For each tag id the number of actors whose first ``di.getActorTags`` row
    contains that id is counted; the IDF is ``log(actorCount / tagCount)``,
    or 0 when no actor carries the tag.

    Each actor's tags are fetched once up front rather than once per
    (tag, actor) pair.

    Returns:
        dict: tag id -> IDF value.

    NOTE(review): only the first row returned by ``di.getActorTags`` is
    inspected (``rows[0]``); confirm that this row really holds all of an
    actor's tag ids.
    """
    allTags = di.getAllTags()
    allActors = di.getAllActors()
    actorCount = len(allActors)

    # One query per actor; the results do not depend on the tag being scored.
    actorTagRows = [di.getActorTags(actor[0]) for actor in allActors]

    idfActVect = {}
    for tag in allTags:
        tagId = tag[0]
        tagCount = sum(1 for rows in actorTagRows if tagId in rows[0])
        # A tag that no actor carries keeps an IDF of 0 (avoids dividing by 0).
        idfActVect[tagId] = math.log(actorCount / tagCount) if tagCount else 0
    return idfActVect
def formLdaMat(numSemantics):
    """Fit an LDA model on the movie-by-actor incidence matrix.

    One row is built per movie and one column per actor row returned by
    ``di.getAllActors()``; a cell is 1 when the actor row is among the rows
    returned by ``di.getMovieActorIds`` for that movie, else 0.

    Args:
        numSemantics: number of topics passed to ``lda.LDA`` as ``n_topics``.

    Returns:
        The fitted model's ``doc_topic_`` attribute.

    NOTE(review): ``movies`` is not defined inside this function; it is
    presumably a module-level name -- confirm it is set before the call.
    """
    movies1 = tuple(movies)
    actors1 = tuple(di.getAllActors())
    actLen = len(actors1)

    # Actor row -> first column index, replacing a linear scan plus
    # ``tuple.index`` for every (movie, actor, cast member) triple.
    # Assumes the rows are hashable (e.g. DB-API tuples) -- TODO confirm.
    firstIndex = {}
    for pos, actor in enumerate(actors1):
        firstIndex.setdefault(actor, pos)

    movAct = []
    for movie in movies1:
        arr = [0] * actLen
        for act in di.getMovieActorIds(movie[0]):
            pos = firstIndex.get(act)
            if pos is not None:
                arr[pos] = 1
        movAct.append(arr)
    movAct = np.array(movAct)

    model = lda.LDA(n_topics=numSemantics, n_iter=100, random_state=1)
    model.fit(movAct)
    return model.doc_topic_
def vectActMovYr():
    """Build an actor x movie x year indicator tensor as nested dictionaries.

    Only actors with at least one movie id (from ``di.getActorMovieIds``)
    that also occurs in ``di.getAllMovies()`` are included.  For those,
    ``vect[actorId][movieId][year]`` is 1 when the movie id belongs to the
    actor and the movie's year (from ``di.getAllMovieYrs``) equals ``year``;
    every other cell is 0.

    Returns:
        tuple: ``(vect, actors, movies, years)`` where ``vect`` is the nested
        ``defaultdict`` and the other items are the unmodified results of
        ``di.getAllActors()``, ``di.getAllMovies()`` and ``di.getAllYears()``.

    Raises:
        KeyError: if a movie of an included actor has no entry in
            ``di.getAllMovieYrs()``.
    """
    actors = di.getAllActors()
    movies = di.getAllMovies()
    years = di.getAllYears()

    # movie id -> year
    movYears = {row[0]: row[1] for row in di.getAllMovieYrs()}

    moviesArr = [mov[0] for mov in movies]
    knownMovies = set(moviesArr)

    # actor id -> set of movie ids, kept only when it overlaps the known
    # movies.  Sets replace the list scans that ran inside nested loops.
    actMoviesDb = {}
    for act in actors:
        actMov = {row[0] for row in di.getActorMovieIds(act[0])}
        if not actMov.isdisjoint(knownMovies):
            actMoviesDb[act[0]] = actMov
    print("\ngot the actor movies\n")

    vect = defaultdict(lambda: defaultdict(dict))
    for act, actMov in actMoviesDb.items():
        for mov in moviesArr:
            inMovie = mov in actMov
            for yr in years:
                # The year lookup is only attempted for the actor's own
                # movies, so movies without a year entry elsewhere are fine.
                hit = inMovie and movYears[mov] == yr[0]
                vect[act][mov][yr[0]] = 1 if hit else 0
    return (vect, actors, movies, years)
import dbInfo as di
import tfIdfCalc as idf
import numpy as np
import utils
import similarity
from scipy.spatial import distance

# Module-level data shared by the functions below.  Row i of ``actorTags``,
# ``actorList[i]`` and ``actorNames[i]`` are used together as one actor.
actorTags = similarity.getActorTagMatrix()
actorList = di.getAllActors()
actorNames = di.getAllActorNames()


def simActors(actId):
    """Rank all other actors by Euclidean distance to ``actId`` in tag space.

    Args:
        actId: actor id, compared against ``actorList[i][0]``.

    Returns:
        The last ten entries of ``utils.sortByValue`` applied to the
        ``{actor name: distance}`` mapping.

    Raises:
        ValueError: if ``actId`` does not occur in ``actorList`` (previously
            this surfaced as an ``UnboundLocalError``).

    NOTE(review): whether the last ten entries are the nearest or the
    farthest actors depends on the sort order of ``utils.sortByValue`` --
    confirm against that helper.
    """
    givenActor = None
    for row, actor in zip(actorTags, actorList):
        if actId == actor[0]:
            givenActor = row
    if givenActor is None:
        raise ValueError("unknown actor id: {!r}".format(actId))

    d = {}
    for i, actor in enumerate(actorList):
        if actId != actor[0]:
            d[actorNames[i][0]] = distance.euclidean(givenActor, actorTags[i])
    return utils.sortByValue(d)[-10:]


def simActors2(actId):
    """Project the actors onto the first five left singular vectors.

    Fills ``x`` with the first ``numSemantics`` columns of ``u`` from the
    reduced SVD of ``actorTags`` and copies the row belonging to ``actId``
    into ``givenActor``.

    NOTE(review): as visible here the function ends after filling ``x`` and
    ``givenActor`` and has no return statement; the remainder may be missing
    from this view.
    """
    numSemantics = 5
    u, s, v = np.linalg.svd(actorTags, 0)
    x = np.zeros((len(u), numSemantics))
    givenActor = np.zeros(numSemantics)
    for i in range(len(u)):
        for j in range(numSemantics):
            if actId == actorList[i][0]:
                givenActor[j] = u[i][j]
            x[i][j] = u[i][j]
# Data-cleanup script fragment (shown collapsed onto one line in this view).
# Visible steps: delete the movie_actor / movie_info rows for `mov`; collect
# the users whose integer id is <= 71550 and delete their rows from
# mlratings, mltags and mlusers; delete imdb_actor_info rows per actor; and
# print every tag id that is not in `mlTg`.
# NOTE(review): `mov`, `act`, `allTags` and `mlTg` are defined outside the
# visible text, and the span between print("usr =" and "actor = " is masked
# (`******`) here -- recover the original statements before editing this code.
di.delRows("movie_actor", "movie_id", mov) di.delRows("movie_info", "movie_id", mov) allUsers = di.getAllUsers() delUsers = [] for usr in allUsers: if (int(usr[0]) <= 71550): delUsers.append(usr[0]) print("delUsers", len(delUsers)) for usr in delUsers: di.delRows("mlratings", "user_id", usr) di.delRows("mltags", "user_id", usr) di.delRows("mlusers", "user_id", usr) print("usr ="******"actor = ", act[0]) di.delRows("imdb_actor_info", "actor_id", act[0]) for tag in allTags: if (tag[0] not in mlTg): print("tag =", tag[0])
import dbInfo as db
import numpy as np
import svd
import utils
import similarity

# Actor-actor similarity: product of the actor-tag matrix with its transpose.
actorTags = similarity.getActorTagMatrix()
mat = actorTags @ np.transpose(actorTags)

# Top three latent semantics of the similarity matrix.
svdSem = svd.svdCalc(mat, 3)

allActors = db.getAllActors()
actorNames = db.getAllActorNames()

print(
    "\n\nActor-Actor similarity matrix:\n\n",
    mat,
    "\n\nsize of matrix :",
    mat.shape,
)

print("\n\nTop 3 Latent Semantics:\n")
for sem in svdSem:
    ranked = utils.rankSem(sem, allActors)
    print("\n\n", ranked)

# Partition the actors into three groups based on the latent semantics.
groups = utils.form_groups_semantics(np.transpose(svdSem), actorNames, 3)
print("\n\n3 Non overlapping groups:")
for grp, members in groups.items():
    print("\n\n", grp, ":", members)
import numpy as np
import utils
import persPageRank as ppr
import dbInfo as db
import similarity

# Actor-actor similarity: product of the actor-tag matrix with its transpose.
actTags = similarity.getActorTagMatrix()
sim = np.matmul(actTags, np.transpose(actTags))
print("\n\nActor-Actor similarity matrix:\n", sim, "\n\nsize of matrix :", sim.shape)

# Seed actor ids are entered comma-separated.  Surrounding whitespace is
# stripped so that "1, 2" is read as the ids "1" and "2"; previously the
# second id would have been " 2" and could never match an actor.
seeds = [seed.strip() for seed in input("\nGive Seed Actors: ").split(",")]

actorNames = db.getAllActorNames()
actorIds = db.getAllActors()

# Names of the seed actors, in actor-list order; actorNames[i] is used as the
# name belonging to actorIds[i].
seedNames = [
    actorNames[i][0]
    for i, actor in enumerate(actorIds)
    if actor[0] in seeds
]

seedMat = ppr.formSeed(seeds, actorIds)
pprOut = ppr.personalizedPageRank(sim, seedMat, 0.85)

print("\n\n10 most related actors:\n")
for act in ppr.rankedList(pprOut, actorNames, seedNames, 10):
    print(act)