def vectActMovTag():
    """Build a nested actor -> movie -> tag mapping from the database.

    Every (actor, movie, tag) combination starts at 0. Where the movie is in
    the actor's movie list and the tag is among the movie's tag ids, the
    entry holds the movie's year (taken from ``di.getAllMovieYrs``) instead.

    Returns:
        A tuple ``(vect, actors, movies, years)``. ``vect`` is a nested
        ``defaultdict`` indexed as ``vect[actorId][movieId][tagId]``; the
        other items are the rows returned by ``di.getAllActors``,
        ``di.getAllMovies`` and ``di.getAllYears``.

    Raises:
        KeyError: if a matching movie has no entry in the movie/year rows.
    """
    actors = di.getAllActors()
    tags = di.getAllTags()
    movies = di.getAllMovies()
    years = di.getAllYears()
    # Each row is read as (movie id, year).
    movYears = {arr[0]: arr[1] for arr in di.getAllMovieYrs()}

    # Movie ids per actor, stored as sets for constant-time membership tests.
    actMoviesDb = {
        act[0]: {mov[0] for mov in di.getActorMovieIds(act[0])}
        for act in actors
    }

    vect = defaultdict(lambda: defaultdict(dict))
    for mov in movies:
        movId = mov[0]
        # The first column of the first row is a comma-separated tag-id string.
        movTags = set(di.getMovieTagIds(movId)[0][0].split(","))
        for act in actors:
            actId = act[0]
            # Does not depend on the tag, so evaluate it once per pair.
            actorInMovie = movId in actMoviesDb[actId]
            for tag in tags:
                tagId = tag[0]
                # Index consistently as [actor][movie][tag]. The previous code
                # wrote the zero default under [movie][actor][tag] and the
                # year under [actor][movie][tag], mixing two layouts in one
                # dictionary.
                if actorInMovie and tagId in movTags:
                    vect[actId][movId][tagId] = movYears[movId]
                else:
                    vect[actId][movId][tagId] = 0
    return (vect, actors, movies, years)
Пример #2
0
def movieTagSpace(movieId):
    """Compare actors outside a movie with the movie's cast in tag space.

    Each actor is described by a vector of tf-idf weights (from
    ``idf.tfIdfActorTag``) restricted to the tags of ``movieId``; tags absent
    from an actor's weights count as 0. The cast vectors are summed
    component-wise, and every actor who is not in the cast is scored by the
    Euclidean distance between their vector and that sum.

    Args:
        movieId: id passed to ``di.getMovieTags`` and ``di.getMovieActorIds``.

    Returns:
        The last ten entries of ``utils.sortByValue`` applied to the
        ``{actor name: distance}`` mapping (sort direction is decided by
        ``utils.sortByValue``, which is not visible here).
    """
    movieTagRows = di.getMovieTags(movieId)
    allActorRows = di.getAllActors()
    allActorNames = di.getAllActorNames()
    idfWeights = idf.idfActorTag()
    castIds = [row[0] for row in di.getMovieActorIds(movieId)]
    tagKeys = [row[0] for row in movieTagRows]

    def tagVector(actorId):
        # One weight per movie tag, 0 where the actor has no weight for it.
        scores = idf.tfIdfActorTag(actorId, idfWeights)
        return [scores[key] if key in scores.keys() else 0 for key in tagKeys]

    castVectors = [tagVector(actorId) for actorId in castIds]

    # Vectors of actors who do not appear in the movie, keyed by their
    # position in the full actor list.
    outsiders = {}
    for idx, actorRow in enumerate(allActorRows):
        if actorRow[0] not in castIds:
            outsiders[idx] = tagVector(actorRow[0])

    # Component-wise sum over the cast; all zeros when the cast is empty.
    castTotal = [
        sum(vec[col] for vec in castVectors) for col in range(len(tagKeys))
    ]

    dist = {}
    for idx, vec in outsiders.items():
        dist[allActorNames[idx][0]] = distance.euclidean(vec, castTotal)
    return utils.sortByValue(dist)[-10:]
def getCoactorMatrix():
    """Return the actor-by-actor matrix of shared movie counts.

    Entry ``[i][j]`` is the number of distinct rows that the movie lists of
    actors ``i`` and ``j`` (as returned by ``di.getActorMovieIds``) have in
    common. Rows and columns follow the order of ``di.getAllActors()``.

    Returns:
        A square ``numpy`` array of floats with one row and one column per
        actor.
    """
    actorList = di.getAllActors()
    # Build each actor's movie set once instead of rebuilding both sets for
    # every (i, j) pair inside the double loop.
    movieSets = [set(di.getActorMovieIds(actor[0])) for actor in actorList]
    actLen = len(movieSets)
    sim = np.zeros((actLen, actLen))
    for i in range(actLen):
        # The overlap is symmetric, so compute the upper triangle only and
        # mirror it.
        for j in range(i, actLen):
            shared = len(movieSets[i] & movieSets[j])
            sim[i][j] = shared
            sim[j][i] = shared
    return sim
def getActorTagMatrix():
    """Return the actor-by-tag matrix of tf-idf weights.

    Rows follow the order of ``di.getAllActors()`` and columns the order of
    ``di.getAllTags()``. A cell holds the weight that
    ``idf.tfIdfActorTag`` reports for that actor and tag, or 0 when the
    actor has no weight for the tag.

    Returns:
        A ``numpy`` array of shape ``(number of actors, number of tags)``.
    """
    tagIds = di.getAllTags()
    actorlist = di.getAllActors()
    actorTags = np.zeros((len(actorlist), len(tagIds)))
    # The idf vector is shared by every actor, so compute it once.
    idfActVector = idf.idfActorTag()
    for row, actor in enumerate(actorlist):
        actVect = idf.tfIdfActorTag(actor[0], idfActVector)
        for col, tag in enumerate(tagIds):
            if tag[0] in actVect.keys():
                actorTags[row][col] = actVect[tag[0]]
    return actorTags
Пример #5
0
def idfActorTag():
    """Compute an inverse-document-frequency weight for every tag.

    Actors play the role of documents: for each tag the function counts the
    actors for which the tag id is contained in the first row returned by
    ``di.getActorTags`` and stores ``log(actor count / that count)``. Tags
    that match no actor keep a weight of 0.

    Returns:
        A dict mapping tag id to its idf weight.
    """
    allTags = di.getAllTags()
    allActors = di.getAllActors()
    actorCount = len(allActors)
    # Query each actor's tags once. The previous code repeated the same
    # query for every (tag, actor) pair.
    tagsPerActor = [di.getActorTags(actor[0]) for actor in allActors]

    idfActVect = {}
    for tag in allTags:
        tagId = tag[0]
        # Only the first row of each result is inspected, as before.
        tagCount = sum(1 for tags in tagsPerActor if tagId in tags[0])
        # Guard against division by zero for tags no actor carries.
        idfActVect[tagId] = math.log(actorCount / tagCount) if tagCount else 0
    return idfActVect
def formLdaMat(numSemantics):
    """Fit an LDA model on the binary movie-by-actor matrix.

    One row is built per entry of the module-level ``movies`` sequence (not
    defined inside this function) and one column per row of
    ``di.getAllActors()``. A cell is 1 when the actor row also appears among
    the rows of ``di.getMovieActorIds`` for that movie, otherwise 0.

    Args:
        numSemantics: number of topics passed to ``lda.LDA`` as ``n_topics``.

    Returns:
        The fitted model's ``doc_topic_`` attribute.
    """
    movieRows = tuple(movies)
    actorRows = tuple(di.getAllActors())
    actorCount = len(actorRows)

    incidence = []
    for movieRow in movieRows:
        flags = [0] * actorCount
        for castRow in tuple(di.getMovieActorIds(movieRow[0])):
            # Mark the first position at which this cast row occurs among
            # the actor rows; cast rows unknown to the actor list are ignored.
            if castRow in actorRows:
                flags[actorRows.index(castRow)] = 1
        incidence.append(flags)

    movAct = np.array(incidence)
    # Fixed iteration count and seed, as in the original configuration.
    model = lda.LDA(n_topics=numSemantics, n_iter=100, random_state=1)
    model.fit(movAct)
    return model.doc_topic_
def vectActMovYr():
    """Build a nested actor -> movie -> year indicator mapping.

    Only actors with at least one movie that appears in ``di.getAllMovies()``
    are included. For each such actor, every movie and every year row, the
    entry is 1 when the movie is in the actor's movie list and the movie's
    year equals that year, otherwise 0.

    Returns:
        A tuple ``(vect, actors, movies, years)``. ``vect`` is a nested
        ``defaultdict`` indexed as ``vect[actorId][movieId][year]``; the
        other items are the unfiltered rows from ``di.getAllActors``,
        ``di.getAllMovies`` and ``di.getAllYears``.
    """
    actors = di.getAllActors()
    movies = di.getAllMovies()
    years = di.getAllYears()
    # Each row is read as (movie id, year).
    movYears = {row[0]: row[1] for row in di.getAllMovieYrs()}
    moviesArr = [row[0] for row in movies]

    # Keep an actor only if at least one of their movies is a known movie.
    actMoviesDb = {}
    for actorRow in actors:
        actorId = actorRow[0]
        ownMovies = [row[0] for row in di.getActorMovieIds(actorId)]
        if any(movieId in moviesArr for movieId in ownMovies):
            actMoviesDb[actorId] = ownMovies
    print("\ngot the actor movies\n")

    vect = defaultdict(lambda: defaultdict(dict))
    for actorId, ownMovies in actMoviesDb.items():
        for movieId in moviesArr:
            for yearRow in years:
                year = yearRow[0]
                # The year lookup runs only for movies the actor appears in.
                matched = movieId in ownMovies and movYears[movieId] == year
                vect[actorId][movieId][year] = 1 if matched else 0
    return (vect, actors, movies, years)
Пример #8
0
import dbInfo as di
import tfIdfCalc as idf
import numpy as np
import utils
import similarity
from scipy.spatial import distance

# Module-level data used by the functions below; evaluated once when this
# module is loaded.
# Matrix returned by similarity.getActorTagMatrix(); indexed by row below.
actorTags = similarity.getActorTagMatrix()
# Actor rows; element 0 of each row is compared against actor ids below.
actorList = di.getAllActors()
# Actor-name rows, indexed with the same positions as actorList below --
# presumably returned in the same order; TODO confirm against dbInfo.
actorNames = di.getAllActorNames()

def simActors(actId):
    """Score every other actor by tag-space distance to the given actor.

    The row of the module-level ``actorTags`` matrix belonging to ``actId``
    is compared, using Euclidean distance, with the row of every actor whose
    id differs from ``actId``.

    Args:
        actId: actor id, compared against element 0 of each ``actorList`` row.

    Returns:
        The last ten entries of ``utils.sortByValue`` applied to the
        ``{actor name: distance}`` mapping (sort direction is decided by
        ``utils.sortByValue``, which is not visible here).

    Raises:
        ValueError: if ``actId`` matches no row of ``actorList``. Previously
            this surfaced as an ``UnboundLocalError`` on first use of the
            missing vector.
    """
    givenActor = None
    for i, tagRow in enumerate(actorTags):
        if actId == actorList[i][0]:
            # No early exit: the last matching row wins, as before.
            givenActor = tagRow
    if givenActor is None:
        raise ValueError("unknown actor id: {!r}".format(actId))

    d = {}
    for i, actor in enumerate(actorList):
        if actId != actor[0]:
            d[actorNames[i][0]] = distance.euclidean(givenActor, actorTags[i])
    return utils.sortByValue(d)[-10:]

def simActors2(actId):
	"""Project actors onto the first five columns of the SVD factor ``u``.

	Decomposes the module-level ``actorTags`` matrix with ``np.linalg.svd``
	and copies the first ``numSemantics`` columns of ``u`` into ``x``; the
	row belonging to ``actId`` is also copied into ``givenActor``.

	NOTE(review): the visible code ends without returning or using ``x`` and
	``givenActor``, and ``s`` and ``v`` are unused -- the rest of this
	function appears to be missing from this snippet; confirm against the
	original file.

	Args:
		actId: actor id, compared against element 0 of each ``actorList`` row.
	"""
	# Number of leading columns of u that are kept.
	numSemantics = 5
	# The second positional argument (0) is numpy's full_matrices flag.
	u,s,v = np.linalg.svd(actorTags,0)
	x=np.zeros((len(u),numSemantics))
	givenActor = np.zeros(numSemantics)
	for i in range(len(u)):
		for j in range(numSemantics):
			# Remember the reduced coordinates of the requested actor.
			if(actId == actorList[i][0]):
				givenActor[j] = u[i][j]
			x[i][j] = u[i][j]
Пример #9
0
    di.delRows("movie_actor", "movie_id", mov)
    di.delRows("movie_info", "movie_id", mov)

# Collect the ids of all users whose id, converted to int, is at most 71550.
allUsers = di.getAllUsers()
delUsers = []
for usr in allUsers:
    # Element 0 of each row is the user id; it is stored unconverted.
    if (int(usr[0]) <= 71550):
        delUsers.append(usr[0])
# Report how many user ids were collected.
print("delUsers", len(delUsers))
for usr in delUsers:
    di.delRows("mlratings", "user_id", usr)
    di.delRows("mltags", "user_id", usr)
    di.delRows("mlusers", "user_id", usr)
    print("usr ="******"actor = ", act[0])
        di.delRows("imdb_actor_info", "actor_id", act[0])
for tag in allTags:
    if (tag[0] not in mlTg):
        print("tag =", tag[0])
Пример #10
0
import dbInfo as db
import numpy as np
import svd
import utils
import similarity
#np.set_printoptions(threshold=np.nan)

# Script: build an actor-actor similarity matrix from the actor/tag matrix,
# print three latent semantics computed from it, then print three actor groups.
actorTags = similarity.getActorTagMatrix()
# Actor-actor matrix: the actor/tag matrix multiplied by its own transpose.
mat = np.matmul(actorTags, np.transpose(actorTags))
# svd.svdCalc is called with the matrix and 3; its result is iterated below
# as one item per latent semantic.
svdSem = svd.svdCalc(mat, 3)
allActors = db.getAllActors()
actorNames = db.getAllActorNames()
print("\n\nActor-Actor similarity matrix:\n\n", mat, "\n\nsize of matrix :",
      mat.shape)
print("\n\nTop 3 Latent Semantics:\n")
for sem in svdSem:
    # Each semantic is passed to utils.rankSem together with the actor rows.
    print("\n\n", utils.rankSem(sem, allActors))
# Grouping works on the transposed semantics and the actor-name rows.
groups = utils.form_groups_semantics(np.transpose(svdSem), actorNames, 3)
print("\n\n3 Non overlapping groups:")
for grp in groups.keys():
    print("\n\n", grp, ":", groups[grp])
Пример #11
0
import numpy as np
import utils
import persPageRank as ppr
import dbInfo as db
import similarity
#np.set_printoptions(threshold=np.nan)

# Script: rank actors related to user-supplied seed actors with personalised
# PageRank over the actor-actor similarity matrix.
actTags = similarity.getActorTagMatrix()
# Actor-actor matrix: the actor/tag matrix multiplied by its own transpose.
sim = np.matmul(actTags, np.transpose(actTags))
print("\n\nActor-Actor similarity matrix:\n", sim, "\n\nsize of matrix :",
      sim.shape)
# Seeds are entered as a comma-separated list. Strip surrounding whitespace so
# that input such as "1, 2" still matches the second id; previously " 2" was
# compared verbatim and silently dropped.
seeds = [seed.strip() for seed in input("\nGive Seed Actors: ").split(",")]
actorNames = db.getAllActorNames()
actorIds = db.getAllActors()
# Names of the seed actors, looked up by position in the actor list.
seedNames = []
for i, actorRow in enumerate(actorIds):
    if actorRow[0] in seeds:
        seedNames.append(actorNames[i][0])
seedMat = ppr.formSeed(seeds, actorIds)
# 0.85 is the third argument to ppr.personalizedPageRank.
pprOut = ppr.personalizedPageRank(sim, seedMat, 0.85)
print("\n\n10 most related actors:\n")
for act in ppr.rankedList(pprOut, actorNames, seedNames, 10):
    print(act)