示例#1
0
def loadDataset(filename, trainingSet=None, testSet=None):
    """Build tf-idf training and test rows from a labelled-movie CSV file.

    Each CSV record is read as ``[movie_id, label, ...]``.  Every record
    becomes one training row: the movie's tf-idf tag vector followed by its
    label in the last column.  Every movie returned by ``di.getAllMovies()``
    whose id is *not* in the CSV becomes one test row: its tf-idf tag vector
    followed by a randomly chosen placeholder label ('0' or '1').

    Args:
        filename: path of the CSV file to read.
        trainingSet: optional list to append training rows to.  A new list
            is created when omitted.
        testSet: optional list to append test rows to.  A new list is
            created when omitted.

    Returns:
        Tuple ``(test, trainingSet, testSet)`` where ``test`` is the list of
        movie ids that went into ``testSet`` (same order).
    """
    # The defaults used to be shared ``[]`` objects, so rows leaked from one
    # call into the next; create fresh lists per call instead.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # newline='' is the csv module's documented way to open input files.
    with open(filename, 'r', newline='') as csvfile:
        dataset = list(csv.reader(csvfile))

    movies = di.getAllMovies()
    allTagLen = len(di.getAllTags())
    idfMovArr = idf.idfMovieTag()

    # Rows are built one at a time rather than in a table pre-sized from
    # len(movies), which overflowed when the CSV had more rows than movies.
    for record in dataset:
        row = [''] * (allTagLen + 1)
        for j, weight in enumerate(idf.tfIdfMovieTag(record[0], idfMovArr)):
            row[j] = weight
        row[allTagLen] = record[1]
        trainingSet.append(row)

    # Set membership keeps the "already labelled?" check O(1) per movie.
    train = {int(record[0]) for record in dataset}

    labels = ['0', '1']
    test = []
    for mov in movies:
        if int(mov[0]) in train:
            continue
        test.append(mov[0])
        row = [''] * (allTagLen + 1)
        for j, weight in enumerate(idf.tfIdfMovieTag(mov[0], idfMovArr)):
            row[j] = weight
        # The true label is unknown here; a random placeholder is stored.
        row[allTagLen] = random.choice(labels)
        testSet.append(row)
    return test, trainingSet, testSet
def vectActMovTag():
    """Build the nested actor -> movie -> tag mapping.

    For every (actor, movie, tag) combination the entry is the movie's year
    value (second column of ``di.getAllMovieYrs()``) when the actor is listed
    for the movie and the tag is among the movie's tag ids, otherwise 0.

    Returns:
        Tuple ``(vect, actors, movies, years)`` where ``vect`` is indexed as
        ``vect[actor_id][movie_id][tag_id]`` and the remaining items are the
        raw rows returned by the corresponding ``di`` getters.

    Raises:
        KeyError: if a matching movie has no entry in ``di.getAllMovieYrs()``.
    """
    actors = di.getAllActors()
    tags = di.getAllTags()
    movies = di.getAllMovies()
    years = di.getAllYears()
    movYears = {arr[0]: arr[1] for arr in di.getAllMovieYrs()}

    # Movie ids per actor, fetched once and stored as sets for fast lookup.
    actMoviesDb = {
        act[0]: {mov[0] for mov in di.getActorMovieIds(act[0])}
        for act in actors
    }

    vect = defaultdict(lambda: defaultdict(dict))
    for mov in movies:
        movId = mov[0]
        movTags = set(di.getMovieTagIds(movId)[0][0].split(","))
        for act in actors:
            actId = act[0]
            actedIn = movId in actMoviesDb[actId]
            for tag in tags:
                tagId = tag[0]
                # The zero default used to be written to vect[mov][act][tag]
                # while hits went to vect[act][mov][tag]; both now use the
                # actor -> movie -> tag order implied by the function name.
                if actedIn and tagId in movTags:
                    vect[actId][movId][tagId] = movYears[movId]
                else:
                    vect[actId][movId][tagId] = 0
    return (vect, actors, movies, years)
def getActorTagMatrix():
    """Return the actor-by-tag tf-idf matrix as a NumPy array.

    Rows follow the order of ``di.getAllActors()`` and columns the order of
    ``di.getAllTags()``.  A cell holds the actor's tf-idf weight for the tag
    when ``idf.tfIdfActorTag`` reports one, and 0.0 otherwise.
    """
    allTagRows = di.getAllTags()
    numTags = len(allTagRows)
    # Result is not used below; the call is kept so behaviour is unchanged.
    actorNames = di.getAllActorNames()
    allActorRows = di.getAllActors()
    actorTags = np.zeros((len(allActorRows), numTags))
    idfActVector = idf.idfActorTag()
    for rowIdx, actorRow in enumerate(allActorRows):
        weights = idf.tfIdfActorTag(actorRow[0], idfActVector)
        for colIdx in range(numTags):
            tagId = allTagRows[colIdx][0]
            if tagId in weights.keys():
                actorTags[rowIdx][colIdx] = weights[tagId]
    return actorTags
示例#4
0
def idfActorTag():
    """Compute the inverse document frequency of every tag over all actors.

    Returns:
        Dict mapping each tag id (first column of ``di.getAllTags()``) to
        ``log(actor_count / actors_having_tag)``, or 0 when no actor has it.
    """
    idfActVect = {}
    allTags = di.getAllTags()
    allActors = di.getAllActors()
    actorCount = len(allActors)
    if not allTags:
        return idfActVect

    # Look each actor's tags up once.  The lookup used to sit inside the tag
    # loop, issuing len(allTags) * len(allActors) identical queries.
    # Assumes di.getActorTags is a pure read -- TODO confirm.
    tagsPerActor = [di.getActorTags(actor[0])[0] for actor in allActors]

    for tag in allTags:
        tagId = tag[0]
        tagCount = sum(1 for actorTags in tagsPerActor if tagId in actorTags)
        idfActVect[tagId] = math.log(actorCount / tagCount) if tagCount else 0
    return idfActVect
示例#5
0
def idfUserTag():
    """Compute the inverse document frequency of every tag over all users.

    Returns:
        Dict mapping each tag id (first column of ``di.getAllTags()``) to
        ``log(user_count / users_having_tag)``, or 0 when no user has it.
    """
    idfUserVect = {}
    allTags = di.getAllTags()
    allUsers = di.getAllUsers()
    userCount = len(allUsers)
    if not allTags:
        return idfUserVect

    # Look each user's tags up once.  The lookup used to sit inside the tag
    # loop, issuing len(allTags) * len(allUsers) identical queries.
    # Assumes di.getUserTags is a pure read -- TODO confirm.
    tagsPerUser = [di.getUserTags(user[0])[0] for user in allUsers]

    for tag in allTags:
        tagId = tag[0]
        tagCount = sum(1 for userTags in tagsPerUser if tagId in userTags)
        idfUserVect[tagId] = math.log(userCount / tagCount) if tagCount else 0
    return idfUserVect
示例#6
0
def idfGenreTag():
    """Compute the inverse document frequency of every tag over all genres.

    Returns:
        Dict mapping each tag id (first column of ``di.getAllTags()``) to
        ``log(genre_count / genres_having_tag)``, or 0 when no genre has it.
    """
    idfGenVect = {}
    allTags = di.getAllTags()
    allGenres = di.getAllGenres()
    genreCount = len(allGenres)
    if not allTags:
        return idfGenVect

    # Look each genre's tags up once.  The lookup used to sit inside the tag
    # loop, issuing len(allTags) * len(allGenres) identical queries.
    # Assumes di.getGenreTags is a pure read -- TODO confirm.
    tagsPerGenre = [di.getGenreTags(genre[0])[0] for genre in allGenres]

    for tag in allTags:
        tagId = tag[0]
        tagCount = sum(1 for genreTags in tagsPerGenre if tagId in genreTags)
        idfGenVect[tagId] = math.log(genreCount / tagCount) if tagCount else 0
    return idfGenVect
def loadDataset(filename, trainingSet=None, testSet=None):
    """Build tf-idf feature rows and label lists from a labelled-movie CSV.

    Each CSV record is read as ``[movie_id, label, ...]``.  Every record
    yields one training row (the movie's tf-idf tag vector) and one entry in
    ``labels``.  Every movie from ``di.getAllMovies()`` whose id is not in
    the CSV yields one test row and a randomly chosen placeholder label
    ('0' or '1') in ``target``.

    Args:
        filename: path of the CSV file to read.
        trainingSet: optional list to append training rows to.  A new list
            is created when omitted.
        testSet: optional list to append test rows to.  A new list is
            created when omitted.

    Returns:
        Tuple ``(trainingSet, testSet, labels, target, test)``.  ``target``
        has one slot per movie; slots beyond the number of test rows stay
        ``''``.  ``test`` lists the movie ids of the test rows in order.
    """
    # The defaults used to be shared ``[]`` objects, so rows leaked from one
    # call into the next; create fresh lists per call instead.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # newline='' is the csv module's documented way to open input files.
    with open(filename, 'r', newline='') as csvfile:
        dataset = list(csv.reader(csvfile))

    movies = di.getAllMovies()
    allTagLen = len(di.getAllTags())
    idfMovArr = idf.idfMovieTag()

    labels = []
    for record in dataset:
        row = [''] * allTagLen
        for j, weight in enumerate(idf.tfIdfMovieTag(record[0], idfMovArr)):
            row[j] = weight
        labels.append(record[1])
        trainingSet.append(row)

    # Set membership keeps the "already labelled?" check O(1) per movie.
    train = {int(record[0]) for record in dataset}

    target = [''] * len(movies)
    test = []
    label = ['0', '1']
    for mov in movies:
        if int(mov[0]) in train:
            continue
        row = [''] * allTagLen
        for j, weight in enumerate(idf.tfIdfMovieTag(mov[0], idfMovArr)):
            row[j] = weight
        # The true label is unknown here; a random placeholder is stored at
        # the same position the row takes among the test rows.
        target[len(test)] = random.choice(label)
        test.append(mov[0])
        testSet.append(row)
    return trainingSet, testSet, labels, target, test
def getGenreMovieTags(movie):
    """Return the time-weighted term-frequency vector of one movie.

    The result has one entry per row of ``di.getAllTags()``, in that order.
    For a tag id listed by ``movie.getUnqTags()`` the entry is the sum of
    ``getTimeWeight()`` over the movie's tags with that id, divided by the
    total number of tags on the movie; every other entry is 0.
    """
    allTagRows = di.getAllTags()
    distinctIds = movie.getUnqTags()
    tagObjs = movie.getTags()
    tagTotal = len(tagObjs)

    weights = {}
    for wantedId in distinctIds:
        weightSum = sum(
            t.getTimeWeight() for t in tagObjs if t.getId() == wantedId)
        weights[wantedId] = weightSum / tagTotal

    return [weights.get(tagRow[0], 0) for tagRow in allTagRows]
示例#9
0
def idfMovieTag():
    """Compute the inverse document frequency of every tag over all movies.

    Returns:
        NumPy array with one entry per row of ``di.getAllTags()`` (same
        order): ``log(movie_count / movies_having_tag)``, or 0 when no movie
        carries the tag.
    """
    tagRows = di.getAllTags()
    movieRows = di.getAllMovies()
    totalMovies = len(movieRows)
    idfValues = np.zeros(len(tagRows))

    # Tag-id lists per movie, parsed from the comma-separated string held in
    # the first cell returned by getMovieTagIds.
    tagsByMovie = [
        di.getMovieTagIds(movieRow[0])[0][0].split(",")
        for movieRow in movieRows
    ]

    for idx, tagRow in enumerate(tagRows):
        hits = sum(1 for movieTags in tagsByMovie if tagRow[0] in movieTags)
        idfValues[idx] = math.log(totalMovies / hits) if hits != 0 else 0
    return idfValues
def vectTagMovRat():
    """Build the nested tag -> movie -> rating indicator mapping.

    ``vect[tag_id][movie_id][rating]`` is 1 when the tag is among the movie's
    tag ids and the rating is at most the movie's value from
    ``di.getAllMovieRtngs()``; otherwise it is 0.

    Returns:
        Tuple ``(vect, tags, movies, ratings)``; the last three are the raw
        rows returned by the corresponding ``di`` getters.
    """
    tags = di.getAllTags()
    movies = di.getAllMovies()
    ratings = di.getAllRatings()
    avgRatings = {row[0]: row[1] for row in di.getAllMovieRtngs()}

    vect = defaultdict(lambda: defaultdict(dict))
    for movieRow in movies:
        movieId = movieRow[0]
        tagsOfMovie = di.getMovieTagIds(movieId)[0][0].split(",")
        for tagRow in tags:
            tagId = tagRow[0]
            for ratingRow in ratings:
                level = ratingRow[0]
                # The average is only looked up when the tag matches.
                hit = tagId in tagsOfMovie and level <= avgRatings[movieId]
                vect[tagId][movieId][level] = 1 if hit else 0
    return (vect, tags, movies, ratings)
# Build a movie-by-tag term-frequency matrix, a movie-movie similarity matrix
# from it, and the names of the seed movies for one user.
movies = db.getAllMovies()
movieNames = db.getAllMovieNames()

# tfmovies[movie_id] maps tag id -> fraction of the movie's tag rows with it.
tfmovies = {}
for movieId in movies:
    Taglist = db.getMovieTags(movieId[0])
    UnqTags = db.getMovieTagIds(movieId[0])[0][0].split(",")
    tfvect = {}
    totalTags = len(Taglist)
    # A movie without tag rows keeps an empty vector instead of dividing by 0.
    if totalTags:
        for tag in UnqTags:
            tffact = sum(1 for t in Taglist if t[0] == tag)
            # Key by the whole tag id.  ``tag`` is a string, so the previous
            # ``tag[0]`` kept only its first character and merged distinct
            # ids such as "12" and "17".
            tfvect[tag] = tffact / totalTags
    tfmovies[movieId[0]] = tfvect

tagids = db.getAllTags()
movietf = np.zeros((len(tfmovies), len(tagids)))
for i in range(len(tfmovies)):
    movieVect = tfmovies[movies[i][0]]
    for j, tagRow in enumerate(tagids):
        if tagRow[0] in movieVect:
            movietf[i][j] = movieVect[tagRow[0]]

# Movie-movie similarity: dot products of the term-frequency rows.
matrix = np.matmul(movietf, np.transpose(movietf))

# NOTE(review): ``userId`` is not defined in the visible code -- it must be
# set before this point; confirm against the full script.
seedList = db.getUserMoviesRates(userId)
seeds = [seed[0] for seed in seedList]
seedIds = set(seeds)
seedNames = []
for i, movieRow in enumerate(movies):
    if movieRow[0] in seedIds:
        seedNames.append(movieNames[i][0])
示例#12
0
    di.delRows("mltags", "movie_id", mov)
    di.delRows("movie_actor", "movie_id", mov)
    di.delRows("movie_info", "movie_id", mov)

# Collect the ids of every user whose numeric id is at most 71550.
# NOTE(review): 71550 is a hard-coded cutoff whose origin is not shown here
# -- confirm before reuse.
allUsers = di.getAllUsers()
delUsers = []
for usr in allUsers:
    if (int(usr[0]) <= 71550):
        delUsers.append(usr[0])
print("delUsers", len(delUsers))
# For each collected id, call di.delRows on the mlratings, mltags and mlusers
# tables, matching on user_id (presumably deleting those rows -- verify
# against dbInfo).
for usr in delUsers:
    di.delRows("mlratings", "user_id", usr)
    di.delRows("mltags", "user_id", usr)
    di.delRows("mlusers", "user_id", usr)
    print("usr ="******"actor = ", act[0])
        di.delRows("imdb_actor_info", "actor_id", act[0])
for tag in allTags:
    if (tag[0] not in mlTg):
示例#13
0
import dbInfo as db
import numpy as np
import utils
import tfCalc as tf
import warnings
warnings.filterwarnings("ignore")

# All tag rows, fetched once when the module is imported.
allTags = db.getAllTags()
# Number of tag rows returned above.
lenTags = len(allTags)


def genSVDMatrix(genrelist):
    """Generate the matrix used as input to SVD.

    Builds a genre object from ``genrelist`` via ``tf.createGenObj`` and
    returns one row per movie it reports, each row being the result of
    ``utils.getGenreMovieTags`` for that movie.
    """
    genreMovies = tf.createGenObj(genrelist).getMovies()
    return [utils.getGenreMovieTags(movie) for movie in genreMovies]


def svdCalc(mat, numSem):
    """Return the first ``numSem`` rows of the third SVD factor of ``mat``.

    Args:
        mat: 2-D array-like passed to ``np.linalg.svd`` with
            ``full_matrices=False``.
        numSem: number of leading rows to keep.

    Returns:
        A new float array of shape ``(numSem, number_of_columns)``.

    Raises:
        IndexError: if ``numSem`` exceeds the number of rows available.
    """
    _, _, rightVecs = np.linalg.svd(mat, full_matrices=False)
    width = len(rightVecs[0])
    sem = np.zeros((numSem, width))
    for rowIdx in range(numSem):
        # Copy one whole row at a time instead of element by element.
        sem[rowIdx, :] = rightVecs[rowIdx]
    return sem
import dbInfo as di
import utils
import lda
import sys
from operator import itemgetter
import tensorDecomp as td
import persPageRank as ppr
import tfCalc as tf
import tfIdfCalc as idf
import numpy as np
from scipy.stats import mode

# Movie and tag rows, fetched once when the module is imported, together
# with their counts; formSvdMat below reads these module-level values.
movies = di.getAllMovies()
tagIds = di.getAllTags()
allTagLen = len(tagIds)
movieLen = len(movies)

def formSvdMat(numSemantics):
    """Return each movie's coordinates along the first SVD components.

    Fills a movie-by-tag matrix with ``idf.tfIdfMovieTag`` rows, decomposes
    it with ``np.linalg.svd`` and keeps the first ``numSemantics`` columns of
    the first factor.

    Args:
        numSemantics: number of leading columns to keep.

    Returns:
        Array of shape ``(movieLen, numSemantics)``.

    Exits the process via ``sys.exit()`` after printing a message when the
    matrix has fewer rows or columns than ``numSemantics``.
    """
    tfidf = np.zeros((movieLen, allTagLen))
    tooFewRows = len(tfidf) < numSemantics
    if tooFewRows or len(tfidf[0]) < numSemantics:
        print("cant report top semantics")
        sys.exit()

    movieIdf = idf.idfMovieTag()
    for rowIdx in range(movieLen):
        tfidf[rowIdx] = idf.tfIdfMovieTag(movies[rowIdx][0], movieIdf)

    leftVecs, _, _ = np.linalg.svd(tfidf, full_matrices=False)
    movieFacts = np.zeros((movieLen, numSemantics))
    # The size check above guarantees the slice has exactly this shape.
    movieFacts[:, :] = leftVecs[:, :numSemantics]
    return movieFacts