def task1c_pca(actor_id):
    """Print the 10 actors closest to ``actor_id`` after a PCA projection.

    The actor-tag matrix is multiplied by the transpose of whatever
    ``decompositions.PCADecomposition(actor_tag_df, 5)`` returns, and every
    other actor is ranked by ``metrics.l2Norm`` against the given actor,
    smallest value first.

    Args:
        actor_id: Index label of the actor in the actor-tag dataframe.

    Returns:
        None. The ranking (or a message for an unknown actor) is printed.
    """
    DataHandler.vectors()
    # Fetch the dataframe once and reuse it for both the index and the matrix.
    actor_tag_df = DataHandler.actor_tag_df()
    actor_ids = list(actor_tag_df.index)
    if actor_id not in actor_ids:
        print("Invalid actor id or no tags present for actor. Returning")
        return

    # ``.values`` is used instead of ``as_matrix()``, which newer pandas
    # releases no longer provide.
    actor_tag_matrix = np.matrix(actor_tag_df.values)
    components = decompositions.PCADecomposition(actor_tag_df, 5)

    # using transpose since according to page 158, p inverse = p transpose
    projection = np.matrix(components).transpose()
    actors_in_semantics = (actor_tag_matrix * projection).tolist()

    target_vector = actors_in_semantics[actor_ids.index(actor_id)]
    DataHandler.create_actor_actorid_map()
    name_of = DataHandler.actor_actorid_map.get

    scored = [
        (metrics.l2Norm(target_vector, other_vector), name_of(other_id))
        for other_id, other_vector in zip(actor_ids, actors_in_semantics)
        if other_id != actor_id
    ]
    ranked = sorted(scored, key=operator.itemgetter(0))

    print("Top 10 actors similar to " + str(name_of(actor_id)) + " are: ")
    for score, name in ranked[:10]:
        # str() keeps the report going when an actor has no name entry.
        print(str(name) + " : " + str(score))
    return
def task1c_tfidf(actor_id):
    """Print the 10 actors closest to ``actor_id`` in the raw actor-tag space.

    Each row of the actor-tag dataframe is compared with the given actor's
    row using ``metrics.l2Norm``; the smallest values are listed first.
    (The function name suggests the rows hold TF-IDF weights; that is not
    verifiable from this function.)

    Args:
        actor_id: Index label of the actor in the actor-tag dataframe.

    Returns:
        None. The ranking (or a message for an unknown actor) is printed.
    """
    DataHandler.vectors()
    actor_tag_df = DataHandler.actor_tag_df()
    actor_ids = list(actor_tag_df.index)

    if actor_id not in actor_ids:
        print("Invalid actor id or no tags present for actor. Returning")
        return

    # ``.values`` is used instead of ``as_matrix()``, which newer pandas
    # releases no longer provide.
    actor_vectors = actor_tag_df.values.tolist()
    target_vector = actor_vectors[actor_ids.index(actor_id)]
    DataHandler.create_actor_actorid_map()
    name_of = DataHandler.actor_actorid_map.get

    scored = [
        (metrics.l2Norm(target_vector, other_vector), name_of(other_id))
        for other_id, other_vector in zip(actor_ids, actor_vectors)
        if other_id != actor_id
    ]
    ranked = sorted(scored, key=operator.itemgetter(0))

    print("Top 10 actors similar to " + str(name_of(actor_id)) + " are: ")
    for score, name in ranked[:10]:
        # str() keeps the report going when an actor has no name entry.
        print(str(name) + " : " + str(score))
    return
def actor_task1c_SVD(actor_id):
    """Print the 10 actors closest to ``actor_id`` using the SVD ``U`` rows.

    ``decompositions.SVDDecomposition(actor_tag_df, 5)`` returns
    ``(U, Sigma, VT)``; only ``U`` is used. Its rows are assumed to follow
    the order of the actor-tag dataframe index, since they are looked up by
    index position. Actors are ranked by ``metrics.l2Norm`` against the
    given actor's row, smallest value first.

    Args:
        actor_id: Index label of the actor in the actor-tag dataframe.

    Returns:
        None. The ranking (or a message for an unknown actor) is printed.
    """
    DataHandler.vectors()
    actor_tag_df = DataHandler.actor_tag_df()
    actor_ids = list(actor_tag_df.index)
    if actor_id not in actor_ids:
        print("Invalid actor id or no tags present for actor. Returning")
        return

    # Sigma and VT are not needed for the actor-to-actor comparison.
    U, _sigma, _vt = decompositions.SVDDecomposition(actor_tag_df, 5)

    target_row = U[actor_ids.index(actor_id)]
    DataHandler.create_actor_actorid_map()
    name_of = DataHandler.actor_actorid_map.get

    scored = [
        (metrics.l2Norm(target_row, other_row), name_of(other_id))
        for other_id, other_row in zip(actor_ids, U)
        if other_id != actor_id
    ]
    ranked = sorted(scored, key=operator.itemgetter(0))

    print("Top 10 Actors similar to " + str(name_of(actor_id)) + " are:")
    for score, name in ranked[:10]:
        # str() keeps the report going when an actor has no name entry.
        print(str(name) + " : " + str(score))
    return
# Example #4
# 0
def task1dImplementation_SVD(movie_id):
    """Print the 10 actors closest to a movie using an SVD of actor tags.

    The movie's tag row is multiplied by the transpose of the third factor
    returned by ``decompositions.SVDDecomposition(actor_tag_df, 5)`` and
    compared, via ``metrics.l2Norm``, with each row of the first factor.
    Actors listed for the movie in ``DataHandler.movie_actor_map`` are
    skipped. Smallest values are listed first.

    Args:
        movie_id: Index label of the movie in the movie-tag dataframe.

    Returns:
        None. The ranking (or a message for an unknown movie) is printed.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map

    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()

    movie_ids = list(movie_tag_df.index)
    actor_ids = list(actor_tag_df.index)

    if movie_id not in movie_ids:
        # str() guards against a movie id that is missing from the name map
        # as well; concatenating None would raise TypeError.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    actor_u, _actor_sigma, actor_v = decompositions.SVDDecomposition(
        actor_tag_df, 5)

    tags_to_actor_semantics = np.matrix(actor_v).transpose()
    # ``.values`` is used instead of ``as_matrix()``, which newer pandas
    # releases no longer provide.
    movie_in_tags = np.matrix(movie_tag_df.values)[movie_ids.index(movie_id)]
    movie_in_actor_semantics = (
        movie_in_tags * tags_to_actor_semantics).tolist()[0]
    actor_rows = np.matrix(actor_u).tolist()

    DataHandler.create_actor_actorid_map()
    name_of = DataHandler.actor_actorid_map.get
    # A movie without a cast entry yields None; treat it as an empty cast.
    actors_for_movie = DataHandler.movie_actor_map.get(movie_id) or ()

    scored = [
        (metrics.l2Norm(actor_row, movie_in_actor_semantics), name_of(actor_id))
        for actor_id, actor_row in zip(actor_ids, actor_rows)
        if actor_id not in actors_for_movie
    ]
    ranked = sorted(scored, key=operator.itemgetter(0))

    print("10 Actors similar to movie " + str(movieid_name_map.get(movie_id)) +
          " are: ")
    for score, name in ranked[:10]:
        # str() keeps the report going when an actor has no name entry.
        print(str(name) + " : " + str(score))
    return
# Example #5
# 0
def task1d_pca(movie_id):
    """Print the 10 actors closest to a movie after a PCA projection.

    Both the movie's tag row and every actor's tag row are multiplied by the
    transpose of ``decompositions.PCADecomposition(actor_tag_df, 5)`` and
    compared with ``metrics.l2Norm``. Actors listed for the movie in
    ``DataHandler.movie_actor_map`` are skipped. Smallest values are listed
    first.

    Args:
        movie_id: Index label of the movie in the movie-tag dataframe.

    Returns:
        None. The ranking (or a message for an unknown movie) is printed.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map

    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()

    actor_ids = list(actor_tag_df.index)
    movie_ids = list(movie_tag_df.index)
    if movie_id not in movie_ids:
        # str() guards against a movie id that is missing from the name map
        # as well; concatenating None would raise TypeError.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    # ``.values`` is used instead of ``as_matrix()``, which newer pandas
    # releases no longer provide.
    actor_tag_matrix = np.matrix(actor_tag_df.values)
    movie_tag_matrix = np.matrix(movie_tag_df.values)

    actor_semantics = decompositions.PCADecomposition(actor_tag_df, 5)
    projection = np.matrix(actor_semantics).transpose()

    movie_in_tags = movie_tag_matrix[movie_ids.index(movie_id)]
    movie_in_actor_semantics = (movie_in_tags * projection).tolist()[0]
    actors_in_actor_semantics = (actor_tag_matrix * projection).tolist()

    # One call is enough; the original invoked this twice in a row.
    DataHandler.create_actor_actorid_map()
    name_of = DataHandler.actor_actorid_map.get
    # A movie without a cast entry yields None; treat it as an empty cast.
    actors_for_movie = DataHandler.movie_actor_map.get(movie_id) or ()

    scored = [
        (metrics.l2Norm(actor_vector, movie_in_actor_semantics),
         name_of(actor_id))
        for actor_id, actor_vector in zip(actor_ids, actors_in_actor_semantics)
        if actor_id not in actors_for_movie
    ]
    ranked = sorted(scored, key=operator.itemgetter(0))

    print("Top 10 actors similar to movie: " +
          str(movieid_name_map.get(movie_id)) + " are: ")
    for score, name in ranked[:10]:
        # str() keeps the report going when an actor has no name entry.
        print(str(name) + " : " + str(score))
    return
# Example #6
# 0
def task1d_tfidf(movie_id):
    """Print the 10 actors closest to a movie in the raw tag space.

    The movie's row of the movie-tag dataframe is compared with every row of
    the actor-tag dataframe using ``metrics.l2Norm``. Actors listed for the
    movie in ``DataHandler.movie_actor_map`` are skipped. Smallest values
    are listed first. (The function name suggests the rows hold TF-IDF
    weights; that is not verifiable from this function.)

    Args:
        movie_id: Index label of the movie in the movie-tag dataframe.

    Returns:
        None. The ranking (or a message for an unknown movie) is printed.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()
    movieid_name_map = DataHandler.movieid_name_map

    actor_ids = list(actor_tag_df.index)
    movie_ids = list(movie_tag_df.index)

    if movie_id not in movie_ids:
        # str() guards against a movie id that is missing from the name map
        # as well; concatenating None would raise TypeError.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    # ``.values`` is used instead of ``as_matrix()``, which newer pandas
    # releases no longer provide.
    actor_vectors = actor_tag_df.values.tolist()
    movie_in_tags = movie_tag_df.values[movie_ids.index(movie_id)].tolist()

    # A movie without a cast entry yields None; treat it as an empty cast.
    actors_for_movie = DataHandler.movie_actor_map.get(movie_id) or ()
    DataHandler.create_actor_actorid_map()
    name_of = DataHandler.actor_actorid_map.get

    scored = [
        (metrics.l2Norm(movie_in_tags, actor_vector), name_of(actor_id))
        for actor_id, actor_vector in zip(actor_ids, actor_vectors)
        if actor_id not in actors_for_movie
    ]
    ranked = sorted(scored, key=operator.itemgetter(0))

    print("Top 10 actors similar to " + str(movieid_name_map.get(movie_id)) +
          " are: ")
    for score, name in ranked[:10]:
        # str() keeps the report going when an actor has no name entry.
        print(str(name) + " : " + str(score))
    return
# Example #7
# 0
def similarMovieActor_LDA(givenMovie):
    """Print the 10 actors that score lowest against a movie under LDA.

    A model is obtained from ``decompositions.LDADecomposition`` on the
    actor-tag dataframe (called with 5 and
    ``constants.actorTagsSpacePasses``). The movie and each actor are
    represented with ``DataHandler.representDocInLDATopics`` and scored with
    ``metrics.simlarity_kullback_leibler``. Actors listed for the movie in
    ``DataHandler.movie_actor_map`` are skipped. Results are sorted in
    ascending score order.

    Args:
        givenMovie: Index label of the movie in the movie-tag dataframe.

    Returns:
        None. One "name score" line is printed per selected actor, or a
        message when the movie is unknown.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    actor_tag_dff = DataHandler.actor_tag_df()
    movie_tag_dff = DataHandler.load_movie_tag_df()
    movieid_name_map = DataHandler.movieid_name_map

    actor_ids = list(actor_tag_dff.index)
    movie_ids = list(movie_tag_dff.index)

    if givenMovie not in movie_ids:
        # str() guards against a movie id that is missing from the name map
        # as well; concatenating None would raise TypeError.
        print("Movie " + str(movieid_name_map.get(givenMovie)) +
              " not present in mltags data. Quitting")
        return

    # A movie without a cast entry yields None; treat it as an empty cast.
    actors_for_movie = DataHandler.movie_actor_map.get(givenMovie) or ()

    # Only the model is needed; the other two return values are unused here.
    lda_model, _doc_term_matrix, _id_term_map = decompositions.LDADecomposition(
        actor_tag_dff, 5, constants.actorTagsSpacePasses)

    # The movie representation does not depend on the actor being compared,
    # so build it once rather than on every loop iteration.
    movie_in_topics = DataHandler.representDocInLDATopics(
        movie_tag_dff, givenMovie, lda_model)

    score_by_actor = {}
    for other_actor in actor_ids:
        if other_actor in actors_for_movie:
            continue
        actor_in_topics = DataHandler.representDocInLDATopics(
            actor_tag_dff, other_actor, lda_model)
        score_by_actor[other_actor] = metrics.simlarity_kullback_leibler(
            movie_in_topics, actor_in_topics)

    # Keep exactly ten entries (a [0:11] slice would return eleven).
    top10 = sorted(score_by_actor.items(), key=itemgetter(1))[:10]
    for actor, score in top10:
        print(DataHandler.actor_actorid_map.get(actor), score)
    return