Пример #1
0
def task2b():
    """Print 3 latent semantics of the co-actor matrix and a 3-way actor grouping.

    A rank-3 SVD is run on the co-actor similarity matrix.  Each row of the
    right factor is printed through ``prettyPrintActorVector``; the left
    factor is handed to ``formatter.splitGroup`` and the resulting partition
    is printed using the ``name`` column of ``DataHandler.actor_info_df``.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()

    similarity_matrix, _actor_list = DataHandler.coactor_siilarity_matrix()
    left_factor, _sigma, right_factor = decompositions.SVDDecomposition(
        similarity_matrix, 3)
    latent_rows = np.matrix(right_factor).tolist()

    actor_info = DataHandler.actor_info_df
    actor_ids = list(similarity_matrix.index)

    print("Top 3 semantics are:")
    for latent_row in latent_rows:
        prettyPrintActorVector(latent_row, actor_ids, actor_info)
        print("")

    partition_index = formatter.splitGroup(left_factor, 3)

    print("The three groupings are:")
    named_groups = tasksBusiness.get_partition_on_ids(partition_index,
                                                      actor_info['name'])
    for group_no, members in named_groups.items():
        # Group keys are shifted by one for display.
        print('Group ' + str(group_no + 1) + ' : ' + str(members))
        print(" ")
    print()
def task1a_PCA(userId):
    """Print movies recommended for ``userId`` from a 5-component PCA of the movie-tag matrix.

    The movie-tag dataframe is reduced with
    ``decompositions.PCADimensionReduction``, a movie-movie similarity matrix
    is built on the reduced space, and ``getWeightedSimilarityOrder`` supplies
    the recommendations.  ``movie_list[0]`` is read as the recommended movie
    ids and ``movie_list[1]`` as the matching scores.

    Raises:
        KeyError: if ``userId`` has no entry in
            ``DataHandler.user_rated_or_tagged_date_map``.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movie_tag_df = DataHandler.load_movie_tag_df()
    # Assuming number of latent semantics are 5
    reduced = decompositions.PCADimensionReduction(movie_tag_df, 5)
    decomposed = pd.DataFrame(reduced, index=movie_tag_df.index)
    similarity_df = DataHandler.movie_movie_Similarity1(decomposed)
    movie_list = getWeightedSimilarityOrder(similarity_df, userId)

    # sorted() keeps the ordered result; calling .sort() on a temporary
    # list(...) copy would throw the ordering away.  Entries are indexed as
    # (movie id, sort key) tuples.
    watched_entries = sorted(
        DataHandler.user_rated_or_tagged_date_map[userId],
        key=lambda tup: tup[1])
    # Only the requested user's movies are needed for the seed listing.
    watched_movie_ids = [entry[0] for entry in watched_entries]

    movieid_name_map = DataHandler.movieid_name_map
    print('Movies similar to the following seed movies: ' +
          str([movieid_name_map.get(i) for i in watched_movie_ids]))

    recommended_ids = movie_list[0]
    # Materialise the scores once instead of once per printed line.
    scores = list(movie_list[1])
    for i in range(len(recommended_ids)):
        print(movieid_name_map[recommended_ids[i]] + ': ' + str(scores[i]))
Пример #3
0
def runme():
    """Run the combined recommender with an interactive relevance-feedback loop.

    The user id is hard-coded to 36.  After an initial pass with a fixed
    feedback vector ``[1, 1, 1, 0, 0]`` the top movies are printed and the
    user is prompted for comma-separated 1/0 relevance values until ``exit``
    is entered.  Malformed feedback is reported and the prompt is repeated
    instead of aborting the session.
    """
    # q_vector is a module-level name; this function only reads it.
    global q_vector
    movieid_name_map = DataHandler.movieid_name_map
    userId = 36  # input("UserID : ")
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    loadBase(userId)

    distances = runAllMethods()
    reco = [nonwatchedList[i] for i in distances][0:5]
    runAllMethodrelevancefeedback(reco, [1, 1, 1, 0, 0])
    new_query = q_vector
    movies = recommendMovies(new_query)
    named_movies = [movieid_name_map[i] for i in movies]
    print('Top 5 movies : ' + str(named_movies))
    # NOTE(review): `movies` is never refreshed inside the loop, so every
    # round of feedback is applied to the initially recommended list —
    # confirm this is intended.
    while True:
        feedback = input("Relevance (1/0) for each of the 5 movies: ")
        if feedback == 'exit':
            print("GoodBye........")
            break
        try:
            feedback = [int(i) for i in feedback.split(',')]
        except ValueError:
            # An empty line or a non-numeric token must not kill the session.
            print("Invalid feedback. Enter comma separated 1/0 values "
                  "or 'exit'.")
            continue
        new_query = newQueryFromFeedBack(movies, feedback)
        print([movieid_name_map[nonwatchedList[i]] for i in new_query][0:5])
def task1a_svd(genre):
    """Print 4 SVD latent semantics, expressed over tags, for the movies of ``genre``.

    Prints a message and returns early when ``genre`` is not a key of the
    genre -> movies map.  Rows of the movie-tag dataframe selected for the
    genre that contain any NaN are dropped before decomposition.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()

    movies_by_genre = DataHandler.getGenreMoviesMap()
    if genre not in movies_by_genre:
        print("genre " + genre + " not present in data")
        return

    tag_matrix = DataHandler.load_movie_tag_df()
    tag_info = DataHandler.tag_id_df
    # Column labels of the movie-tag dataframe.
    tag_ids = list(tag_matrix.columns)

    genre_rows = tag_matrix.loc[movies_by_genre.get(genre)].dropna(how='any')
    _u, _sigma, right_factor = decompositions.SVDDecomposition(genre_rows, 4)

    print("The 4 semantics for genre:" + genre + " are")
    latent_rows = np.matrix(right_factor).tolist()
    for number, latent_row in enumerate(latent_rows, start=1):
        print("semantic " + str(number) + ": ")
        prettyPrintTagVector(latent_row, tag_ids, tag_info)
        print("")
    return
def PersnalizedPageRank_top5SimilarMovies1(userMovies):
    """Print movies ranked by personalized PageRank seeded with ``userMovies``.

    The second CP factor of the actor-movie-genre-year-rank-rating tensor is
    used as the movie representation.  The 15 highest-scoring rows are taken
    and every one whose name is not a seed movie name is printed with its
    score and genre.
    """
    DataHandler.createDictionaries1()
    factors = decompositions.CPDecomposition(
        DataHandler.getTensor_ActorMovieGenreYearRankRating(), 5)
    sorted_movie_ids = sorted(DataHandler.movie_actor_map.keys())
    movie_space = pd.DataFrame(factors[1], index=sorted_movie_ids)
    movie_movie_similarity = DataHandler.movie_movie_Similarity1(movie_space)
    name_of = DataHandler.movieid_name_map
    damping = constants.ALPHA
    scores = ppr.personalizedPageRank(movie_movie_similarity, userMovies,
                                      damping)

    movies = list(movie_movie_similarity.index)
    names_frame = pd.DataFrame(pd.Series(movies), columns=['movies'])
    names_frame['movies'] = names_frame['movies'].map(name_of.get)
    combined = pd.concat([scores, names_frame], axis=1)
    ranked = combined.sort_values(by=0, ascending=False).head(15)

    seed_names = [name_of.get(seed) for seed in userMovies]
    print('Movies similar to the following seed movies: ' + str(seed_names))
    genre_of = DataHandler.movie_genre_map
    seed_genres = [genre_of.get(seed) for seed in userMovies]
    print('Genres for seed movies: ' + str(seed_genres))

    for row_label in ranked.index:
        title = ranked.loc[row_label, 'movies']
        if title in seed_names:
            continue
        # The row label is also used as a position into `movies`.
        print(title + ' ' + str(ranked.loc[row_label, 0]) + ' ' +
              str(genre_of.get(movies[row_label])))
def task1_2Decompostions(func, userid):
    """Recommend movies using one decomposition and refine them with relevance feedback.

    Args:
        func: passed unchanged to ``rf.runDecomposition``.
        userid: user identifier; converted with ``int()`` before use.

    The initial recommendations are printed as ``(name, distance)`` pairs.
    The user is then prompted for comma-separated 1/0 relevance values until
    ``exit`` is entered.  Malformed feedback is reported and the prompt is
    repeated instead of aborting the session.
    """
    movieid_name_map = DataHandler.movieid_name_map
    userId = int(userid)
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    rf.loadBase(userId)
    rf.runDecomposition(func)

    new_query = rf.q_vector
    movies, distances = rf.recommendMovies(new_query)
    named_movies = [movieid_name_map[i] for i in movies]
    print('---------------------')
    print('Top 5 movies : ')
    print(str(list(zip(named_movies, distances))))
    print("---------------------")
    # NOTE(review): `movies` is never refreshed inside the loop, so every
    # round of feedback is applied to the initially recommended list —
    # confirm this is intended.
    while True:
        feedback = input("Relevance (1/0) for each of the 5 movies: ")
        if feedback == 'exit':
            print("Exit........")
            break
        try:
            feedback = [int(i) for i in feedback.split(',')]
        except ValueError:
            # An empty line or a non-numeric token must not kill the session.
            print("Invalid feedback. Enter comma separated 1/0 values "
                  "or 'exit'.")
            continue
        # Second element of the returned pair is not needed here.
        new_query, _weights = rf.newQueryFromFeedBack(movies, feedback)
        print([movieid_name_map[rf.nonwatchedList[i]] for i in new_query][0:5])
def task1_2CombinedPredictor(userid):
    """Recommend movies with the combined predictor and refine them with relevance feedback.

    Args:
        userid: user identifier; converted with ``int()`` and the converted
            value is used for every downstream call.

    Prints the user's watched movies ordered by the second field of each
    ``user_rated_or_tagged_date_map`` entry, then the top recommendations with
    their similarity values, then loops on comma-separated 1/0 feedback until
    ``exit`` is entered.  Malformed feedback is reported and the prompt is
    repeated instead of aborting the session.
    """
    movieid_name_map = DataHandler.movieid_name_map
    userId = int(userid)
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    rf.loadBase(userId)
    # Pass the converted id so rf receives the same type as in loadBase().
    similarities, sortedSimilarity = rf.runAllMethods(userId)
    movies = [rf.nonwatchedList[i] for i in similarities][0:5]

    watched_entries = sorted(
        DataHandler.user_rated_or_tagged_date_map.get(userId),
        key=itemgetter(1))
    # A comprehension also copes with an empty history.
    watched_ids_in_order = [entry[0] for entry in watched_entries]
    watchedMovieNames = [
        movieid_name_map[movieid] for movieid in watched_ids_in_order
    ]
    print('-------------------------------------')
    print('Movies Watched by the user in order: ' + str(watchedMovieNames))
    named_movies = [movieid_name_map[i] for i in movies]
    print('Top 5 movies : ' + str(list(zip(named_movies, sortedSimilarity))))
    print('-------------------------------------')
    # NOTE(review): `movies` is never refreshed inside the loop, so every
    # round of feedback is applied to the initially recommended list —
    # confirm this is intended.
    while True:
        feedback = input("Relevance (1/0) for each of the 5 movies: ")
        if feedback == 'exit':
            print("Exit........")
            break
        try:
            feedback = [int(i) for i in feedback.split(',')]
        except ValueError:
            # An empty line or a non-numeric token must not kill the session.
            print("Invalid feedback. Enter comma separated 1/0 values "
                  "or 'exit'.")
            continue
        new_query = rf.runAllMethodrelevancefeedback(movies, feedback)
        print([movieid_name_map[rf.nonwatchedList[i]] for i in new_query][0:5])
def genre_spaceActors_LDA_tf(genre):
    """Print 4 LDA topics over actors for ``genre``.

    Each topic is printed as a list of ``(actor name, weight)`` pairs sorted
    by descending weight.  Prints a message and returns early when ``genre``
    is not a key of ``DataHandler.genre_movie_map``.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()

    # The four returned dictionaries are not used below.
    (_movie_tag, _tag_id, _actor_movie_rank,
     _movie_actor_rank) = DataHandler.get_dicts()
    DataHandler.create_actor_actorid_map()
    actor_names = DataHandler.actor_actorid_map
    genre_actor_df = DataHandler.load_genre_actor_matrix_tf(genre)

    known_genres = DataHandler.genre_movie_map
    if genre not in known_genres:
        print("genre " + genre + " not in data")
        return

    topic_count = 4
    ldaModel, _doc_term_matrix, id_to_term = decompositions.LDADecomposition(
        genre_actor_df, topic_count, constants.genreActorSpacePasses)

    terms_by_topic = defaultdict(set)
    for topic in range(topic_count):
        # get_topic_terms returns top n (default = 10) words of the topic;
        # topn is raised to the size of the actor map here.
        for pair in ldaModel.get_topic_terms(topic, topn=len(actor_names)):
            term = id_to_term.get(pair[0])
            terms_by_topic[topic].add((actor_names.get(term), pair[1]))

    for topic in range(topic_count):
        ordered = sorted(terms_by_topic.get(topic),
                         key=itemgetter(1),
                         reverse=True)
        print('Semantic ' + str(topic + 1) + ' ' + str(ordered))
        print('\n')
def task5_3():
    """Train the binary SVM on the labelled movies and print predictions for the rest.

    Movie features are unpickled from ``constants.DIRECTORY +
    "movie_feature_df2"`` and split by ``createTrainTestData``.  Training
    labels are replaced in place by integer class indices before fitting, and
    predictions are mapped back to the original labels for printing.
    """
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map
    # Return value is unused; the call is retained in case it primes
    # DataHandler state — TODO confirm and drop if it does not.
    DataHandler.load_movie_tag_df()

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(constants.DIRECTORY + "movie_feature_df2", "rb") as feature_file:
        allMovieData = pickle.load(feature_file)
    (train_movies_Matrix, train_label, train_movieids, test_movies_Matrix,
     test_movieids) = createTrainTestData(allMovieData)

    uniqueLabels = list(set(train_label))
    label_to_index = {label: idx for idx, label in enumerate(uniqueLabels)}
    # Encode from a snapshot: rewriting labels while still comparing against
    # the raw values re-maps entries whose new index equals another raw label.
    for position, label in enumerate(list(train_label)):
        train_label[position] = label_to_index[label]

    svmModel = binarySVM.BinarySVM()
    svmModel.fit(train_movies_Matrix, train_label)
    # np.asscalar was removed from NumPy; .item() is its replacement.
    predictions = [
        uniqueLabels[int(np.asarray(raw).item())]
        for raw in svmModel.predict(test_movies_Matrix)
    ]
    test_movieids_names = [movieid_name_map[mid] for mid in test_movieids]
    print("Results for SVM classifier as (Movie Name, Label): \n" +
          str(list(zip(test_movieids_names, predictions))) + "\n")
def task1_2PageRank():
    """Prompt for a user id and run ``rf.task1d`` for that user.

    The entered text is converted with ``int()``; a non-integer entry raises
    ``ValueError``.
    """
    raw_user_id = input("UserID : ")
    DataHandler.vectors()
    userId = int(raw_user_id)
    DataHandler.createDictionaries1()
    rf.loadBase(userId)
    rf.task1d(userId)
Пример #11
0
def task3a(seed):
    """Validate the seed actor ids, then run personalized PageRank on them.

    The first id in ``seed`` that is missing from
    ``DataHandler.actor_movie_rank_map`` is reported and the function returns
    without ranking.
    """
    DataHandler.createDictionaries1()
    known_actors = DataHandler.actor_movie_rank_map
    # Sentinel distinguishes "nothing invalid" from any real seed value.
    nothing = object()
    first_invalid = next(
        (actor_id for actor_id in seed if actor_id not in known_actors),
        nothing)
    if first_invalid is not nothing:
        print('Invalid seed actor id : ' + str(first_invalid))
        return
    tasksBusiness.PersnalizedPageRank_top10_SimilarActors(seed)
def top10_Actors_LDA_tf(givenActor):
    """Print the actors returned by ``DataHandler.similarActors_LDA_tf`` for ``givenActor``.

    Prints a message and returns early when ``givenActor`` is not a key of
    ``DataHandler.actor_movie_rank_map``.
    """
    DataHandler.createDictionaries1()
    if givenActor not in DataHandler.actor_movie_rank_map:
        print('Invalid seed actor id : ' + str(givenActor))
        return
    DataHandler.create_actor_actorid_map()
    similar_actors = DataHandler.similarActors_LDA_tf(givenActor)
    actor_names = DataHandler.actor_actorid_map
    print('Actors similar to ' + str(actor_names[givenActor]))
    for actor_id, similarity in similar_actors:
        print(actor_names[actor_id] + ' ' + str(similarity))
    return
Пример #13
0
def top5SimilarMovies1(userMovies):
    """Print movies ranked by ``pagerank.PPR`` seeded with ``userMovies``.

    The second CP factor of the actor-movie-genre-year-rank-rating tensor is
    used as the movie representation.  Each ``(index, score)`` pair returned
    by ``pagerank.PPR`` is mapped to a movie id through the similarity
    matrix's columns; seed movies are not printed.
    """
    DataHandler.createDictionaries1()
    factors = decompositions.CPDecomposition(
        DataHandler.getTensor_ActorMovieGenreYearRankRating(), 5)
    sorted_movie_ids = sorted(DataHandler.movie_actor_map.keys())
    movie_space = pd.DataFrame(factors[1], index=sorted_movie_ids)
    movie_movie_similarity = DataHandler.movie_movie_Similarity1(movie_space)
    name_of = DataHandler.movieid_name_map
    damping = constants.ALPHA
    ranked_pairs = pagerank.PPR(movie_movie_similarity, userMovies, damping)

    seed_names = [name_of.get(seed) for seed in userMovies]
    print('Movies similar to the following seed movies: ' + str(seed_names))
    for column_position, score in ranked_pairs:
        candidate = movie_movie_similarity.columns[column_position]
        if candidate in userMovies:
            continue
        print(name_of.get(candidate) + ' ' + str(score))
Пример #14
0
def task1dImplementation_SVD(movie_id):
    """Print the 10 actors closest to ``movie_id`` in a 5-dimensional SVD actor space.

    The actor-tag matrix is decomposed with SVD.  The movie's tag vector is
    projected through the transposed right factor and compared, with
    ``metrics.l2Norm``, against each actor's row of the left factor.  Actors
    listed for the movie in ``DataHandler.movie_actor_map`` are skipped.
    Smaller scores are printed first.

    Prints a message and returns early when ``movie_id`` is not in the
    movie-tag dataframe index.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map

    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()

    moviesIndexList = list(movie_tag_df.index)
    actorsIndexList = list(actor_tag_df.index)

    if movie_id not in moviesIndexList:
        # str() guards against an id that is also missing from the name map.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    actorU, _actorSigma, actorV = decompositions.SVDDecomposition(
        actor_tag_df, 5)

    tagsToActorSemantics = np.matrix(actorV).transpose()
    # DataFrame.as_matrix() no longer exists in pandas; .values is equivalent.
    movieTagMatrix = np.matrix(movie_tag_df.values)
    movieInTags = movieTagMatrix[moviesIndexList.index(movie_id)]
    movieInActorSemantics = (movieInTags * tagsToActorSemantics).tolist()[0]
    actorsInSemantics = np.matrix(actorU).tolist()

    DataHandler.create_actor_actorid_map()
    actorsForMovie = DataHandler.movie_actor_map.get(movie_id)
    if actorsForMovie is None:
        # No cast recorded for this movie: nothing to exclude.
        actorsForMovie = ()

    actorsWithScores = []
    for actor_id, actor_vector in zip(actorsIndexList, actorsInSemantics):
        if actor_id in actorsForMovie:
            continue
        actorName = DataHandler.actor_actorid_map.get(actor_id)
        similarityScore = metrics.l2Norm(actor_vector, movieInActorSemantics)
        actorsWithScores.append((similarityScore, actorName))

    top10Actors = sorted(actorsWithScores, key=operator.itemgetter(0))[0:10]
    print("10 Actors similar to movie " + str(movieid_name_map.get(movie_id)) +
          " are: ")
    for score, name in top10Actors:
        print(str(name) + " : " + str(score))
    return
Пример #15
0
def task1d_pca(movie_id):
    """Print the 10 actors closest to ``movie_id`` in a 5-component PCA actor space.

    Both the movie's tag vector and every actor's tag vector are projected
    through the transposed PCA components of the actor-tag matrix and
    compared with ``metrics.l2Norm``.  Actors listed for the movie in
    ``DataHandler.movie_actor_map`` are skipped.  Smaller scores are printed
    first.

    Prints a message and returns early when ``movie_id`` is not in the
    movie-tag dataframe index.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map

    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()

    # DataFrame.as_matrix() no longer exists in pandas; .values is equivalent.
    actorTagMatrix = np.matrix(actor_tag_df.values)
    movieTagMatrix = np.matrix(movie_tag_df.values)

    actorIndexList = list(actor_tag_df.index)
    movieIndexList = list(movie_tag_df.index)
    if movie_id not in movieIndexList:
        # str() guards against an id that is also missing from the name map.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    actorSemantics = decompositions.PCADecomposition(actor_tag_df, 5)

    actorP = np.matrix(actorSemantics).transpose()
    movieInTags = movieTagMatrix[movieIndexList.index(movie_id)]
    movieInActorSemantics = (movieInTags * actorP).tolist()[0]
    actorsInActorSemantics = (actorTagMatrix * actorP).tolist()

    # One call is sufficient to build the actor id -> name map.
    DataHandler.create_actor_actorid_map()
    actorsForMovie = DataHandler.movie_actor_map.get(movie_id)
    if actorsForMovie is None:
        # No cast recorded for this movie: nothing to exclude.
        actorsForMovie = ()

    simAndActor = []
    for actorId, actorInSemantics in zip(actorIndexList,
                                         actorsInActorSemantics):
        if actorId in actorsForMovie:
            continue
        actorName = DataHandler.actor_actorid_map.get(actorId)
        score = metrics.l2Norm(actorInSemantics, movieInActorSemantics)
        simAndActor.append((score, actorName))

    top10Actors = sorted(simAndActor, key=operator.itemgetter(0))[0:10]

    print("Top 10 actors similar to movie: " +
          str(movieid_name_map.get(movie_id)) + " are: ")
    for score, name in top10Actors:
        print(str(name) + " : " + str(score))
    return
Пример #16
0
def Recommender(userId):
    """Rank movies for a user by weighted personalized PageRank over a CP-derived similarity.

    Args:
        userId: passed to ``DataHandler.userMovieRatings`` to obtain the seed
            ``(movie id, weight)`` pairs.

    Returns:
        A list of ``(movie name, score)`` tuples in descending score order,
        excluding the seed movies.
    """
    DataHandler.createDictionaries1()
    movieRatedSeed = DataHandler.userMovieRatings(userId)

    decomposed = decompositions.CPDecomposition(
        DataHandler.getTensor_ActorMovieGenre(), 5)
    moviesList = sorted(DataHandler.movie_actor_rank_map.keys())
    movie_movie_similarity = DataHandler.movie_movie_Similarity1(
        pd.DataFrame(decomposed[1], index=moviesList))
    prData = ppr.personalizedPageRankWeighted(movie_movie_similarity,
                                              movieRatedSeed, 0.9)
    # itertuples() yields (index, first column, ...); the index is used as a
    # position into moviesList.
    rankedItems = sorted(
        ((moviesList[row[0]], row[1]) for row in prData.itertuples()),
        key=lambda item: item[1],
        reverse=True)
    movieid_name_map = DataHandler.movieid_name_map

    seed_ids = [movie_id for movie_id, _weight in movieRatedSeed]
    # Build the exclusion set once instead of once per ranked item.
    seed_id_set = set(seed_ids)
    seedmovieNames = [movieid_name_map[movie_id] for movie_id in seed_ids]
    print("Movies similar to the users seed movies " + str(seedmovieNames) + " are:")
    return [(movieid_name_map[movie_id], score)
            for movie_id, score in rankedItems
            if movie_id not in seed_id_set]
Пример #17
0
def PersnalizedPageRank_top10_SimilarCoActors(seed):
    """Print co-actors ranked by personalized PageRank seeded with ``seed``.

    The 15 highest-scoring rows are taken and every one whose name is not a
    seed actor name is printed with its score.
    """
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    coactor_matrix, _unused = DataHandler.coactor_siilarity_matrix()
    name_of = DataHandler.actor_actorid_map
    damping = constants.ALPHA
    scores = ppr.personalizedPageRank(coactor_matrix, seed, damping)

    actor_ids = list(coactor_matrix.index)
    names_frame = pd.DataFrame(pd.Series(actor_ids), columns=['Actor'])
    names_frame['Actor'] = names_frame['Actor'].map(name_of.get)
    combined = pd.concat([scores, names_frame], axis=1)
    ranked = combined.sort_values(by=0, ascending=False).head(15)

    seed_names = [name_of.get(actor_id) for actor_id in seed]
    print('Co Actors similar to the following seed actors: ' + str(seed_names))
    for row_label in ranked.index:
        actor_name = ranked.loc[row_label, 'Actor']
        if actor_name in seed_names:
            continue
        print(actor_name + ' ' + str(ranked.loc[row_label, 0]))
Пример #18
0
def task1d_tfidf(movie_id):
    """Print the 10 actors whose tag vectors are closest to the tag vector of ``movie_id``.

    Rows of the actor-tag dataframe are compared with the movie's row of the
    movie-tag dataframe using ``metrics.l2Norm``.  Actors listed for the
    movie in ``DataHandler.movie_actor_map`` are skipped.  Smaller scores are
    printed first.

    Prints a message and returns early when ``movie_id`` is not in the
    movie-tag dataframe index.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    actorTagDataframe = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()
    movieid_name_map = DataHandler.movieid_name_map

    # DataFrame.as_matrix() no longer exists in pandas; .values is equivalent.
    actorsTags = actorTagDataframe.values.tolist()
    actorIndexList = list(actorTagDataframe.index)
    movieIndexList = list(movie_tag_df.index)

    if movie_id not in movieIndexList:
        # str() guards against an id that is also missing from the name map.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    actorsForMovie = DataHandler.movie_actor_map.get(movie_id)
    if actorsForMovie is None:
        # No cast recorded for this movie: nothing to exclude.
        actorsForMovie = ()
    movieInTags = movie_tag_df.values[movieIndexList.index(movie_id)].tolist()
    DataHandler.create_actor_actorid_map()

    simAndActor = []
    for actorId, actorinTags in zip(actorIndexList, actorsTags):
        if actorId in actorsForMovie:
            continue
        actorName = DataHandler.actor_actorid_map.get(actorId)
        comparisonScore = metrics.l2Norm(movieInTags, actorinTags)
        simAndActor.append((comparisonScore, actorName))

    top10Actors = sorted(simAndActor, key=operator.itemgetter(0))[0:10]
    print("Top 10 actors similar to " + str(movieid_name_map.get(movie_id)) +
          " are: ")
    for score, name in top10Actors:
        print(str(name) + " : " + str(score))
    return
def task1b_svd(genre):
    """Print 4 SVD latent semantics, expressed over actors, for ``genre``.

    Prints a message and returns early when ``genre`` is not a key of
    ``DataHandler.genre_movie_map``.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()

    actorIdActorsDf = DataHandler.actor_info_df

    genre_actor_tags_df = DataHandler.load_genre_actor_matrix(genre)
    gmMap = DataHandler.genre_movie_map
    if genre not in gmMap:
        print("genre " + genre + " not present in data\n")
        return
    # Column labels of the genre matrix are the actors.
    actorsInDf = list(genre_actor_tags_df.columns)
    # Use SVD, as the function name states; the rows of the right factor are
    # the latent semantics over the dataframe's columns.
    _u, _sigma, genre_semantics = decompositions.SVDDecomposition(
        genre_actor_tags_df, 4)

    print("The 4 semantics for genre:" + genre + " are")
    for index, semantic in enumerate(np.matrix(genre_semantics).tolist(),
                                     start=1):
        print("semantic " + str(index) + ": ")
        prettyPrintActorVector(semantic, actorsInDf, actorIdActorsDf)
        print("")
    return
Пример #20
0
def similarMovieActor_LDA(givenMovie):
    """Print the 10 actors closest to ``givenMovie`` in a 5-topic LDA space.

    An LDA model is fitted on the actor-tag dataframe.  The movie and every
    actor not listed for it in ``DataHandler.movie_actor_map`` are
    represented with ``DataHandler.representDocInLDATopics`` and compared
    with ``metrics.simlarity_kullback_leibler``.  Smaller values are printed
    first.

    Prints a message and returns early when ``givenMovie`` is not in the
    movie-tag dataframe index.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    actor_tag_dff = DataHandler.actor_tag_df()
    movie_tag_dff = DataHandler.load_movie_tag_df()
    movieid_name_map = DataHandler.movieid_name_map

    actorIndexList = list(actor_tag_dff.index)
    movieIndexList = list(movie_tag_dff.index)

    if givenMovie not in movieIndexList:
        # str() guards against an id that is also missing from the name map.
        print("Movie " + str(movieid_name_map.get(givenMovie)) +
              " not present in mltags data. Quitting")
        return
    actorsForMovie = DataHandler.movie_actor_map.get(givenMovie)
    if actorsForMovie is None:
        # No cast recorded for this movie: nothing to exclude.
        actorsForMovie = ()

    ldaModel, _doc_term_matrix, _id_Term_map = decompositions.LDADecomposition(
        actor_tag_dff, 5, constants.actorTagsSpacePasses)

    # The movie's representation does not depend on the actor being compared,
    # so compute it once and compare every actor against the same vector.
    movie_topics = DataHandler.representDocInLDATopics(movie_tag_dff,
                                                       givenMovie, ldaModel)
    similarity_by_actor = {}
    for otherActor in actorIndexList:
        if otherActor in actorsForMovie:
            continue
        actor_topics = DataHandler.representDocInLDATopics(
            actor_tag_dff, otherActor, ldaModel)
        similarity_by_actor[otherActor] = (
            metrics.simlarity_kullback_leibler(movie_topics, actor_topics))

    # Exactly ten entries: a [0:11] slice would return eleven.
    top10 = sorted(similarity_by_actor.items(), key=itemgetter(1))[0:10]
    for actor_id, score in top10:
        print(DataHandler.actor_actorid_map.get(actor_id), score)
    return
def task1_2LDA():
    """Prompt for a user id, print 5 LDA-based recommendations and offer feedback rounds.

    Scores are the product of the transposed similarity subset returned by
    ``rf.runLDADecomposition`` with ``rf.finalWeights`` (read before the
    decomposition runs), cast to float32.  After printing, the user is asked
    repeatedly whether to give feedback: ``Y`` calls ``LDAFeedback`` with the
    recommended ids, ``N`` stops, anything else re-prompts.
    """
    raw_user_id = input("UserID : ")
    movieid_name_map = DataHandler.movieid_name_map
    userid = int(raw_user_id)
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    rf.loadBase(userid)
    finalWeights = rf.finalWeights

    similarity_subset = rf.runLDADecomposition(userid)  #update
    weighted = similarity_subset.T.dot(finalWeights)
    sim = list(weighted.astype(np.float32))
    candidate_ids = list(similarity_subset.columns)
    top_scores = list(np.sort(sim)[::-1])[:5]
    descending_positions = list(np.argsort(sim)[::-1])
    movies = [candidate_ids[pos] for pos in descending_positions[:5]]
    named_movies = [movieid_name_map[movie] for movie in movies]
    watchedMovieNames = [
        movieid_name_map[movieid] for movieid in rf.moviesWatched
    ]
    print(watchedMovieNames)
    print("---------------------------------------------")
    print('Top 5 movies and their similarity scores: \n' +
          str(list(zip(named_movies, top_scores))) + "\n")

    while True:
        answer = input("Would you like to give feedback 'Y'/'N': ")
        if answer == 'N':
            break
        if answer == 'Y':
            LDAFeedback(movies)
        else:
            print("Invalid Input provided. Please try again.")
def task5_2():
    """Train the decision tree on the labelled movies and print predictions for the rest.

    Movie features are unpickled from ``constants.DIRECTORY +
    "movie_feature_df2"`` and split by ``createTrainTestData``.  Training
    labels are replaced in place by integer class indices before fitting, and
    predictions are mapped back to the original labels for printing.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(constants.DIRECTORY + "movie_feature_df2", "rb") as feature_file:
        allMovieData = pickle.load(feature_file)
    (train_movies_Matrix, train_label, train_movieids, test_movies_Matrix,
     test_movieids) = createTrainTestData(allMovieData)

    uniqueLabels = list(set(train_label))
    label_to_index = {label: idx for idx, label in enumerate(uniqueLabels)}
    # Encode from a snapshot: rewriting labels while still comparing against
    # the raw values re-maps entries whose new index equals another raw label.
    for position, label in enumerate(list(train_label)):
        train_label[position] = label_to_index[label]

    train_movies_Matrix_DF = pd.DataFrame(train_movies_Matrix)
    train_movies_Matrix_DF['label'] = pd.Series(train_label)
    feature_columns = list(range(train_movies_Matrix.shape[1]))
    dtModel = DT.DecisionTree()
    dtModel.fit(train_movies_Matrix_DF[feature_columns],
                train_movies_Matrix_DF['label'])
    predictions = [
        uniqueLabels[i]
        for i in dtModel.predict(pd.DataFrame(test_movies_Matrix))
    ]
    test_movieids_names = [movieid_name_map[mid] for mid in test_movieids]
    print("Results for Decision Tree classifier as (Movie Name, Label): \n" +
          str(list(zip(test_movieids_names, predictions))) + "\n")
def task5_1():
    """Prompt for ``r`` and print r-nearest-neighbour label predictions for the test movies.

    The prompt repeats until a non-zero digit string is entered.  Each test
    movie's prediction is the most frequent label among its first ``r``
    neighbour labels as returned by ``knn.sortAllNNAndGetLabels``.
    """
    while True:
        r = input("Please enter the number of nearest neighbors 'r': ")
        if not r.isdigit():
            print(
                "A Non Integer was given as input. r should be a non zero positive integer. Please try again\n"
            )
            continue
        r = int(r)
        if r == 0:
            print(
                "0 was given as input. r should be a non zero positive integer. Please try again\n"
            )
            continue
        break

    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map
    allMovieData = DataHandler.load_movie_tag_df()
    (train_movies_Matrix, train_label, train_movieids, test_movies_Matrix,
     test_movieids) = createTrainTestData(allMovieData)
    train_sparse = sparse.csr_matrix(train_movies_Matrix)
    test_sparse = sparse.csr_matrix(test_movies_Matrix)
    neighbours = knn.NN(train_sparse, test_sparse)
    neighbour_labels = knn.sortAllNNAndGetLabels(neighbours, r, train_label)

    predictions = []
    for labels in neighbour_labels:
        nearest = labels[0:r]
        # Majority vote over the r nearest labels.
        predictions.append(max(set(nearest), key=nearest.count))

    test_movieids_names = [movieid_name_map[mid] for mid in test_movieids]
    print(
        "Results for rNearestNeighbors classifier as (Movie Name, Label): \n" +
        str(list(zip(test_movieids_names, predictions))) + "\n")
def task1c(userId):
    """Print the user's watched movies in order and the movies recommended from a CP decomposition.

    The second CP factor of the actor-movie-genre tensor is used as the movie
    representation for the movie-movie similarity matrix, which is passed to
    ``getWeightedSimilarityOrder`` together with ``userId``.
    """
    # Declared global but not referenced in this function.
    global wt
    DataHandler.createDictionaries1()
    factors = decompositions.CPDecomposition(
        DataHandler.getTensor_ActorMovieGenre(), 5)
    sorted_movie_ids = sorted(DataHandler.movie_actor_rank_map.keys())
    movie_space = pd.DataFrame(factors[1], index=sorted_movie_ids)
    movie_movie_similarity = DataHandler.movie_movie_Similarity1(movie_space)

    history = list(DataHandler.user_rated_or_tagged_date_map.get(userId))
    # Order the entries by their second field.
    history.sort(key=itemgetter(1))
    first_fields = list(zip(*history))[0]
    watched_ids_in_order = list(first_fields)

    recommended_ids = getWeightedSimilarityOrder(movie_movie_similarity,
                                                 userId)
    name_of = DataHandler.movieid_name_map
    recommended_names = [name_of[movie_id] for movie_id in recommended_ids]
    watched_names = [name_of[movie_id] for movie_id in watched_ids_in_order]
    print('Movies Watched by the user in order: ' + str(watched_names))
    print('Top 5 movies : ' + str(recommended_names))
def _promptLayersAndHashes(retryHint=""):
    """Read the LSH layer count 'L' and hashes per layer 'k' from stdin.

    Both prompts restart from 'L' whenever either answer is not a decimal
    integer.

    Args:
        retryHint: text appended to the rejection message, before the
            trailing newline.

    Returns:
        Tuple ``(L, k)`` of ints.
    """
    while True:
        layersText = input("Please enter the number of Layers 'L': ")
        # isdecimal() accepts exactly what int() can parse; isdigit() also
        # accepts characters such as superscript digits that make int() raise.
        if not layersText.isdecimal():
            print("A Non Integer was given as input. L should be an integer." +
                  retryHint + "\n")
            continue
        hashesText = input("Please enter the number of hashes per layer 'k': ")
        if not hashesText.isdecimal():
            print("A Non Integer was given as input. k should be an integer." +
                  retryHint + "\n")
            continue
        return int(layersText), int(hashesText)


def _promptMovieAndNeighborCount(knownMovieIds):
    """Read a known movie id and a positive neighbour count 'r' from stdin.

    Any rejected answer restarts the dialogue from the movie id prompt.

    Args:
        knownMovieIds: container supporting ``in`` with the valid movie ids.

    Returns:
        Tuple ``(movieid, r)`` of ints, with ``r`` greater than zero.
    """
    while True:
        movieText = input("Please enter a movieID: ")
        if not movieText.isdecimal():
            print(
                "A Non Integer was given as input. movieid should be an integer. Please try again\n"
            )
            continue
        movieid = int(movieText)
        if movieid not in knownMovieIds:
            print("The given movieid does not exist. Please try again\n")
            continue
        countText = input("Please enter the number of nearest neighbors 'r': ")
        if not countText.isdecimal():
            print(
                "A Non Integer was given as input. r should be a non zero positive integer. Please try again\n"
            )
            continue
        r = int(countText)
        if r == 0:
            print(
                "0 was given as input. r should be a non zero positive integer. Please try again\n"
            )
            continue
        return movieid, r


def task3():
    """Interactive LSH index construction and r-nearest-neighbour search.

    Loads the movie latent-space CSV and the list of movie ids to index, asks
    for 'L' and 'k', builds the LSH index, and then loops over a menu:

    * ``R`` asks for 'L' and 'k' again and rebuilds the index,
    * ``S`` asks for a movie id and 'r', prints the LSH and brute-force
      neighbours, and repeatedly offers relevance feedback via ``task4``
      until the user answers ``N``,
    * ``X`` exits.

    Returns:
        None. All interaction happens through stdin/stdout.
    """
    #3.1
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map
    latentSpacePath = constants.DIRECTORY + 'MoviesinLatentSpace_SVD_MDS.csv'
    MoviesinLatentSpace = pd.read_csv(latentSpacePath, index_col=0)
    # NOTE(review): this loads the same CSV as MoviesinLatentSpace; the name
    # suggests a different (semantics-in-tags) file was intended - confirm.
    SemanticsInTagsDf = pd.read_csv(latentSpacePath, index_col=0)
    moviesList = list(MoviesinLatentSpace.index)
    MoviesinLatentSpace_Matrix = np.matrix(MoviesinLatentSpace,
                                           dtype=np.float32)
    print("Mapped all the movies to 500 dimensional space\n")
    d = len(MoviesinLatentSpace.columns)
    w = constants.W

    inputFile = pd.read_csv(constants.DIRECTORY + 'Task3_MovieIds.csv',
                            header=None)
    movieids_input = list(inputFile[0])
    num_moviesForIndexing = len(movieids_input)

    # Row positions (within the latent-space frame) of the movies to index.
    movieidsIndices_input = [moviesList.index(mid) for mid in movieids_input]
    MoviesinLatentSpace_Matrix_Input = MoviesinLatentSpace_Matrix[
        movieidsIndices_input]

    L, k = _promptLayersAndHashes()
    print("Creating the index structure, considering " +
          str(num_moviesForIndexing) + " movies")
    #layerTables stores L*K random 'd' dimensional vectors and random offset values 'b'
    #LHashTables_result contains hashtables for each layer with keys provided by its K hash functions and values as the movie indices
    layerTables, LHashTables_result = lsh.createAndGetLSH_IndexStructure(
        L, k, d, w, MoviesinLatentSpace_Matrix_Input)
    print("Index Structure Created\n")

    # The menu is shown on every iteration; the loop is only left through 'X'.
    # No state flags are carried between iterations, so an empty search result
    # can never leave the loop spinning without prompting.
    while True:
        print("To Re-Index the index structure Press 'R'")
        print("To perform rNearestNeigbhor Search Press 'S'")
        print("To Exit Press 'X'")
        userInput = input("Your Response: ")
        if userInput == 'X':
            print("Exiting..")
            break
        if userInput == "R":
            print("Re-Indexing..")
            L, k = _promptLayersAndHashes(" Please try again")
            print("Creating the index structure, considering " +
                  str(num_moviesForIndexing) + " movies")
            layerTables, LHashTables_result = lsh.createAndGetLSH_IndexStructure(
                L, k, d, w, MoviesinLatentSpace_Matrix_Input)
            print("Index Structure Created Again\n")
            continue
        if userInput != "S":
            print("Invalid input. Please choose among the following: \n")
            continue

        movieid, r = _promptMovieAndNeighborCount(MoviesinLatentSpace.index)
        moviePoint = MoviesinLatentSpace_Matrix[moviesList.index(
            movieid)].astype(np.float32)
        (nearestMovies, nearestMoviesBruteForce, nearestMoviesDistance,
         nearestMoviesDistanceBruteForce
         ) = rNearestNeighborSimilarMovies.getRNearestNeighbors(
             movieid, moviePoint, r, MoviesinLatentSpace, layerTables,
             LHashTables_result, movieidsIndices_input, movieids_input)
        # Only the first row of each distance structure is used, cut to r
        # entries (assumes the helper returns row-shaped results - confirm).
        nearestMoviesDistance = list(np.array(nearestMoviesDistance)[0])[:r]
        nearestMoviesDistanceBruteForce = list(
            np.array(nearestMoviesDistanceBruteForce)[0])[:r]

        if len(nearestMovies) == 0:
            print(
                "The LSH based index structure didn't map any other movie in the same buckets.\n"
            )
            continue
        if len(nearestMovies) != r:
            print(
                "The LSH based index structure didn't map enough movies in the same buckets.\n"
            )

        nearestMoviesNames = [movieid_name_map[mid] for mid in nearestMovies]
        nearestMoviesBruteForceNames = [
            movieid_name_map[mid] for mid in nearestMoviesBruteForce
        ]
        print("Movies Similar to '" + str(movieid_name_map[movieid]) + "'\n")
        print(
            "Results based on the LSH based rNearestNeighbors and their distance scores: \n"
            + str(list(zip(nearestMoviesNames, nearestMoviesDistance))) +
            "\n")
        print(
            "Results based on Brute Force rNearestNeighbors and their distance scores: \n"
            + str(
                list(
                    zip(nearestMoviesBruteForceNames,
                        nearestMoviesDistanceBruteForce))) + "\n")

        # Feedback may be given any number of times; only 'N' ends the round.
        while True:
            feedback = input("Would you like to give feedback 'Y'/'N': ")
            if feedback == 'N':
                break
            if feedback == 'Y':
                task4(moviePoint, r, movieid, LHashTables_result,
                      MoviesinLatentSpace, layerTables, nearestMovies,
                      movieidsIndices_input, movieids_input,
                      SemanticsInTagsDf)
            else:
                print("Invalid Input provided. Please try again.")