def main():
    """Recommend the top actors most similar to a given movie.

    Expects the movie id as the first CLI argument.  Similarity is the dot
    product between each actor's tf-idf vector and the movie's tf-idf vector;
    actors already cast in the movie are excluded.  Results are printed and
    written via util helpers (`no_of_actors` / `output_file` are module-level
    settings).
    """
    if len(sys.argv) < 2:
        print('Expected arguments are not provided.')
        return
    movieid = int(sys.argv[1])

    mlmovies = util.read_mlmovies()
    movie_actors = util.read_movie_actor()
    imdb_actor_info = util.read_imdb_actor_info()

    input_movie = mlmovies[mlmovies['movieid'] == movieid]['moviename'].values[0]
    # actors already in the input movie must not be recommended
    actors_of_movie = movie_actors.where(
        movie_actors['movieid'] == movieid).dropna().loc[:, 'actorid'].unique()

    movie_matrix = util.get_movie_tf_idf_matrix()
    actor_matrix = util.get_actor_tf_idf_matrix()

    input_movie_vector = pd.DataFrame(movie_matrix.loc[movieid])
    # one column (named `movieid`) holding each actor's similarity score
    similarity_matrix = actor_matrix.dot(input_movie_vector)
    similarity_matrix = similarity_matrix[
        ~similarity_matrix.index.isin(actors_of_movie)]

    actors = []
    for index, row in similarity_matrix.iterrows():
        actor_name = imdb_actor_info[imdb_actor_info['id'] == index]['name'].values[0]
        # FIX: use the row already in hand instead of the original's redundant
        # similarity_matrix.loc[index][movieid] lookup (an extra O(n) scan per row)
        actors.append((index, actor_name, row[movieid]))
    actors.sort(key=lambda tup: tup[2], reverse=True)

    util.print_output(movieid, input_movie, actors[:no_of_actors])
    util.write_output_file(movieid, input_movie, actors[:no_of_actors], output_file)
def main():
    """Build a binary (tag x movie x rating) tensor and pickle it.

    Cell [t][m][r] is 1.0 when tag t was applied to movie m and the movie's
    average rating is <= r, else 0.0.  Output: tag_movie_rating_tensor.pkl.
    """
    mlmovies = util.read_mlmovies()
    mlratings = util.read_mlratings()
    mltags = util.read_mltags()

    movies_list = mlmovies.movieid.unique()
    ratings_list = mlratings.rating.unique()
    tags_list = mltags.tagid.unique()

    # average rating per movie, renamed to avoid clashing with raw 'rating'
    average_movie_rating = mlratings.groupby(
        ['movieid'])['rating'].mean().reset_index()
    average_movie_rating = average_movie_rating.rename(
        columns={'rating': 'rating_avg'})

    tag_movie_rating_grouped = mlmovies.merge(
        mlratings, on='movieid', how='inner').merge(
        mltags, on='movieid', how='inner').merge(
        average_movie_rating, on='movieid', how='inner')

    # FIX: the original filtered the merged frame once per (tag, movie, rating)
    # cell -- an accidental O(|tags|*|movies|*|ratings|) of full scans.  A cell
    # is 1.0 iff a row with that (tag, movie) exists and its rating_avg (which
    # is constant per movie) is <= rating, i.e. iff the group's minimum
    # rating_avg is <= rating.  Precompute that minimum once for O(1) lookups.
    min_avg = tag_movie_rating_grouped.groupby(
        ['tagid', 'movieid'])['rating_avg'].min().to_dict()

    tag_movie_rating_tensor = []
    for tag in tags_list:
        movie_rating_matrix = []
        for movie in movies_list:
            pair_avg = min_avg.get((tag, movie))
            movie_rating_matrix.append([
                1.0 if pair_avg is not None and pair_avg <= rating else 0.0
                for rating in ratings_list
            ])
        tag_movie_rating_tensor.append(movie_rating_matrix)

    # FIX: close the output file deterministically (original leaked the handle)
    with open("tag_movie_rating_tensor.pkl", "wb") as out_file:
        cPickle.dump(tag_movie_rating_tensor, out_file)
def do_tensor(input_movie_ids):
    """Score every movie against the input movies using CP decomposition.

    Loads the pre-built actor x movie x genre tensor, decomposes it with
    parafac into `no_of_components` factors, then sums order-weighted cosine
    similarities between each movie's factor vector and each input movie's
    vector (weight starts at 1 and decreases by `order_factor` per input).

    Returns (scores dict keyed by movie id, movie factor DataFrame, movie ids).
    """
    tensor_3d = cPickle.load(open("actor_movie_genre_tensor.pkl", "rb"))
    tensor_array = np.array(tensor_3d)

    # CP decomposition into per-mode factor matrices
    decomposed = parafac(tensor_array, no_of_components, init='random')

    mlmovies = util.read_mlmovies()
    mlmovies = mlmovies.loc[mlmovies['year'] >= util.movie_year_for_tensor]
    movies_list = mlmovies.movieid.unique()

    # movie-mode factor matrix, indexed by movie id
    decomposed_movies_df = pd.DataFrame(decomposed[1], index=movies_list)
    input_movie_df = decomposed_movies_df.loc[input_movie_ids]

    output_movies = {}
    for movie_id, movie_vec in decomposed_movies_df.iterrows():
        weight = 1
        score = 0
        for _, input_vec in input_movie_df.iterrows():
            score += (1 - cosine(movie_vec, input_vec)) * weight
            weight -= order_factor
        output_movies[movie_id] = score

    return output_movies, decomposed_movies_df, movies_list
def main():
    """Build a binary (actor x movie x year) tensor and pickle it.

    Cell [a][m][y] is 1.0 when actor a appears in movie m released in year y,
    else 0.0.  Output: actor_movie_year_tensor.pkl.
    """
    mlmovies = util.read_mlmovies()
    imdb_actor_info = util.read_imdb_actor_info()
    movie_actor = util.read_movie_actor()

    movies_list = mlmovies.movieid.unique()
    year_list = mlmovies.year.unique()
    actor_list = imdb_actor_info.id.unique()

    # join actor casting with movie metadata (original passed the redundant
    # on=['movieid', 'movieid'])
    actor_movie_year_grouped = pd.merge(
        movie_actor, mlmovies, on='movieid', how='inner')

    # FIX: the original ran a full-DataFrame filter per (actor, movie, year)
    # cell -- accidental O(|actors|*|movies|*|years|) scans.  A cell is 1.0
    # iff the triple appears in the merged frame; a precomputed set gives
    # O(1) membership tests.  Also removed unused locals (count, the pre-loop
    # movie_year_matrix).
    observed = set(zip(actor_movie_year_grouped.actorid,
                       actor_movie_year_grouped.movieid,
                       actor_movie_year_grouped.year))

    actor_movie_year_tensor = []
    for actor in actor_list:
        movie_year_matrix = []
        for movie in movies_list:
            movie_year_matrix.append([
                1.0 if (actor, movie, year) in observed else 0.0
                for year in year_list
            ])
        actor_movie_year_tensor.append(movie_year_matrix)

    # FIX: close the output file deterministically (original leaked the handle)
    with open("actor_movie_year_tensor.pkl", "wb") as out_file:
        cPickle.dump(actor_movie_year_tensor, out_file)
def main():
    """Build a binary (actor x movie x genre) tensor and pickle it.

    Only movies from `util.movie_year_for_tensor` onward are included.
    Cell [a][m][g] is 1 when actor a appears in movie m tagged with genre g.
    Output: actor_movie_genre_tensor.pkl.
    """
    # load the required data from csv files
    mlmovies = util.read_mlmovies()
    mlmovies = mlmovies.loc[mlmovies['year'] >= util.movie_year_for_tensor]
    movie_actor = util.read_movie_actor()
    movies_list = mlmovies.movieid.unique()
    movie_actor = movie_actor.loc[movie_actor['movieid'].isin(movies_list)]
    actor_list = movie_actor.actorid.unique()

    # split the '|' separated genre values into one (genre, movieid) row each
    movie_genre = pd.DataFrame(mlmovies.genres.str.split('|').tolist(),
                               index=mlmovies.movieid).stack()
    movie_genre = movie_genre.reset_index()[[0, 'movieid']]
    movie_genre.columns = ['genres', 'movieid']
    genres = movie_genre.genres.unique().tolist()

    # merge the actor, movie and genre details
    actor_movie_grouped = pd.merge(
        movie_actor, mlmovies, on='movieid', how='inner')
    actor_movie_genre_grouped = pd.merge(
        actor_movie_grouped, movie_genre, on='movieid', how='inner')
    # FIX: removed the original's unassigned sort_values(...) call -- its
    # result was discarded, making it a no-op.

    # axis-index lookups for each mode of the tensor
    actor_dict = {actor: i for i, actor in enumerate(actor_list)}
    movie_dict = {movie: i for i, movie in enumerate(movies_list)}
    genre_dict = {genre: i for i, genre in enumerate(genres)}

    # zero-initialised tensor (FIX: the original initialised it twice)
    actor_movie_genre_tensor = [
        [[0] * len(genres) for _ in range(len(movies_list))]
        for _ in range(len(actor_list))
    ]

    # set 1 for every observed (actor, movie, genre) combination; 'genres_y'
    # is the per-genre column contributed by the movie_genre merge
    for index, row in actor_movie_genre_grouped.iterrows():
        actor_movie_genre_tensor[actor_dict[row['actorid']]][
            movie_dict[row['movieid']]][genre_dict[row['genres_y']]] = 1

    # FIX: close the output file deterministically (original leaked the handle)
    with open("actor_movie_genre_tensor.pkl", "wb") as out_file:
        cPickle.dump(actor_movie_genre_tensor, out_file)
def partition_components(decomposed):
    """Bucket tags, movies and ratings into 5 latent-component partitions.

    Each item goes to the component (1..5) where its factor-matrix value is
    largest: decomposed[0] is the tag mode, [1] the movie mode, [2] the
    rating mode.  Results are printed and written via util helpers.
    """
    mltags = util.read_mltags()
    genome_tags = util.read_genome_tags()
    mltags = pd.merge(mltags, genome_tags,
                      left_on='tagid', right_on='tagId', how='inner')
    tags_list = mltags.tagid.unique()
    # tag ids -> human-readable names; ordering follows genome_tags row order
    # and is assumed to match the decomposition's tag axis -- TODO confirm
    tags_list = genome_tags[genome_tags['tagId'].isin(
        tags_list)]['tag'].tolist()

    mlmovies = util.read_mlmovies()
    movies_list = mlmovies.movieid.unique()
    movies_list = mlmovies[mlmovies['movieid'].isin(
        movies_list)]['moviename'].tolist()

    mlratings = util.read_mlratings()
    ratings_list = mlratings.rating.unique()

    # five empty partitions, one per latent component
    partitions = {p: {'tag': [], 'movie': [], 'rating': []}
                  for p in range(1, 6)}

    # assign each item on each axis to its argmax component
    axes = (('tag', tags_list), ('movie', movies_list),
            ('rating', ratings_list))
    for mode, (label, names) in enumerate(axes):
        for j, factor_vec in enumerate(decomposed[mode]):
            partitions[np.argmax(factor_vec) + 1][label].append(names[j])

    util.print_partition(partitions)
    util.write_partition_output_file(partitions, output_file)
def latent_year_semantics(year_matrix):
    """Report each latent component as (year, weight) pairs, strongest first.

    `year_matrix` is the year-mode factor matrix (rows = years, columns =
    components); within each component, pairs are ordered by |weight|.
    """
    year_list = util.read_mlmovies().year.unique()
    transposed = np.transpose(year_matrix)

    concepts = []
    for i in range(no_of_components):
        pairs = [(year_list[j], weight)
                 for j, weight in enumerate(transposed[i])]
        concepts.append(sorted(pairs, key=lambda p: abs(p[1]), reverse=True))

    util.print_output(concepts, 'Year')
    util.write_output_file(concepts, output_file, 'Year')
def latent_movie_semantics(movie_matrix):
    """Report each latent component as (movie name, weight) pairs, strongest first.

    `movie_matrix` is the movie-mode factor matrix (rows = movies, columns =
    components); within each component, pairs are ordered by |weight|.
    """
    mlmovies = util.read_mlmovies()
    movie_ids = mlmovies.movieid.unique()
    movie_names = mlmovies[mlmovies['movieid'].isin(
        movie_ids)]['moviename'].tolist()

    transposed = np.transpose(movie_matrix)

    concepts = []
    for i in range(no_of_components):
        pairs = [(movie_names[j], weight)
                 for j, weight in enumerate(transposed[i])]
        concepts.append(sorted(pairs, key=lambda p: abs(p[1]), reverse=True))

    util.print_output(concepts, 'Movie')
    util.write_output_file(concepts, output_file, 'Movie')
def partition_components(decomposed):
    """Bucket actors, movies and years into 5 latent-component partitions.

    Each item goes to the component (1..5) where its factor-matrix value is
    largest: decomposed[0] is the actor mode, [1] the movie mode, [2] the
    year mode.  Results are printed and written via util helpers.
    """
    # FIX: the original called util.read_imdb_actor_info() twice -- one read
    # is enough.
    imdb_actor_info = util.read_imdb_actor_info()
    actor_list = imdb_actor_info.id.unique()
    actor_list = imdb_actor_info[imdb_actor_info['id'].isin(
        actor_list)]['name'].tolist()

    mlmovies = util.read_mlmovies()
    movies_list = mlmovies.movieid.unique()
    movies_list = mlmovies[mlmovies['movieid'].isin(
        movies_list)]['moviename'].tolist()
    year_list = mlmovies.year.unique()

    # five empty partitions, one per latent component
    partitions = {p: {'actor': [], 'movie': [], 'year': []}
                  for p in range(1, 6)}

    for j, actor_vec in enumerate(decomposed[0]):
        partitions[np.argmax(actor_vec) + 1]['actor'].append(actor_list[j])
    for j, movie_vec in enumerate(decomposed[1]):
        partitions[np.argmax(movie_vec) + 1]['movie'].append(movies_list[j])
    for j, year_vec in enumerate(decomposed[2]):
        partitions[np.argmax(year_vec) + 1]['year'].append(year_list[j])

    util.print_partition(partitions)
    util.write_partition_output_file(partitions, output_file)
def main():
    """Recommend 5 movies via CP decomposition, then refine with user feedback.

    Parses input movie ids from argv, scores all movies by order-weighted
    cosine similarity in the CP movie-factor space, shows the top 5, collects
    relevance feedback, and (unless everything was relevant) issues a revised
    recommendation from a probabilistic-feedback query.
    """
    err, input_movie_ids = util.parse_input(sys.argv)
    if err:
        return

    # read the pickle file that contains the actor x movie x genre tensor
    # (FIX: close the handle deterministically; original leaked it)
    with open("actor_movie_genre_tensor.pkl", "rb") as in_file:
        actor_movie_genre_3d_matrix = cPickle.load(in_file)
    tensor_array = np.array(actor_movie_genre_3d_matrix)

    # perform cp decomposition
    decomposed = parafac(tensor_array, no_of_components, init='random')

    mlmovies = util.read_mlmovies()
    mlmovies = mlmovies.loc[mlmovies['year'] >= util.movie_year_for_tensor]
    movies_list = mlmovies.movieid.unique()

    # movie factor matrix from the cp decomposition, indexed by movie id
    decomposed_movies_df = pd.DataFrame(decomposed[1], index=movies_list)
    input_movie_df = decomposed_movies_df.loc[input_movie_ids]

    # order-weighted cosine similarity of every movie to the input movies
    output_movies = []
    for index, movie in decomposed_movies_df.iterrows():
        cosine_sum = 0
        order = 1
        for j, input_movie in input_movie_df.iterrows():
            cosine_sum += (1 - cosine(movie, input_movie)) * order
            order -= order_factor
        output_movies.append((index, cosine_sum))

    other_movies = list(
        filter(lambda tup: tup[0] not in input_movie_ids, output_movies))
    other_movies.sort(key=lambda tup: tup[1], reverse=True)
    output_movie_ids = [t[0] for t in other_movies][:5]

    # print output, log it, and gather per-movie relevance feedback
    feedback = util.process_output(input_movie_ids, output_movie_ids,
                                   output_file)
    relevant_movies, movie_to_exclude = util.process_feedback(
        feedback, input_movie_ids)
    relevant_movie_count = len(relevant_movies)

    # if all recommended movies are relevant then there is nothing to revise
    if relevant_movie_count == 5:
        # FIX: was a Python-2-only `print "..."` statement; the parenthesised
        # single-argument form behaves identically on Python 2 and 3 (the rest
        # of the project already uses print(...))
        print("\nAll the movies were relevant hence no modification to the suggestion")
        return

    # fetch factor vectors for relevant and feedback movies
    relevant_movies_df = decomposed_movies_df.loc[relevant_movies]
    # FIX: list() so a Python 3 dict view is accepted by .loc as well
    feedback_movies_df = decomposed_movies_df.loc[list(feedback.keys())]

    modified_query = util.probabilistic_feedback_query(
        feedback_movies_df, relevant_movies_df, movies_list,
        relevant_movie_count)
    revised_movie_ids = util.get_revised_movies(
        decomposed_movies_df, modified_query, movie_to_exclude)
    util.print_revised(revised_movie_ids, output_file)