class ActorTag(object):
    """
    Class to relate actors and tags.
    """

    def __init__(self):
        """
        Initializing the data extractor object to get data from the csv files
        """
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def assign_idf_weight(self, data_series, unique_tags):
        """
        Compute the idf weight (log base 2) of every tag, treating each entry
        of ``data_series`` as one document.

        :param data_series: pandas Series whose values are iterables of tags
            (one entry per movie)
        :param unique_tags: iterable of every tag that should get a weight
        :return: dictionary of tags and idf weights; a tag that occurs in no
            document gets weight 0.0 instead of raising ZeroDivisionError
        :raises KeyError: if a document holds a tag missing from unique_tags
        """
        document_frequency = {tag: 0 for tag in unique_tags}
        for tag_list in data_series:
            for tag in tag_list:
                document_frequency[tag] += 1

        document_count = len(data_series.index)
        return {
            tag: math.log(document_count / count, 2) if count else 0.0
            for tag, count in document_frequency.items()
        }

    def assign_tf_weight(self, tag_series):
        """
        Compute the tf weight (relative frequency) of every tag of a movie.

        :param tag_series: iterable of tags, repeated once per occurrence
        :return: dictionary of tags and tf weights (empty for empty input)
        """
        counter = Counter(tag_series)
        total = sum(counter.values())
        return {tag: count / total for tag, count in counter.items()}

    def assign_rank_weight(self, data_frame):
        """
        Assign a value on a scale of 100 to every actor rank of every movie;
        rank 1 maps to 100 and the largest rank of a movie to the smallest
        value.

        :param data_frame: data frame with the columns ``movieid`` and
            ``actor_movie_rank``
        :return: dictionary of (movieid, actor_rank) to the computed
            rank_weight
        """
        movie_rank_weight_dict = {}
        for movieid, info_df in data_frame.groupby("movieid"):
            ranks = info_df.actor_movie_rank
            max_rank = ranks.max()
            for rank in ranks.unique():
                movie_rank_weight_dict[(movieid, rank)] = (
                    (max_rank - rank + 1) / max_rank * 100)
        return movie_rank_weight_dict

    def get_model_weight(self, tf_weight_dict, idf_weight_dict,
                         rank_weight_dict, tag_df, model):
        """
        Add a ``value`` column to ``tag_df`` holding the combined weight of
        every row.

        For model "TF" the value is ``tf * 100 + rank_weight``. For any other
        model it is ``timestamp_weight + tf * idf * 100 + rank_weight``.
        Weights missing from a dictionary count as 0.

        :param tf_weight_dict: dictionary of movieid to {tag: tf weight}
        :param idf_weight_dict: dictionary of tag to idf weight
        :param rank_weight_dict: dictionary of (movieid, rank) to rank weight
        :param tag_df: data frame with the columns ``timestamp_weight``,
            ``tag``, ``movieid`` and ``actor_movie_rank``; modified in place
        :param model: "TF" or any other value (treated as TF-IDF)
        :return: data_frame with column of the combined weight
        """
        tf_only = model == "TF"
        values = []
        rows = zip(tag_df.timestamp_weight, tag_df.tag, tag_df.movieid,
                   tag_df.actor_movie_rank)
        for ts_weight, tag, movieid, rank in rows:
            # Default to an empty dict: the previous default of 0 raised
            # AttributeError on the chained .get() for an unknown movie id.
            tf_weight = tf_weight_dict.get(movieid, {}).get(tag, 0)
            rank_weight = rank_weight_dict.get((movieid, rank), 0)
            if tf_only:
                # NOTE(review): the TF model leaves out timestamp_weight while
                # the other branch includes it - confirm this is intended.
                values.append(tf_weight * 100 + rank_weight)
            else:
                values.append(
                    ts_weight
                    + tf_weight * idf_weight_dict.get(tag, 0) * 100
                    + rank_weight)
        tag_df["value"] = pd.Series(values, index=tag_df.index)
        return tag_df

    def combine_computed_weights(self, data_frame, rank_weight_dict,
                                 idf_weight_dict, model):
        """
        Triggers the weighing process and sums up all the calculated weights
        for each tag.

        :param data_frame: rows to weigh; needs the columns ``movieid``,
            ``tag``, ``actor_movie_rank`` and ``timestamp_weight``
        :param rank_weight_dict: dictionary of (movieid, rank) to rank weight
        :param idf_weight_dict: dictionary of tag to idf weight
        :param model: "TF" or any other value (treated as TF-IDF)
        :return: dictionary of tags and summed weights, ordered by descending
            weight
        """
        tag_df = data_frame.reset_index()

        # Group the tags of each movie as a list. Joining them with ',' and
        # splitting again would break every tag that itself contains a comma.
        tags_per_movie = tag_df.groupby(['movieid'])['tag'].apply(list)
        tf_weight_dict = {
            movie: self.assign_tf_weight(tags)
            for movie, tags in tags_per_movie.items()
        }

        tag_df = self.get_model_weight(tf_weight_dict, idf_weight_dict,
                                       rank_weight_dict, tag_df, model)
        tag_df["total"] = tag_df.groupby(['tag'])['value'].transform('sum')
        tag_df = tag_df.drop_duplicates("tag").sort_values("total",
                                                           ascending=False)
        return dict(zip(tag_df.tag, tag_df.total))

    def merge_movie_actor_and_tag(self, actorid, model):
        """
        Merges data from different csv files necessary to compute the tag
        weights for an actor, assigns weights to timestamp and combines all
        weights.

        :param actorid: id of the actor whose tags are weighed
        :param model: 'TFIDF' to include idf weights; any other value skips
            the idf computation
        :return: dictionary of tags and weights for the given actor
        """
        mov_act = self.data_extractor.get_movie_actor_data()
        ml_tag = self.data_extractor.get_mltags_data()
        genome_tag = self.data_extractor.get_genome_tags_data()
        actor_info = self.data_extractor.get_imdb_actor_info_data()

        actor_movie_info = mov_act.merge(actor_info, how="left",
                                         left_on="actorid", right_on="id")
        tag_data_frame = ml_tag.merge(genome_tag, how="left",
                                      left_on="tagid", right_on="tagId")
        merged_data_frame = actor_movie_info.merge(tag_data_frame, how="left",
                                                   on="movieid")

        # Rows without a timestamp are movies that matched no tag.
        merged_data_frame = merged_data_frame[
            merged_data_frame['timestamp'].notnull()]
        merged_data_frame = merged_data_frame.drop(["userid"], axis=1)

        rank_weight_dict = self.assign_rank_weight(
            merged_data_frame[['movieid', 'actor_movie_rank']])

        # Newer timestamps get a larger weight, on a scale of 10.
        merged_data_frame = merged_data_frame.sort_values(
            "timestamp", ascending=True).reset_index()
        data_frame_len = len(merged_data_frame.index)
        merged_data_frame["timestamp_weight"] = pd.Series(
            [(index + 1) / data_frame_len * 10
             for index in merged_data_frame.index],
            index=merged_data_frame.index)

        idf_weight_dict = {}
        if model == 'TFIDF':
            # idf is computed over all movies, not only those of the actor.
            idf_weight_dict = self.assign_idf_weight(
                merged_data_frame.groupby('movieid')['tag'].apply(set),
                merged_data_frame.tag.unique())

        return self.combine_computed_weights(
            merged_data_frame[merged_data_frame['actorid'] == actorid],
            rank_weight_dict, idf_weight_dict, model)
class Util(object):
    """
    Class containing all the common utilities used across the entire code base
    """

    def __init__(self):
        """
        Resolve the data set location relative to this file and load the
        ratings, movies, actor info and genome tag data frames.
        """
        self.conf = ParseConfig()
        self.data_set_loc = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            self.conf.config_section_mapper("filePath").get("data_set_loc"))
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.mlratings = self.data_extractor.get_mlratings_data()
        self.mlmovies = self.data_extractor.get_mlmovies_data()
        self.imdb_actor_info = self.data_extractor.get_imdb_actor_info_data()
        self.genome_tags = self.data_extractor.get_genome_tags_data()

    def get_sorted_actor_ids(self):
        """
        Obtain sorted actor ids

        :return: the ``id`` column of the actor info data, sorted ascending
        """
        actor_info = self.data_extractor.get_imdb_actor_info_data()
        return actor_info.id.sort_values()

    def get_movie_id(self, movie):
        """
        Obtain the movie id for the movie name passed as input

        :param movie: movie name
        :return: movie id
        :raises IndexError: if no movie has that name
        """
        movie_data = self.mlmovies[self.mlmovies['moviename'] == movie]
        return movie_data['movieid'].unique()[0]

    def get_average_ratings_for_movie(self, movie_id):
        """
        Obtain average rating for movie

        :param movie_id: id of the movie
        :return: average movie rating
        :raises ZeroDivisionError: if the movie has no ratings
        """
        ratings = self.mlratings[self.mlratings['movieid'] == movie_id]['rating']
        return sum(ratings) / float(len(ratings))

    def get_actor_name_for_id(self, actor_id):
        """
        actor name for id

        :param actor_id: id of the actor
        :return: actor name for the actor id
        :raises IndexError: if no actor has that id
        """
        actor_data = self.imdb_actor_info[
            self.imdb_actor_info['id'] == actor_id]
        return actor_data['name'].unique()[0]

    def get_movie_name_for_id(self, movieid):
        """
        movie name for movie id

        :param movieid: id of the movie
        :return: movie name
        :raises IndexError: if no movie has that id
        """
        movie_data = self.mlmovies[self.mlmovies['movieid'] == movieid]
        return movie_data['moviename'].unique()[0]

    def get_tag_name_for_id(self, tag_id):
        """
        tag name for tag id

        :param tag_id: id of the tag
        :return: tag name
        :raises IndexError: if no tag has that id
        """
        tag_data = self.genome_tags[self.genome_tags['tagId'] == tag_id]
        return tag_data['tag'].unique()[0]

    def partition_factor_matrix(self, matrix, no_of_partitions, entity_names):
        """
        Function to partition the factor matrix into groups as per 2-norm
        distance. The range between the smallest and the largest row norm is
        cut into ``no_of_partitions`` equally wide groups.

        :param matrix: sequence of rows, one row of numbers per entity
        :param no_of_partitions: number of groups to create
        :param entity_names: name of the entity of each row, in row order
        :return: dictionary mapping
            "Group <n> ( <lower bound> , <upper bound> )" to the list of
            entity names that fall into that group
        """
        entity_dict = {}
        for i, row in enumerate(matrix):
            squared_length = sum(abs(value) ** 2 for value in row)
            entity_dict[entity_names[i]] = math.sqrt(squared_length)

        max_length = float(max(entity_dict.values()))
        min_length = float(min(entity_dict.values()))
        length_of_group = (max_length - min_length) / float(no_of_partitions)

        # Build every label once so that creating and filling the groups can
        # never disagree on the formatted bounds.
        labels = [
            "Group " + str(i + 1) + " ( "
            + str(min_length + float(i * length_of_group)) + " , "
            + str(min_length + float((i + 1) * length_of_group)) + " )"
            for i in range(no_of_partitions)
        ]
        groups = {label: [] for label in labels}

        for name, entity_length in entity_dict.items():
            if length_of_group > 0:
                group_no = math.ceil(
                    float(entity_length - min_length) / float(length_of_group))
            else:
                # All norms are equal: the groups have zero width, so every
                # entity belongs to the first one (avoids dividing by zero).
                group_no = 1
            # The minimum lands on 0 and rounding can push the maximum to
            # no_of_partitions + 1; clamp both into the valid range.
            group_no = min(max(group_no, 1), no_of_partitions)
            groups[labels[group_no - 1]].append(name)
        return groups

    def get_latent_semantics(self, r, matrix):
        """
        Function to obtain the latent semantics for the factor matrix

        :param r: number of latent semantics to return
        :param matrix: iterable of latent semantics
        :return: top 'r' latent semantics (the first 'r' rows of the matrix)
        """
        latent_semantics = []
        for latent_semantic in matrix:
            if len(latent_semantics) == r:
                break
            latent_semantics.append(latent_semantic)
        return latent_semantics

    def print_partitioned_entities(self, groupings):
        """
        Pretty print groupings

        :param groupings: dictionary of group label to list of entities
        """
        for label, entities in groupings.items():
            print(label)
            if len(entities) == 0:
                print("NO ELEMENTS IN THIS GROUP\n")
                continue
            for entity in entities:
                print(entity, end="|")
            print("\n")

    def print_latent_semantics(self, latent_semantics, entity_names_list):
        """
        Pretty print latent semantics, each as a sum of weighted entity
        names ordered by descending weight

        :param latent_semantics: iterable of latent semantics
        :param entity_names_list: entity name of each component, in order
        """
        for latent_semantic in latent_semantics:
            print("Latent Semantic:")
            weights = {}
            for i, entity_name in enumerate(entity_names_list):
                weights[entity_name] = float(latent_semantic[i])
            # value-based sorting
            for name in sorted(weights, key=weights.get, reverse=True):
                print(str(name) + "*(" + str(weights[name]) + ")", end="")
                print(" + ", end="")
            print("\n")

    def CPDecomposition(self, tensor, rank):
        """
        Perform CP Decomposition

        :param tensor: tensor to decompose
        :param rank: rank of the decomposition
        :return: factor matrices obtained after decomposition
        """
        return decomp.parafac(tensor, rank)

    def SVD(self, matrix):
        """
        Perform SVD

        :param matrix: matrix to decompose
        :return: factor matrices and the core matrix
        """
        U, s, Vh = linalg.svd(matrix, full_matrices=False)
        return (U, s, Vh)

    def PCA(self, matrix):
        """
        Perform PCA

        :param matrix: data matrix with one variable per column
        :return: factor matrices and the core matrix
        """
        # Computing covariance matrix
        cov_df = numpy.cov(matrix, rowvar=False)
        # Calculating PCA
        U, s, Vh = linalg.svd(cov_df)
        return (U, s, Vh)

    def LDA(self, input_compound_list, num_topics, num_features):
        """
        Perform LDA

        :param input_compound_list: list of tokenized documents
        :param num_topics: number of topics to generate
        :param num_features: number of terms to show per topic
        :return: object topic distribution and topics
        """
        # turn our tokenized documents into a id <-> term dictionary
        dictionary = corpora.Dictionary(input_compound_list)
        # convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in input_compound_list]
        # generate LDA model
        lda = gensim.models.ldamodel.LdaModel(corpus, num_topics,
                                              id2word=dictionary, passes=20)
        latent_semantics = lda.print_topics(num_topics, num_features)
        corpus = lda[corpus]
        return corpus, latent_semantics

    def get_doc_topic_matrix(self, u, num_docs, num_topics):
        """
        Reconstructing data: turn the (topic, probability) pairs of every
        document into a dense matrix, with zero for topics not listed

        :param u: sequence of documents, each a sequence of
            (topic_no, probability) pairs
        :param num_docs: number of rows of the result
        :param num_topics: number of columns of the result
        :return: reconstructed data as a num_docs x num_topics matrix
        """
        u_matrix = numpy.zeros(shape=(num_docs, num_topics))
        for doc_no, doc in enumerate(u):
            for topic_no, prob in doc:
                u_matrix[doc_no, topic_no] = prob
        return u_matrix
class LdaActorTag(object):
    """
    Class to find the actors most similar to a given actor, using LDA on the
    tags of the movies each actor played in.
    """

    def __init__(self):
        """
        Initializing the data extractor and the shared utilities
        """
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.util = Util()

    def get_related_actors_lda(self, actorid):
        """
        Function to find similarity between actors using actor-actor
        similarity vector in tag space using lda. The actor-actor matrix is
        also written to "actor_actor_matrix_with_svd_latent_values.csv" in
        the working directory.

        :param actorid: id of the actor to find related actors for
        :return: list of up to 10 (actor name, similarity) tuples ordered by
            descending similarity, or None if the actor has no tag data
        """
        mov_act = self.data_extractor.get_movie_actor_data()
        ml_tag = self.data_extractor.get_mltags_data()
        genome_tag = self.data_extractor.get_genome_tags_data()
        actor_info = self.data_extractor.get_imdb_actor_info_data()

        actor_movie_info = mov_act.merge(actor_info, how="left",
                                         left_on="actorid", right_on="id")
        tag_data_frame = ml_tag.merge(genome_tag, how="left",
                                      left_on="tagid", right_on="tagId")
        merged_data_frame = tag_data_frame.merge(actor_movie_info, how="left",
                                                 on="movieid")
        merged_data_frame = merged_data_frame.fillna('')

        # One document per actor: the list of tags of all their movies.
        tag_df = merged_data_frame.groupby(
            ['actorid'])['tag'].apply(list).reset_index()
        tag_df = tag_df.sort_values('actorid')
        actorid_list = tag_df.actorid.tolist()
        tag_documents = list(tag_df.iloc[:, 1])

        (U, Vh) = self.util.LDA(tag_documents, num_topics=5,
                                num_features=100000)
        actor_topic_matrix = self.util.get_doc_topic_matrix(
            U, num_docs=len(actorid_list), num_topics=5)
        actor_actor_matrix = numpy.dot(actor_topic_matrix,
                                       actor_topic_matrix.transpose())
        # The file is still written for external use; the matrix itself is
        # used directly instead of being read back from the file.
        numpy.savetxt("actor_actor_matrix_with_svd_latent_values.csv",
                      actor_actor_matrix, delimiter=",")

        # Row i of the matrix belongs to actorid_list[i], so the lookup has
        # to use that list; the list of all known actor ids only lines up
        # with the rows when every actor has tag data.
        index_actor = None
        for i, candidate_id in enumerate(actorid_list):
            if candidate_id == actorid:
                index_actor = i
                break
        if index_actor is None:
            print("Actor Id not found.")
            return None

        actor_names = [self.util.get_actor_name_for_id(int(actor_id))
                       for actor_id in actorid_list]
        actor_row = actor_actor_matrix[index_actor].tolist()
        actor_actor_dict = dict(zip(actor_names, actor_row))
        # An actor is trivially similar to themselves; drop that entry.
        del actor_actor_dict[self.util.get_actor_name_for_id(int(actorid))]

        ranked_actors = sorted(actor_actor_dict.items(),
                               key=operator.itemgetter(1), reverse=True)
        print(ranked_actors[0:10])
        return ranked_actors[0:10]