Example #1

# Imports assumed by this snippet; DataExtractor and conf come from
# project-specific modules that are not shown here.
import math
from collections import Counter

import pandas as pd

class ActorTag(object):
    """
    Class to relate actors and tags.
    """
    def __init__(self):
        """
        Initializing the data extractor object to get data from the csv files
        """
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def assign_idf_weight(self, data_series, unique_tags):
        """
        This function computes the idf weight for all tags in a data frame,
        considering each movie as a document
        :param data_series:
        :param unique_tags:
        :return: dictionary of tags and idf weights
        """
        idf_counter = {tag: 0 for tag in unique_tags}
        for tag_list in data_series:
            for tag in tag_list:
                idf_counter[tag] += 1
        for tag, count in list(idf_counter.items()):
            idf_counter[tag] = math.log(len(data_series.index) / count, 2)
        return idf_counter

    def assign_tf_weight(self, tag_series):
        """
        This function computes the tf weight for all tags for a movie
        :param tag_series:
        :return: dictionary of tags and tf weights
        """
        counter = Counter()
        for each in tag_series:
            counter[each] += 1
        total = sum(counter.values())
        for each in counter:
            counter[each] = (counter[each] / total)
        return dict(counter)

    def assign_rank_weight(self, data_frame):
        """
        This function assigns a value for all the actors in a movie on a scale of 100,
         based on their rank in the movie.
        :param data_frame:
        :return: dictionary of (movieid, actor_rank) to the computed rank_weight
        """
        groupby_movies = data_frame.groupby("movieid")
        movie_rank_weight_dict = {}
        for movieid, info_df in groupby_movies:
            max_rank = info_df.actor_movie_rank.max()
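            # Best rank (1) maps to a weight of 100; the worst rank maps close to 0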
            for rank in info_df.actor_movie_rank.unique():
                movie_rank_weight_dict[(
                    movieid, rank)] = (max_rank - rank + 1) / max_rank * 100
        return movie_rank_weight_dict

    def get_model_weight(self, tf_weight_dict, idf_weight_dict,
                         rank_weight_dict, tag_df, model):
        """
        This function combines tf_weight on a scale of 100, idf_weight on a scale of 100,
        actor_rank for each tag on a scale of 100 and timestamp_weight on a scale of 10, based on the model.
        :param tf_weight_dict, idf_weight_dict, rank_weight_dict, tag_df, model
        :return: data_frame with column of the combined weight
        """
        if model == "TF":
            tag_df["value"] = pd.Series(
                [(tf_weight_dict.get(movieid, 0).get(tag, 0) * 100) +
                 rank_weight_dict.get((movieid, rank), 0)
                 for index, ts_weight, tag, movieid, rank in zip(
                     tag_df.index, tag_df.timestamp_weight, tag_df.tag,
                     tag_df.movieid, tag_df.actor_movie_rank)],
                index=tag_df.index)
        else:
            tag_df["value"] = pd.Series(
                [(ts_weight +
                  (tf_weight_dict.get(movieid, 0).get(tag, 0) *
                   (idf_weight_dict.get(tag, 0)) * 100) + rank_weight_dict.get(
                       (movieid, rank), 0))
                 for index, ts_weight, tag, movieid, rank in zip(
                     tag_df.index, tag_df.timestamp_weight, tag_df.tag,
                     tag_df.movieid, tag_df.actor_movie_rank)],
                index=tag_df.index)
        return tag_df

    def combine_computed_weights(self, data_frame, rank_weight_dict,
                                 idf_weight_dict, model):
        """
        Triggers the weighing process and sums up all the calculated weights for each tag
        :param data_frame:
        :param rank_weight_dict:
        :param idf_weight_dict:
        :param model:
        :return: dictionary of tags and weights
        """
        tag_df = data_frame.reset_index()
        temp_df = tag_df.groupby(
            ['movieid'])['tag'].apply(lambda x: ','.join(x)).reset_index()
        movie_tag_dict = dict(zip(temp_df.movieid, temp_df.tag))
        tf_weight_dict = {
            movie: self.assign_tf_weight(tags.split(','))
            for movie, tags in list(movie_tag_dict.items())
        }
        tag_df = self.get_model_weight(tf_weight_dict, idf_weight_dict,
                                       rank_weight_dict, tag_df, model)
        tag_df["total"] = tag_df.groupby(['tag'])['value'].transform('sum')
        tag_df = tag_df.drop_duplicates("tag").sort_values("total",
                                                           ascending=False)
        actor_tag_dict = dict(zip(tag_df.tag, tag_df.total))
        return actor_tag_dict

    def merge_movie_actor_and_tag(self, actorid, model):
        """
        Merges data from the different csv files needed to compute the tag weights for an actor
        and assigns a weight to each timestamp.
        :param actorid:
        :param model:
        :return: dictionary of tags and weights for the given actor
        """
        mov_act = self.data_extractor.get_movie_actor_data()
        ml_tag = self.data_extractor.get_mltags_data()
        genome_tag = self.data_extractor.get_genome_tags_data()
        actor_info = self.data_extractor.get_imdb_actor_info_data()
        actor_movie_info = mov_act.merge(actor_info,
                                         how="left",
                                         left_on="actorid",
                                         right_on="id")
        tag_data_frame = ml_tag.merge(genome_tag,
                                      how="left",
                                      left_on="tagid",
                                      right_on="tagId")
        merged_data_frame = actor_movie_info.merge(tag_data_frame,
                                                   how="left",
                                                   on="movieid")
        merged_data_frame = merged_data_frame[
            merged_data_frame['timestamp'].notnull()]
        merged_data_frame = merged_data_frame.drop(["userid"], axis=1)
        rank_weight_dict = self.assign_rank_weight(
            merged_data_frame[['movieid', 'actor_movie_rank']])
        merged_data_frame = merged_data_frame.sort_values(
            "timestamp", ascending=True).reset_index()
        data_frame_len = len(merged_data_frame.index)
        merged_data_frame["timestamp_weight"] = pd.Series(
            [(index + 1) / data_frame_len * 10
             for index in merged_data_frame.index],
            index=merged_data_frame.index)
        if model == 'TFIDF':
            idf_weight_dict = self.assign_idf_weight(
                merged_data_frame.groupby('movieid')['tag'].apply(set),
                merged_data_frame.tag.unique())
            tag_dict = self.combine_computed_weights(
                merged_data_frame[merged_data_frame['actorid'] == actorid],
                rank_weight_dict, idf_weight_dict, model)
        else:
            tag_dict = self.combine_computed_weights(
                merged_data_frame[merged_data_frame['actorid'] == actorid],
                rank_weight_dict, {}, model)

        return tag_dict
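
# A minimal usage sketch (assumes the configured csv data set is available; the
# actor id 17838 and the "TFIDF" model name below are illustrative values, not
# taken from the original code):
if __name__ == "__main__":
    actor_tag = ActorTag()
    tag_weights = actor_tag.merge_movie_actor_and_tag(17838, "TFIDF")
    # tag_weights maps each tag to its combined weight for this actor,
    # sorted from highest to lowest weight
    print(list(tag_weights.items())[:5])

Example #2

# Imports assumed by this snippet; ParseConfig and DataExtractor are project
# modules, and decomp, linalg, corpora and gensim are expected to be imported
# at module level from the tensor-decomposition, SVD and LDA libraries the project uses.
import math
import os

import numpy
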
class Util(object):
    """
    Class containing all the common utilities used across the entire code base
    """
    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            self.conf.config_section_mapper("filePath").get("data_set_loc"))
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.mlratings = self.data_extractor.get_mlratings_data()
        self.mlmovies = self.data_extractor.get_mlmovies_data()
        self.imdb_actor_info = self.data_extractor.get_imdb_actor_info_data()
        self.genome_tags = self.data_extractor.get_genome_tags_data()

    def get_sorted_actor_ids(self):
        """
        Obtain sorted actor ids
        :return: list of sorted actor ids
        """
        actor_info = self.data_extractor.get_imdb_actor_info_data()
        actorids = actor_info.id
        actorids = actorids.sort_values()
        return actorids

    def get_movie_id(self, movie):
        """
        Obtain movie id for the movie name passed as input
        :param movie:
        :return: movie id
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['moviename'] == movie]
        movie_id = movie_data['movieid'].unique()

        return movie_id[0]

    def get_average_ratings_for_movie(self, movie_id):
        """
        Obtain average rating for movie
        :param movie_id:
        :return: average movie rating
        """
        all_ratings = self.mlratings
        movie_ratings = all_ratings[all_ratings['movieid'] == movie_id]

        ratings_sum = movie_ratings['rating'].sum()
        ratings_count = len(movie_ratings.index)

        return ratings_sum / float(ratings_count)

    def get_actor_name_for_id(self, actor_id):
        """
        Obtain actor name for the actor id passed as input
        :param actor_id:
        :return: actor name for the actor id
        """
        actor_data = self.imdb_actor_info[self.imdb_actor_info['id'] == actor_id]
        name = actor_data['name'].unique()

        return name[0]

    def get_movie_name_for_id(self, movieid):
        """
        Obtain movie name for the movie id passed as input
        :param movieid:
        :return: movie name
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['movieid'] == movieid]
        movie_name = movie_data['moviename'].unique()

        return movie_name[0]

    def get_tag_name_for_id(self, tag_id):
        """
        Obtain tag name for the tag id passed as input
        :param tag_id:
        :return: tag name
        """
        tag_data = self.genome_tags[self.genome_tags['tagId'] == tag_id]
        name = tag_data['tag'].unique()

        return name[0]

    def partition_factor_matrix(self, matrix, no_of_partitions, entity_names):
        """
        Function to partition the factor matrix into groups as per 2-norm distance
        :param matrix:
        :param no_of_partitions:
        :param entity_names:
        :return: dictionary containing the groups
        """
        entity_dict = {}
        for i in range(0, len(matrix)):
            length = 0
            for latent_semantic in matrix[i]:
                length += abs(latent_semantic) ** 2
            entity_dict[entity_names[i]] = math.sqrt(length)

        max_length = float(max(entity_dict.values()))
        min_length = float(min(entity_dict.values()))
        length_of_group = (float(max_length) - float(min_length)) / float(no_of_partitions)

        groups = {}
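        # Create one empty group per equal-width bin spanning [min_length, max_length]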
        for i in range(0, no_of_partitions):
            groups["Group " + str(i + 1) + " ( " + str(min_length + float(i * length_of_group)) + " , " + str(
                min_length + float((i + 1) * length_of_group)) + " )"] = []

        for key in entity_dict.keys():
            entity_length = entity_dict[key]
            group_no = math.ceil(float(entity_length - min_length) / float(length_of_group))
            if group_no == 0:
                group_no = 1
            groups["Group " + str(group_no) + " ( " + str(
                min_length + float((group_no - 1) * length_of_group)) + " , " + str(
                min_length + float(group_no * length_of_group)) + " )"].append(key)

        return groups

    def get_latent_semantics(self, r, matrix):
        """
        Function to obtain the latent semantics for the factor matrix
        :param r:
        :param matrix:
        :return: top 'r' latent semantics
        """
        latent_semantics = []
        for latent_semantic in matrix:
            if len(latent_semantics) == r:
                break
            latent_semantics.append(latent_semantic)

        return latent_semantics

    def print_partitioned_entities(self, groupings):
        """
        Pretty print groupings
        :param groupings:
        """
        for key in groupings.keys():
            print(key)
            if len(groupings[key]) == 0:
                print("NO ELEMENTS IN THIS GROUP\n")
                continue
            for entity in groupings[key]:
                print(entity, end="|")
            print("\n")

    def print_latent_semantics(self, latent_semantics, entity_names_list):
        """
        Pretty print latent semantics
        :param latent_semantics:
        :param entity_names_list:
        """
        for latent_semantic in latent_semantics:
            print("Latent Semantic:")
            dict1 = {}
            for i in range(0, len(entity_names_list)):
                dict1[entity_names_list[i]] = float(latent_semantic[i])
            for s in sorted(dict1, key=dict1.get, reverse=True):  # value-based sorting
                print(str(s) + "*(" + str(dict1[s]) + ")", end="")
                print(" + ", end="")
            print("\n")

    def CPDecomposition(self, tensor, rank):
        """
        Perform CP Decomposition
        :param tensor:
        :param rank:
        :return: factor matrices obtained after decomposition
        """
        factors = decomp.parafac(tensor, rank)
        return factors

    def SVD(self, matrix):
        """
        Perform SVD
        :param matrix:
        :return: factor matrices and the core matrix
        """

        U, s, Vh = linalg.svd(matrix, full_matrices=False)
        return (U, s, Vh)

    def PCA(self, matrix):
        """
        Perform PCA
        :param matrix:
        :return: factor matrices and the core matrix
        """

        # Computing covariance matrix
        cov_df = numpy.cov(matrix, rowvar=False)

        # Calculating PCA
        U, s, Vh = linalg.svd(cov_df)
        return (U, s, Vh)

    def LDA(self, input_compound_list, num_topics, num_features):
        """
        Perform LDA
        :param input_compound_list:
        :param num_topics:
        :param num_features:
        :return: topics and object topic distribution
        """
        # turn our tokenized documents into an id <-> term dictionary
        dictionary = corpora.Dictionary(input_compound_list)

        # convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in input_compound_list]

        # generate LDA model
        lda = gensim.models.ldamodel.LdaModel(corpus, num_topics, id2word=dictionary, passes=20)

        latent_semantics = lda.print_topics(num_topics, num_features)
        # for latent in latent_semantics:
        #     print(latent)

        corpus = lda[corpus]

        # for i in corpus:
        #     print(i)

        return corpus, latent_semantics

    def get_doc_topic_matrix(self, u, num_docs, num_topics):
        """
        Build the dense document-topic probability matrix from the LDA corpus output
        :param u:
        :param num_docs:
        :param num_topics:
        :return: document-topic matrix
        """
        u_matrix = numpy.zeros(shape=(num_docs, num_topics))

        for i in range(0, len(u)):
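            # u[i] is a sparse list of (topic_no, probability) pairs for document i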
            doc = u[i]
            for j in range(0, len(doc)):
                (topic_no, prob) = doc[j]
                u_matrix[i, topic_no] = prob

        return u_matrix
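
# A minimal usage sketch for the factor-matrix helpers (assumes the configured
# csv data set is available; the 3x2 factor matrix and entity names below are
# illustrative values, not taken from the original code):
if __name__ == "__main__":
    util = Util()
    factor_matrix = numpy.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]])
    entity_names = ["entity_a", "entity_b", "entity_c"]
    groups = util.partition_factor_matrix(factor_matrix, 2, entity_names)
    util.print_partitioned_entities(groups)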
Example #3

# Imports assumed by this snippet; DataExtractor, conf and Util come from
# project-specific modules that are not shown here.
import operator

import numpy
import pandas as pd

class LdaActorTag(object):
    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.util = Util()

    def get_related_actors_lda(self, actorid):
        """
        Function to find actors similar to the given actor, using actor-actor similarity vectors in LDA tag space
        :param actorid:
        :return: top 10 most similar actors with their similarity scores
        """
        mov_act = self.data_extractor.get_movie_actor_data()
        ml_tag = self.data_extractor.get_mltags_data()
        genome_tag = self.data_extractor.get_genome_tags_data()
        actor_info = self.data_extractor.get_imdb_actor_info_data()
        actor_movie_info = mov_act.merge(actor_info,
                                         how="left",
                                         left_on="actorid",
                                         right_on="id")
        tag_data_frame = ml_tag.merge(genome_tag,
                                      how="left",
                                      left_on="tagid",
                                      right_on="tagId")
        merged_data_frame = tag_data_frame.merge(actor_movie_info,
                                                 how="left",
                                                 on="movieid")

        merged_data_frame = merged_data_frame.fillna('')
        tag_df = merged_data_frame.groupby(
            ['actorid'])['tag'].apply(list).reset_index()

        tag_df = tag_df.sort_values('actorid')
        actorid_list = tag_df.actorid.tolist()
        tag_df = list(tag_df.iloc[:, 1])

        (corpus, latent_semantics) = self.util.LDA(tag_df, num_topics=5, num_features=100000)

        actor_topic_matrix = self.util.get_doc_topic_matrix(
            corpus, num_docs=len(actorid_list), num_topics=5)
        topic_actor_matrix = actor_topic_matrix.transpose()
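        # Actor-actor similarity: dot product of every pair of actor-topic vectors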
        actor_actor_matrix = numpy.dot(actor_topic_matrix, topic_actor_matrix)

        numpy.savetxt("actor_actor_matrix_with_lda_latent_values.csv",
                      actor_actor_matrix,
                      delimiter=",")

        df = pd.DataFrame(
            pd.read_csv('actor_actor_matrix_with_lda_latent_values.csv',
                        header=None))
        matrix = df.values

        actorids = self.util.get_sorted_actor_ids()

        index_actor = None
        for i, j in enumerate(actorids):
            if j == actorid:
                index_actor = i
                break

        if index_actor is None:
            print("Actor Id not found.")
            return None

        actor_names = []
        for actor_id in actorids:
            actor_name = self.util.get_actor_name_for_id(int(actor_id))
            actor_names.append(actor_name)

        actor_row = matrix[index_actor].tolist()
        actor_actor_dict = dict(zip(actor_names, actor_row))
        del actor_actor_dict[self.util.get_actor_name_for_id(int(actorid))]

        # for key in actor_actor_dict.keys():
        #     actor_actor_dict[key] = abs(actor_actor_dict[key])

        actor_actor_dict = sorted(actor_actor_dict.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        print(actor_actor_dict[0:10])
        return actor_actor_dict[0:10]
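
# A minimal usage sketch (assumes the configured csv data set is available;
# the actor id 17838 below is an illustrative value, not taken from the original code):
if __name__ == "__main__":
    lda_actor_tag = LdaActorTag()
    # Prints and returns the ten most similar actors with their similarity scores
    related_actors = lda_actor_tag.get_related_actors_lda(17838)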