예제 #1
0
def get_movie_meta_for(movie_ids: List[int]) -> List[Dict]:
    """
    Fetch the metadata records for one or more movies.

    :param movie_ids: index labels of ``Data.movie_meta()``; a single int is
        accepted as well, ``None`` entries are skipped
    :return: one dictionary per selected row (``to_dict(orient='records')``),
        with the list-like columns parsed and passed through ``add_poster_urls``
    :raises MovieNotFoundException: if the label lookup raises a ``KeyError``
    """
    # if single movie, pack into list
    if isinstance(movie_ids, int):
        movie_ids = [movie_ids]

    # materialise the ids as a real list (a lazy filter object can only be consumed once)
    movie_ids = [movie_id for movie_id in movie_ids if movie_id is not None]

    meta: pd.DataFrame = Data.movie_meta()

    try:
        # filter metadata
        meta = meta.loc[movie_ids]
    except KeyError as e:
        # keep the original KeyError attached as the cause for easier debugging
        raise MovieNotFoundException(e.args) from e

    # fetch metadata for the movies, convert to dictionary
    # orient='records' results in [{'col1': 'val1', 'col2': 'val2'}, {'col1': 'val1', ..}]
    meta_dict: List[Dict] = meta.to_dict(orient='records')

    list_columns = [
        Column.actors, Column.genres, Column.keywords, Column.directors
    ]
    for item in meta_dict:
        for col in list_columns:
            if not pd.isnull(item[col.value]):
                # SECURITY NOTE(review): eval executes arbitrary expressions; this is only safe
                # while the metadata comes from a trusted source. Consider ast.literal_eval.
                item[col.value] = eval(item[col.value])

    # return value is ignored, so this presumably extends the dictionaries in place
    add_poster_urls(meta_dict)

    return meta_dict
예제 #2
0
def get_imdb_id(movielens_id: int) -> int:
    """
    Look up the IMDb id stored for a movie.

    :param movielens_id: index label of the movie in ``Data.movie_meta()``
    :return: the value of the ``Column.imdb_id`` column for that movie
    :raises MovieNotFoundException: if the id is not in the metadata index
    """
    meta = Data.movie_meta()

    if movielens_id in meta.index:
        row = meta.loc[movielens_id]
        return row[Column.imdb_id.value]

    raise MovieNotFoundException()
예제 #3
0
    def build_index(cls):
        """
        Write one index document per movie from ``Data.movie_meta()``.

        Initialises the index via ``cls.init()`` when ``cls.ix`` is ``None``,
        upserts every movie with ``update_document`` and finally commits with
        ``optimize=True``. If anything fails while writing, the writer is
        cancelled before the exception is re-raised.
        """
        if cls.ix is None:
            cls.init()

        def is_missing(val) -> bool:
            # empty values must not be inserted (inserting fails for NaN values)
            if val is None:
                return True
            if isinstance(val, (float, np.floating)):
                # identity comparison with np.nan misses NaN values that are not
                # the np.nan singleton itself, so test the value instead
                return bool(np.isnan(val))
            return val == ''

        # the writer does NOT commit on its own; commit() is called explicitly below
        iw = cls.ix.writer()
        try:
            for movie_id, movie in Data.movie_meta().iterrows():
                # extract fields
                fields: Dict = {
                    'movie_id': movie_id,
                    'title': movie[Column.title.value],
                    # currently disabled fields:
                    # 'tagline': movie[Column.tagline.value],
                    # 'summary': movie[Column.summary.value],
                    # 'keywords': movie[Column.keywords.value],
                    # 'popularity': movie[Column.num_ratings.value],
                    # 'genres': movie[Column.genres.value],
                }
                fields = {
                    key: val
                    for key, val in fields.items()
                    if not is_missing(val)
                }

                # insert into index (replaces an existing document for the same movie)
                iw.update_document(**fields)
        except Exception:
            # assumes a Whoosh-style writer: cancel() discards pending changes
            # and releases the write lock so later writers are not blocked
            iw.cancel()
            raise

        iw.commit(optimize=True)
def get_year_relevance(movie_id: int, n: int = 0):
    """
    Score every other movie by how close its release year is to the base movie.

    :param movie_id: index label of the base movie in ``Data.movie_meta()``
    :param n: unused
    :return: Series of ``1 - |year difference| / max |year difference|``
        for all movies except the base movie
    """
    years = Data.movie_meta()[Column.release_year.value]

    # absolute distance (in years) to the base movie
    distance = (years - years.loc[movie_id]).abs()

    # scale to [0, 1] and invert, so that the closest years get the highest score
    relevance = 1 - distance / distance.max()

    return relevance.drop(movie_id)
예제 #5
0
    def get_similarities_for(cls, movie_id: int, colname: str):
        """
        Return the similarity of one movie to all other movies.

        :param movie_id: index label of the movie in ``Data.movie_meta()``
        :param colname: metadata column the similarity matrix is based on
        :return: Series of similarity values labelled with the metadata index,
            without the entry for ``movie_id`` itself
        """
        # similarity matrix for the column (calculated on demand)
        matrix = cls.calculate_similarities(colname)

        movie_index = Data.movie_meta().index

        # the matrix is addressed by position, so translate the label first
        row = matrix[movie_index.get_loc(movie_id)]

        # sparse one-row matrix -> dense 2D array -> its single row as a 1D array
        values = row.toarray()[0]

        # re-attach the original labels
        scores = pd.Series(data=values, index=movie_index)

        return scores.drop(movie_id)
예제 #6
0
    def __call__(self, movie_id: int, n: int = 5):
        """
        Run the wrapped method and remove movies of the base movie's collection.

        :param movie_id: the base movie
        :param n: number of requested results; the wrapped method is called with
            ``n + 10``
        :return: the wrapped method's result without the labels selected by
            ``get_collection_mask`` (labels missing from the result are ignored)
        """
        meta = Data.movie_meta()
        in_collection = get_collection_mask(movie_id, meta)
        collection_ids = meta[in_collection].index.values

        candidates: pd.Series = self.method(movie_id, n + 10)

        return candidates.drop(collection_ids, errors='ignore')
예제 #7
0
def get_normalized_popularity():
    """
    Return a dampened, normalized popularity value per movie (used for popularity bias).

    The number of ratings is raised to the power of 1/10 and divided by the
    resulting maximum, so the largest value is 1.

    :return: Series labelled like ``Data.movie_meta()``
    """
    num_ratings = Data.movie_meta()[Column.num_ratings.value]

    # Build new Series instead of using `**=` / `/=`: in-place operators on a column
    # taken from the metadata frame can write through to that shared frame and
    # corrupt the rating counts for every later caller.

    # apply a root to reduce linearity
    # (if movie A has double the ratings of movie B, its popularity should only be slightly higher)
    popularity = num_ratings ** (1 / 10)

    # normalize
    return popularity / popularity.max()
예제 #8
0
def get_movies_with_similar_genres(movie_id: int, n: int = 5, popularity_bias: bool = False,
                                   user_bias: bool = False, movies: pd.DataFrame = None):
    """
    Score movies by the number of genres they share with a base movie.

    :param movie_id: index label of the base movie
    :param n: number of requested movies (``None`` is treated as 5); only used
        with ``user_bias`` to limit the candidates to the ``n * 10`` best matches
    :param popularity_bias: additionally weight by the number of ratings;
        only has an effect together with ``user_bias``
    :param user_bias: weight the genre count with the mean user rating
    :param movies: frame to search instead of ``Data.movie_meta()``
    :return: Series of scores for movies sharing at least one genre with the
        base movie (the base movie itself is excluded)
    """
    if n is None:
        n = 5

    # use the preferred movie frame
    source = Data.movie_meta() if movies is None else movies
    all_genres = source[Column.genres.value]

    # get the base out of the series and remove it from the rest
    # SECURITY NOTE(review): eval executes arbitrary expressions; this is only safe
    # while the metadata comes from a trusted source. Consider ast.literal_eval.
    base_genres = eval(all_genres.loc[movie_id])
    candidates = all_genres.drop(movie_id)

    # count similar genres and remove all movies which have no genre in common
    shared = candidates.apply(
        lambda row: count_elements_in_set(row, base_genres)
    )
    shared = shared[shared > 0]

    if not user_bias:
        return shared

    # reduce the amount of movies to n * 10 movies
    top_n_mul_ten = shared.nlargest(n * 10)

    # Mean rating and number of ratings per movie. Selecting the rating column
    # BEFORE aggregating yields the same (mean, count) frame, but does not
    # aggregate unrelated columns (which is wasted work and fails on
    # non-numeric columns with newer pandas versions).
    # NOTE(review): relies on str(Column.x) yielding the column name - confirm.
    ratings_grouped = Data.ratings().groupby(str(Column.movie_id))
    measures: pd.DataFrame = ratings_grouped[str(Column.rating)].agg(['mean', 'count'])

    # merging mean, count and genre sum into one DataFrame
    measures_movies = pd.merge(measures, pd.DataFrame(top_n_mul_ten), left_index=True, right_index=True)

    if popularity_bias:
        # give more weight to the number of ratings (~popularity)
        # by raising the avg ratings to some power (to preserve some notion of good vs. bad ratings)
        # and multiplying the count back in
        # additionally multiply the genre back in
        # to prevent good rated movies with little correlation to the genres
        return measures_movies.eval('(mean ** 3) * count * genres')

    # multiply genre to prevent good rated movies with little correlation to the genres
    return measures_movies.eval('mean * genres')
예제 #9
0
    def calculate_similarities(cls,
                               colname: str,
                               overwrite_existing: bool = False):
        """
        Return the movie-to-movie similarity matrix for a metadata column.

        The matrix is stored in ``cls.similarity_matrices`` and only computed
        when it is missing or ``overwrite_existing`` is set.

        :param colname: metadata column to vectorize with ``cls.tf_idf``
        :param overwrite_existing: recompute even if a matrix is already stored
        :return: the (sparse) result of ``linear_kernel`` for that column
        """
        cache = cls.similarity_matrices

        if overwrite_existing or colname not in cache:
            # missing values are treated as empty documents
            documents = Data.movie_meta()[colname].fillna('')
            tfidf = cls.tf_idf.fit_transform(documents)
            # dense_output=False keeps the result sparse to reduce memory usage
            cache[colname] = linear_kernel(tfidf, tfidf, dense_output=False)

        return cache[colname]
def tmdb_reference(movie_id: int, n: int = 5):
    """
    Recommend the movies that are stored as "similar" in the base movie's metadata.

    :param movie_id: index label of the base movie in ``Data.movie_meta()``
    :param n: unused
    :return: integer Series mapping movie id -> artificial score; the score is
        the negated position in the stored list, so earlier entries rank higher.
        Entries that ``get_movielens_id`` cannot resolve are skipped. The
        Series is empty if nothing is stored or nothing can be resolved.
    """
    movie = Data.movie_meta().loc[movie_id]
    raw_similar = movie[Column.tmdb_similar.value]

    if pd.isnull(raw_similar):
        # a missing value cannot be parsed; treat it as "no similar movies"
        similar_tmdb = []
    else:
        # get list from string representation
        # SECURITY NOTE(review): eval executes arbitrary expressions; this is only safe
        # while the metadata comes from a trusted source. Consider ast.literal_eval.
        similar_tmdb = eval(raw_similar)

    scores = {}
    for position, tmdb_id in enumerate(similar_tmdb):
        # get movielens id from tmdb_id
        movielens_id = get_movielens_id(tmdb_id=tmdb_id)
        if movielens_id is not None:
            scores[movielens_id] = -position

    # explicit dtype: an empty Series would otherwise get dtype object,
    # which numeric operations such as nlargest() reject
    return pd.Series(scores, dtype='int64')
예제 #11
0
def get_movielens_id(tmdb_id: int = None, imdb_id: int = None) -> int:
    """
    Translate a TMDb or IMDb id into the index label used by ``Data.movie_meta()``.

    :param tmdb_id: id to look up in the ``Column.tmdb_id`` column (takes
        precedence when both ids are given)
    :param imdb_id: id to look up in the ``Column.imdb_id`` column
    :return: the index label of the first matching movie, or ``None`` if no id
        was given or no movie matches
    """
    movies: pd.DataFrame = Data.movie_meta()

    # decide which column to search
    if tmdb_id is not None:
        column, external_id = Column.tmdb_id.value, tmdb_id
    elif imdb_id is not None:
        column, external_id = Column.imdb_id.value, imdb_id
    else:
        return None

    matches = movies.query(f'{column} == {external_id}')

    return None if matches.empty else matches.index[0]
예제 #12
0
def _recommend_movies(movie_id: int, n: int, method: Method) -> List[Dict]:
    """
    Recommend movies for a base movie and return their metadata.

    :param movie_id: the base movie; must be in the index of ``Data.movie_meta()``
    :param n: number of recommendations (replaced by 20 for ``Method.reference``
        and ``Method.sequels``)
    :param method: callable that scores candidates for ``movie_id``
    :return: metadata of the base movie followed by the best-scored movies
    :raises MovieNotFoundException: if ``movie_id`` is unknown
    """
    if movie_id not in Data.movie_meta().index:
        raise MovieNotFoundException

    # score the candidates, then let the history filter the scores
    # (presumably removes recently recommended movies)
    scores: Series = History.filter(method(movie_id))

    if method in (Method.reference, Method.sequels):
        n = 20

    # result = [base_movie, ...recommendations]
    movies: List[int] = [movie_id, *scores.nlargest(n).index]

    # record the base movie and its recommendations in the history
    History.append(movies)

    return get_movie_meta_for(movies)
예제 #13
0
def get_collection(movie_id: int,
                   df: pd.DataFrame = None,
                   include_base_movie: bool = True,
                   start_from_base_movie: bool = False,
                   wrap_to_start: bool = False) -> pd.DataFrame:
    """
    Get movies from a collection.

    :param movie_id: a movie that is in a collection
    :param df: the pandas DataFrame to search (defaults to ``Data.movie_meta()``)
    :param include_base_movie: whether to include movie_id itself in the result
    :param start_from_base_movie: whether to split the result and start at movie_id
    :param wrap_to_start: if start_from_base_movie: at the end of the collection, wrap over to the start and include the prequels
    :return: a DataFrame containing the movies in the collection, ordered by release date
    :raises KeyError: if movie_id is not among the selected movies and has to be
        located (start_from_base_movie) or dropped (not include_base_movie)
    """
    if df is None:
        df = Data.movie_meta()

    # select movies that are in collection
    m = df[get_collection_mask(movie_id, df)]
    # sort by release date
    m = m.sort_values(by=Column.release_date.value)

    if start_from_base_movie:
        # Split by POSITION, not by label: after sorting by release date the index
        # is no longer ordered by id, so label slices such as `.loc[:movie_id - 1]`
        # are meaningless (and raise when the label is not part of the collection).
        # assumes the index labels are unique, so get_loc returns a single position
        base_position = m.index.get_loc(movie_id)
        sequels = m.iloc[base_position:]
        prequels = m.iloc[:base_position]

        if wrap_to_start:
            # base movie and sequels first, then the prequels
            m = pd.concat([sequels, prequels])
        else:
            # just return the movies starting with the base movie
            m = sequels

    # drop the base movie last, so that it is still available as the split point above
    if not include_base_movie:
        m = m.drop(movie_id)

    return m
예제 #14
0
    def search(cls, query_text: str, n: int, add_posters: bool = True):
        """
        Search for movies and re-rank the hits with a popularity bias.

        :param query_text: the text to search for
        :param n: number of results to return
        :param add_posters: unused
        :return: metadata (see ``get_movie_meta_for``) of the ``n`` best results
        """
        # the hits are re-sorted below, so request more than needed to be able
        # to recover popular results that have rather low search scores
        hits = cls._search(query_text, n + 25)

        # wrap in a named Series so it can be joined to the metadata
        score_series = pd.Series(hits, name='score')

        # right outer join: one row per search hit
        joined = Data.movie_meta().join(score_series, how='right')

        # raise the score to some power so that the popularity (number of ratings)
        # does not overpower it completely, then multiply both
        ranked = joined.eval(
            f'weighted = score**16 * {Column.num_ratings.value}')

        # ids of the n best results
        best_ids = list(ranked.nlargest(n, 'weighted').index)

        return get_movie_meta_for(best_ids)
예제 #15
0
    def get_poster_omdb_ml(cls, movielens_id: int) -> str:
        """
        Get the poster for a movie identified by its metadata index label.

        Reads the movie's IMDb id from ``Data.movie_meta()`` and delegates to
        ``get_poster_omdb_imdb``.

        :param movielens_id: index label of the movie in ``Data.movie_meta()``
        :return: whatever ``get_poster_omdb_imdb`` returns for that IMDb id
        """
        # imported locally (presumably to avoid a circular import - confirm)
        from util.data import Data, Column

        meta = Data.movie_meta()
        imdb_id = meta.at[movielens_id, Column.imdb_id.value]

        return cls.get_poster_omdb_imdb(imdb_movie_id=imdb_id)
예제 #16
0
def sample(movie_id: int, n: int = 5) -> pd.Series:
    """
    Sample method: score all movies by their negated ``movielens_id``.

    :param movie_id: unused
    :param n: unused
    :return: Series with the negated values of the ``movielens_id`` column, so
        smaller ids get the higher score
    """
    ids = Data.movie_meta()['movielens_id']
    return -ids
예제 #17
0
def recommend_movie_meta(movie_id: int,
                         n: int = 5,
                         popularity_bias: bool = False,
                         user_bias: bool = False):
    """
    Score movies by their metadata similarity to a base movie.

    Candidates are limited to movies with the same ``tmdb_adult`` and
    ``imdb_color`` values that share at least one genre with the base movie.

    :param movie_id: index label of the base movie in ``Data.movie_meta()``
    :param n: forwarded to ``genre_filter.get_movies_with_similar_genres``
    :param popularity_bias: additionally weight by the number of ratings;
        only has an effect together with ``user_bias``
    :param user_bias: weight the score with the mean user rating
    :return: Series of scores, one per candidate movie
    """
    movies_meta = Data.movie_meta()
    # Get the meta data from the base movie
    base_movie_meta = movies_meta.loc[movie_id, :]

    # filtered movies based on color and adult
    # NOTE(review): the values are pasted into the query text; a missing value or a
    # color containing a double quote would break the query.
    filtered_movies = movies_meta.query('tmdb_adult == {}'.format(
        base_movie_meta['tmdb_adult']))
    filtered_movies = filtered_movies.query('imdb_color == "{}"'.format(
        base_movie_meta['imdb_color']))

    # filtered movies based on genre
    movies = genre_filter.get_movies_with_similar_genres(
        movie_id, n, movies=filtered_movies)

    # merge the number of similar genres back to the main df
    merged_movies = pd.merge(pd.DataFrame(movies),
                             filtered_movies,
                             left_index=True,
                             right_index=True)
    # both frames have a genres column; keep the left one (the genre count)
    merged_movies = merged_movies.rename(
        columns={"{}_x".format(Column.genres.value): Column.genres.value})

    # preparing data for the score calculation
    # count similar items in the columns or calculate the difference
    for column in ('actors', 'directors', 'tmdb_keywords',
                   'tmdb_production_countries'):
        merged_movies = calculate_column(merged_movies, base_movie_meta, column)
    merged_movies = calculate_column(merged_movies,
                                     base_movie_meta,
                                     'release_year',
                                     year=True)

    # score calculation
    score = compute_score(merged_movies)

    if not user_bias:
        return score

    # calculate the ranking with the avg user rating
    # the merge restricts the ratings to the candidate movies
    ratings = Data.ratings().query('movie_id != %s' % movie_id)
    merged_ratings = pd.merge(ratings,
                              merged_movies,
                              left_on='movie_id',
                              right_index=True)

    # Mean rating and number of ratings per movie. Selecting the rating column
    # BEFORE aggregating yields the same (mean, count) frame, but does not
    # aggregate the merged metadata columns (which is wasted work and fails on
    # non-numeric columns with newer pandas versions).
    measures: pd.DataFrame = merged_ratings.groupby('movie_id')['rating'].agg(
        ['mean', 'count'])

    # merging mean, count and score into one DataFrame
    measures_movies = pd.merge(measures,
                               pd.DataFrame(score),
                               left_index=True,
                               right_index=True)
    # an unnamed score Series ends up as column 0
    measures_movies = measures_movies.rename(columns={0: 'score'})

    if popularity_bias:
        # give more weight to the number of ratings (~popularity)
        # by raising the weighted avg rating to some power (to preserve some notion of good vs. bad ratings)
        # and multiplying the count back in
        return measures_movies.eval('((mean * score) ** 3) * count')

    # multiply score to prevent good rated movies with little similarity
    return measures_movies.eval('mean * score')
예제 #18
0
def get_genre_as_lists():
    """
    Return the genres column with every string representation parsed into an object.

    :return: Series labelled like ``Data.movie_meta()``; missing values stay missing
    """
    # na_action='ignore' keeps missing values as they are instead of handing them
    # to eval (which raises a TypeError for non-string input)
    # SECURITY NOTE(review): eval executes arbitrary expressions; this is only safe
    # while the metadata comes from a trusted source. Consider ast.literal_eval.
    return Data.movie_meta()[Column.genres.value].map(eval, na_action='ignore')
예제 #19
0
def directors_as_lists():
    """
    Return the directors column with every string representation parsed into an object.

    :return: Series labelled like ``Data.movie_meta()``; missing values stay missing
    """
    # na_action='ignore' keeps missing values as they are instead of handing them
    # to eval (which raises a TypeError for non-string input)
    # SECURITY NOTE(review): eval executes arbitrary expressions; this is only safe
    # while the metadata comes from a trusted source. Consider ast.literal_eval.
    return Data.movie_meta()[Column.directors.value].map(eval, na_action='ignore')
예제 #20
0
def actors_as_lists():
    """
    Return the actors column with every string representation parsed into an object.

    :return: Series labelled like ``Data.movie_meta()``; missing values stay missing
    """
    # eval (convert string representation to object) is costly time-wise
    # NOTE(review): no caching is visible in this function itself - confirm that
    # the result is cached by a decorator or by the caller.
    # na_action='ignore' keeps missing values as they are instead of handing them
    # to eval (which raises a TypeError for non-string input)
    # SECURITY NOTE(review): eval executes arbitrary expressions; this is only safe
    # while the metadata comes from a trusted source. Consider ast.literal_eval.
    return Data.movie_meta()[Column.actors.value].map(eval, na_action='ignore')
예제 #21
0
def drop_collection(movie_id: int, df: pd.DataFrame = None) -> pd.DataFrame:
    """
    Remove the rows selected by ``get_collection_mask`` for a movie.

    :param movie_id: the movie whose collection should be removed
    :param df: the frame to filter (defaults to ``Data.movie_meta()``)
    :return: the rows of the frame for which the collection mask is False
    """
    frame = Data.movie_meta() if df is None else df
    in_collection = get_collection_mask(movie_id, frame)
    return frame[~in_collection]