Пример #1
0
def plot_top_by_tweet_count(max_films=10, crtiical_period=True):
    """
    Draw a bar chart of the films with the highest tweet counts.

    The counts are stored as a new column on the ``movies_df`` frame defined
    outside this function before the chart is drawn.

    :param max_films: integer for the maximum number of films to plot
    :param crtiical_period: bool selecting which count is used. When True,
        each film's ``critical_start`` and ``critical_end`` values are passed
        to ``movie_helper.count_tweets`` alongside its id; when False only
        the id is passed. (The parameter name is misspelled; it is kept so
        existing keyword callers continue to work.)
    """

    figure = plt.figure()
    axes = figure.add_axes([0, 0, 1, 1])

    # compute the per-film count and remember which column holds it
    if crtiical_period:
        column = "critical_period_tweet_count"

        def _count_for_row(row):
            result = movie_helper.count_tweets(row["movieId"],
                                               row["critical_start"],
                                               row["critical_end"])
            return result['count']

        movies_df[column] = movies_df.apply(_count_for_row, axis=1)
    else:
        column = "tweet_count"

        def _count_for_id(movie_id):
            return movie_helper.count_tweets(int(movie_id))['count']

        movies_df[column] = movies_df["movieId"].apply(_count_for_id)

    # highest counts first, then keep only the requested number of films
    ranked_df = movies_df.sort_values(by=column, ascending=False)
    top_df = ranked_df.head(max_films)

    # bar chart with vertical tick labels
    axes.bar(top_df["title"], top_df[column])
    axes.set_ylabel("Tweet Count")
    axes.set_title("Top {0} by tweet counts".format(max_films))
    plt.xticks(rotation=90)
    plt.show()
Пример #2
0
def plot_tweets_vs_finance(column,
                           title,
                           xlabel,
                           ylabel,
                           movie_run=False,
                           logx=False,
                           logy=False):
    """
    Function to plot the tweet counts vs a finance column of the movies df

    Writes two columns onto the ``movies_df`` frame defined outside this
    function: ``temp_col`` (the finance column as a float, divided by one
    million) and ``tweet_count``.

    :param column: string column name of variable to plot
    :param title: string title for plot
    :param xlabel: string x label for plot
    :param ylabel: string y label for plot
    :param movie_run: bool; when True the count for each movie comes from
        ``get_geotweet_count_by_dates()`` (presumably restricted to the
        cinema run - confirm against the movie class), otherwise all tweets
        for the movie are counted via ``movie_helper.count_tweets``
    :param logx: bool indicating whether to use log scale on x axis
    :param logy: bool indicating whether to use log scale on y axis
    """

    # strip currency symbol / thousands separators and scale to millions.
    # Raw string: the previous non-raw '[\£,]' relied on an invalid escape
    # sequence, which newer Python versions warn about.
    movies_df["temp_col"] = movies_df[column].replace(
        r"[£,]", "", regex=True).astype(float) / 1000000

    if movie_run:
        # one count per generated movie object, in frame order
        movies = movie_helper.gen_movies(movies_df)
        movies_df["tweet_count"] = [
            movie.get_geotweet_count_by_dates() for movie in movies
        ]
    else:
        # count all tweets
        movies_df["tweet_count"] = movies_df["movieId"].apply(
            lambda x: movie_helper.count_tweets(int(x))['count'])

    # create scatter plot with regression line
    ax = sns.regplot(x="temp_col", y="tweet_count", data=movies_df)

    # apply log scales where requested
    if logx:
        ax.set_xscale('log')

    if logy:
        ax.set_yscale('log')

    ax.set(xlabel=xlabel, ylabel=ylabel)
    plt.title(title)
    plt.show()
Пример #3
0
def correlatte_movie_stats():
    """
    Function to create a correlation heatmap of base stats for movies

    Loads the movies frame (with opening weekend data), adds a total
    ``tweet_count`` column, converts the currency columns to floats in
    millions, and hands the prepared frame to ``generate_heatmap_from_df``.
    """

    #https://towardsdatascience.com/better-heatmaps-and-correlation-matrix-plots-in-python-41445d0f2bec

    movies_df = movie_helper.get_movies_df_with_opening_weekend()

    movies_df["tweet_count"] = movies_df.apply(
        lambda row: movie_helper.count_tweets(row.movieId)['count'], axis=1)

    # currency columns arrive as formatted strings; strip '£' and ',' and
    # scale to millions (raw string avoids the invalid '\£' escape)
    currency_columns = [
        'budget_usd', 'uk_gross_usd', 'domestic_gross_usd',
        'worldwide_gross_usd', 'international_gross_usd', 'gross_profit_usd',
        'opening_weekend_takings'
    ]
    for name in currency_columns:
        movies_df[name] = movies_df[name].replace(
            r"[£,]", "", regex=True).astype(float) / 1000000

    # list of columns to use for correlation
    # NOTE(review): 'run_up_tweets' and 'opening_tweets' are not computed
    # here - presumably supplied by get_movies_df_with_opening_weekend();
    # confirm.
    columns = [
        'budget_usd', 'uk_gross_usd', 'domestic_gross_usd',
        'worldwide_gross_usd', 'international_gross_usd', 'gross_profit_usd',
        'return_percentage', 'uk_percentage', 'tweet_count',
        'total_release_weeks', 'first_run_weeks', 'best_rank',
        'weekends_at_best_rank', 'weekends_in_top_3', 'weekends_in_top_5',
        'weekends_in_top_10', 'weekends_in_top_15', 'opening_weekend_takings',
        'run_up_tweets', 'opening_tweets'
    ]

    # generate heatmap from the frame prepared above (previously passed the
    # name ``df``, which this function never defines)
    generate_heatmap_from_df(movies_df, columns)
Пример #4
0
def gen_bottom_20_tweet_count():
    """
    Plot a horizontal bar chart of the 20 movies with the fewest tweets.
    """

    def _tweet_total(row):
        # total tweet count reported by the helper for this row's movie
        return movie_helper.count_tweets(row['movieId'])['count']

    # load the movies and attach a tweet count to each one
    all_movies = movie_helper.get_movies_df()
    all_movies["tweet_count"] = all_movies.apply(_tweet_total, axis=1)

    # ascending sort puts the least-tweeted movies first; keep the first 20
    ranked = all_movies.sort_values(by='tweet_count')
    bottom_df = ranked.head(20)

    # horizontal bars: titles on the y axis, counts on the x axis
    plt.barh(bottom_df["title"], bottom_df["tweet_count"], color='green')
    plt.ylabel('Movie Title')
    plt.xlabel('Tweet Count')
    plt.title('Bottom 20 Movies')
    plt.show()