def plot_top_by_tweet_count(max_films=10, crtiical_period=True):
    """
    Draw a bar chart of the films that have the highest tweet counts.

    The counts are written back onto the module-level ``movies_df`` (column
    ``critical_period_tweet_count`` or ``tweet_count``) before plotting, and
    the figure is displayed with ``plt.show()``.

    :param max_films: integer for the maximum number of films to plot
    :param crtiical_period: bool; when True the count for each film is taken
        between its ``critical_start`` and ``critical_end`` values, otherwise
        every tweet for the film is counted. NOTE: the keyword really is
        spelled ``crtiical_period`` in the signature.
    """
    figure = plt.figure()
    axes = figure.add_axes([0, 0, 1, 1])

    def _count_in_critical_period(row):
        # count restricted to the film's critical window
        result = movie_helper.count_tweets(
            row["movieId"], row["critical_start"], row["critical_end"])
        return result['count']

    def _count_all(movie_id):
        # count over every tweet stored for the film
        return movie_helper.count_tweets(int(movie_id))['count']

    # pick the column to rank on and fill it in
    if crtiical_period:
        column = "critical_period_tweet_count"
        movies_df[column] = movies_df.apply(_count_in_critical_period, axis=1)
    else:
        column = "tweet_count"
        movies_df[column] = movies_df["movieId"].apply(_count_all)

    # highest counts first, then keep only the requested number of films
    ranked = movies_df.sort_values(by=column, ascending=False)
    top_films = ranked.head(max_films)

    # bar plot with vertical tick labels
    axes.bar(top_films["title"], top_films[column])
    axes.set_ylabel("Tweet Count")
    axes.set_title("Top {0} by tweet counts".format(max_films))
    plt.xticks(rotation=90)
    plt.show()
def plot_tweets_vs_finance(column, title, xlabel, ylabel, movie_run=False,
                           logx=False, logy=False):
    """
    Plot tweet counts against a financial column of the module-level movies df.

    The chosen column is stripped of ``£`` and ``,`` characters, converted to
    float and divided by one million into a scratch column ``temp_col``. A
    ``tweet_count`` column is (re)computed, and both are drawn as a seaborn
    scatter plot with a regression line. ``movies_df`` is modified in place.

    :param column: string column name of the variable to plot on the x axis
    :param title: string title for plot
    :param xlabel: string x label for plot
    :param ylabel: string y label for plot
    :param movie_run: bool; when True the count for each film comes from
        ``get_geotweet_count_by_dates()`` on the objects returned by
        ``movie_helper.gen_movies`` (presumably the cinema run - confirm
        against that helper), otherwise all tweets are counted
    :param logx: bool indicating whether to use log scale on x axis
    :param logy: bool indicating whether to use log scale on y axis
    """
    # normalize column into millions; the pattern is a raw string because
    # '\£' in a plain literal is an invalid escape sequence (SyntaxWarning
    # on recent Python versions) - the characters matched are unchanged
    movies_df["temp_col"] = movies_df[column].replace(
        r'[£,]', '', regex=True).astype(float) / 1000000

    if movie_run:
        # one count per generated movie object, in the order they are produced
        movies = movie_helper.gen_movies(movies_df)
        movies_df["tweet_count"] = [
            movie.get_geotweet_count_by_dates() for movie in movies
        ]
    else:
        # count all tweets
        movies_df["tweet_count"] = movies_df["movieId"].apply(
            lambda x: movie_helper.count_tweets(int(x))['count'])

    # create scatter plot with regression line
    ax = sns.regplot(x="temp_col", y="tweet_count", data=movies_df)

    # apply the requested log scales
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')

    ax.set(xlabel=xlabel, ylabel=ylabel)
    plt.title(title)
    plt.show()
def correlatte_movie_stats():
    """
    Build a correlation heatmap of the base stats for movies.

    Fetches the movies frame (with opening weekend data) from
    ``movie_helper``, adds a ``tweet_count`` column, converts the monetary
    columns to floats in millions and hands the frame plus the list of
    columns to ``generate_heatmap_from_df``.

    NOTE(review): ``run_up_tweets`` and ``opening_tweets`` are requested in
    the column list but are not computed here - confirm they are present in
    the fetched frame.
    """
    # https://towardsdatascience.com/better-heatmaps-and-correlation-matrix-plots-in-python-41445d0f2bec

    # get all needed tweet columns
    movies_df = movie_helper.get_movies_df_with_opening_weekend()
    movies_df["tweet_count"] = movies_df.apply(
        lambda row: movie_helper.count_tweets(row.movieId)['count'], axis=1)

    # monetary columns: strip '£' and ',' then scale to millions; the pattern
    # is a raw string because '\£' is an invalid escape in a plain literal
    money_columns = [
        "budget_usd",
        "uk_gross_usd",
        "domestic_gross_usd",
        "worldwide_gross_usd",
        "international_gross_usd",
        "gross_profit_usd",
        "opening_weekend_takings",
    ]
    for money_column in money_columns:
        movies_df[money_column] = movies_df[money_column].replace(
            r'[£,]', '', regex=True).astype(float) / 1000000

    # list of columns to use for correlation
    columns = [
        'budget_usd', 'uk_gross_usd', 'domestic_gross_usd',
        'worldwide_gross_usd', 'international_gross_usd', 'gross_profit_usd',
        'return_percentage', 'uk_percentage', 'tweet_count',
        'total_release_weeks', 'first_run_weeks', 'best_rank',
        'weekends_at_best_rank', 'weekends_in_top_3', 'weekends_in_top_5',
        'weekends_in_top_10', 'weekends_in_top_15', 'opening_weekend_takings',
        'run_up_tweets', 'opening_tweets'
    ]

    # generate heatmap from the frame prepared above (the original passed an
    # undefined name ``df`` here)
    generate_heatmap_from_df(movies_df, columns)
def gen_bottom_20_tweet_count():
    """
    Show a horizontal bar chart of the 20 movies with the fewest tweets.

    Movies are loaded through ``movie_helper.get_movies_df()``, each one is
    given a ``tweet_count`` and the 20 lowest counts are plotted.
    """
    all_movies = movie_helper.get_movies_df()

    def _tweet_total(row):
        # total number of tweets recorded for this row's movie
        return movie_helper.count_tweets(row['movieId'])['count']

    all_movies["tweet_count"] = all_movies.apply(_tweet_total, axis=1)

    # ascending sort puts the least-tweeted movies first
    ascending = all_movies.sort_values(by='tweet_count')
    bottom = ascending.head(20)

    # horizontal bars: titles on the y axis, counts on the x axis
    plt.barh(bottom["title"], bottom["tweet_count"], color='green')
    plt.ylabel('Movie Title')
    plt.xlabel('Tweet Count')
    plt.title('Bottom 20 Movies')
    plt.show()