-
Notifications
You must be signed in to change notification settings - Fork 0
/
filmPlot.py
205 lines (166 loc) · 7.37 KB
/
filmPlot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
'''
A script for plotting the results of the MCMC performed in the script
filmMCMC.py.
Error bar plots are created and a Gaussian process regression is employed to
find the trend in the data.
Plots are created for the global average of film runtimes and the deviations
from that average for different categories. The categories are a set of
countries, languages and genres.
'''
import pandas as pd
import codecs
import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
import sys
# Folder holding this script; all input and output paths are built from it.
DIRECTORY = sys.path[0]

# Import the results of the MCMC and order them chronologically.
# DataFrame.sort(columns=...) is no longer part of pandas; sort_values(by=...)
# is the replacement.
df = pd.read_csv(DIRECTORY + '/results.csv', encoding='utf-8')
df.sort_values(by='date', inplace=True)

# Import the names of the countries, languages and genres: three lines, each a
# ', '-separated list, read in that order.
with codecs.open(DIRECTORY + '/categories.txt', 'r', 'utf-8') as f:
    # codecs.open does not translate line endings, so strip '\r' as well as
    # '\n'; otherwise the last name on each line keeps a trailing '\r' when the
    # file uses Windows line endings.
    countries = np.array(f.readline().strip('\r\n').split(', '))
    languages = np.array(f.readline().strip('\r\n').split(', '))
    genres = np.array(f.readline().strip('\r\n').split(', '))
def plot_tool(data, title, folder):
    '''
    Creates an error bar plot overlaid with the result of a Gaussian process
    regression and saves it as a PDF.

    For the title 'Global' all of the data are plotted in green. For any other
    title the data are split into writer/director overlap ('same', red) and
    non-overlap ('different', blue), keeping only years with 10 or more
    films. The plot is only produced if both subsets then contain at least 50
    rows (years); otherwise the figure is closed and nothing is saved. This
    ensures that the Gaussian process regression is good and meaningful (this
    is quite conservative, the effects of relaxing this should be explored).

    Parameters
    ----------
    data: dataframe
        pandas dataframe containing 'date' which is the year the films were
        first released, 'QSame' which is the string 'True' when one of the
        writers directed the movie and 'False' otherwise (any other value,
        e.g. the sentinel used for the Global average, is ignored by the
        split), 'number' which is the number of films released that year,
        'linMean' which is the average runtime of films that year and 95%
        errors in columns called 'lin95Low' and 'lin95Hi'.
    title: string
        category of the plotted data e.g. "Romance" or "French". Used as the
        figure title and as the name of the saved file.
    folder: string
        the filepath into which the plots should be saved. Non-global plots
        go into the subfolder 'countries', 'languages' or 'genres' of this
        path; those subfolders are expected to exist already.
    '''
    plt.figure(title, figsize=(13, 6))
    plt.title(title, fontsize=30)

    if title == 'Global':
        plot_data(data, 'green', 'Global Average')
        # The legend entry comes from plot_data, so only the data and the
        # colour are passed here (a third argument raised a TypeError).
        plot_gaussian(data, 'green')
    else:
        # separate data into writer/director overlap and non-overlap
        # demand that the results for each year be based on at least 10 films
        same = data[(data.QSame == 'True') & (data.number >= 10)]
        diff = data[(data.QSame == 'False') & (data.number >= 10)]

        # demand that there must be at least fifty years worth of data
        min_years = 50
        if len(same) < min_years or len(diff) < min_years:
            plt.close()
            return

        # save plots to different subfolders depending on their category;
        # anything that is neither a country nor a language is a genre
        if title in countries:
            folder += '/countries'
        elif title in languages:
            folder += '/languages'
        else:
            folder += '/genres'

        plot_data(same, 'red', 'same')
        plot_data(diff, 'blue', 'different')
        plot_gaussian(same, 'red')
        plot_gaussian(diff, 'blue')

    # common axes, labels and layout for every plot
    plt.ylim(-60, 150)
    plt.xlim(1903, 2016)
    plt.xlabel('Year', fontsize=25)
    plt.ylabel('Minutes', fontsize=25)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.subplots_adjust(left=0.10, bottom=0.18, top=0.90, right=0.95,
                        wspace=0, hspace=0)
    plt.legend(loc='best', fontsize=24, ncol=1, frameon=False)
    plt.savefig(folder + '/' + title + '.pdf')
    plt.close()
def plot_data(data, col, label):
    '''
    Draws the yearly MCMC means on the current figure as points with
    asymmetric 95% error bars and adds a legend entry for them.

    Parameters
    ----------
    data: dataframe
        pandas dataframe containing 'date', 'linMean' which is the average
        runtime and the bounds of the 95% interval in columns called
        'lin95Low' and 'lin95Hi'.
    col: string
        the color in which to plot the data
    label: string
        entry for the plot's legend. The labels 'same' (the writer also
        directed) and 'different' additionally shift the points by +0.1 and
        -0.1 years respectively so the two series do not overlap.
    '''
    # horizontal jitter that keeps the 'same' and 'different' points apart
    x_shift = {'same': .1, 'different': -.1}.get(label)

    year = np.float64(data['date'].values)
    if x_shift is not None:
        year += x_shift

    # errorbar wants distances from the mean, not the absolute bounds
    centre = data['linMean'].values
    below = centre - data['lin95Low'].values
    above = data['lin95Hi'].values - centre

    plt.errorbar(year, centre, yerr=[below, above], fmt='.', color=col,
                 capsize=0, lw=2, markersize=8, alpha=0.7)

    # an empty line is drawn purely to provide the legend entry
    plt.plot([], [], color=col, linewidth=3, label=label)
    plt.legend(loc=0, fontsize=24, ncol=2, frameon=False)
    plt.draw()
def plot_gaussian(data, col, label=None):
    '''
    Plots a Gaussian process regression of the yearly means, together with its
    95% confidence band, on the current figure. The RBF kernel starts from a
    characteristic length scale of 10 years, so essentially this highlights
    the 'slow trend' in the data.

    Parameters
    ----------
    data: dataframe
        pandas dataframe containing 'date', 'linMean' which is the average
        runtime and 'linSD' which is the standard deviation.
    col: string
        the color in which to plot the data
    label: string, optional
        accepted so that this function can be called with the same
        (data, col, label) arguments as plot_data. It is not drawn, because
        plot_data already creates the legend entry.
    '''
    # extract the results from the dataframe
    year = np.array(data[u'date'].tolist())
    mean = np.array(data[u'linMean'].tolist())
    sd = np.array(data[u'linSD'].tolist())

    # initialize the gaussian process with a length scale of 10 years.
    # NOTE(review): this is the initial value; the regressor's default
    # optimizer may tune kernel hyperparameters during fit - confirm whether
    # the length scale is meant to stay fixed.
    length_scale = 10.
    kernel = 1. * RBF(length_scale)
    # The per-point noise variance is passed through `alpha` (the keyword
    # `sigma_squared_n` is not accepted by GaussianProcessRegressor).
    # NOTE(review): alpha is added to the kernel diagonal; with
    # normalize_y=True newer scikit-learn releases also rescale the targets,
    # so confirm that sd ** 2 is in the intended units.
    gp = GaussianProcessRegressor(kernel=kernel, alpha=sd ** 2,
                                  normalize_y=True)

    # fit the data and get the predicted mean and standard deviation.
    # The regressor needs 2D (n_samples, n_features) inputs, so the 1D arrays
    # are converted to column vectors and the predictions flattened again
    # for plotting.
    gp.fit(np.atleast_2d(year).T, np.atleast_2d(mean).T)
    year_grid = np.linspace(min(year) - 2, max(year) + 2, 100)
    mean_prediction, sd_prediction = gp.predict(np.atleast_2d(year_grid).T,
                                                return_std=True)
    mean_prediction = np.ravel(mean_prediction)
    sd_prediction = np.ravel(sd_prediction)

    # plot the predicted best fit
    plt.plot(year_grid, mean_prediction, col, alpha=1)
    # plot the 95% confidence interval
    plt.fill_between(year_grid,
                     mean_prediction - 1.9600 * sd_prediction,
                     y2=mean_prediction + 1.9600 * sd_prediction,
                     alpha=0.5, color=col)
    plt.draw()
if __name__ == '__main__':
    # Produce one plot per category found in the MCMC results.
    for name, categoryDF in df.groupby('category'):
        # print(...) with a single argument behaves the same on Python 2 and
        # Python 3, whereas the statement form is a syntax error on Python 3
        print(name)
        plot_tool(categoryDF, name, DIRECTORY + '/plots')