-
Notifications
You must be signed in to change notification settings - Fork 1
/
Test.py
389 lines (323 loc) · 16.6 KB
/
Test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
import string
from Utilities import Utils
import itertools
from Formatter1 import Formatter as FirstFormatter
from Formatter2 import Formatter as SecondFormatter
from Formatter3 import Formatter as ThirdFormatter
from Formatter4 import Formatter as FourthFormatter
from Formatter5 import Formatter as FifthFormatter
from Formatter6 import Formatter as SixthFormatter
from gensim.models import Word2Vec as wv, Word2Vec
from gensim.models import KeyedVectors as kv
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from prettytable import PrettyTable
# Pretrained word vectors, read from a binary word2vec-format file.
# NOTE: this runs at import time and the path is relative, so the .bin file
# must be reachable from the current working directory.
model = kv.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# Project helper object; this file reads `utils.names` and `utils.data['movies']` from it.
utils = Utils()
# Lowercase English stop words. Tokens found in this list are dropped before
# scoring a character's lines (get_meaning_score) and before training a model
# on the scripts (create_word2vec_model_from_scripts).
stop_words = ["i", "i'm", "it's", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
"yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
"itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
"these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
"do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
"while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during",
"before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
"under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
"both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same",
"so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
# Seed vocabulary for the "non-feminist" side of the score. Script tokens are
# compared against these words by embedding similarity (get_feminism_score).
# NOTE(review): 'boys ' and 'hope ' carry a trailing space and 'hungry?' a
# question mark; they can only match if the embedding vocabulary holds those
# exact strings -- confirm this is intended. 'gentle' is listed twice.
non_feminist_words = ['boys ', 'hope ', 'hungry?', 'dress', 'shoes', 'cleaning', 'cooking', 'gentle', 'passive', 'indecisive', 'babyish',
'brainless', 'miserable', 'naive', 'needy', 'obsessive', 'insecure', 'weak', 'wicked',
'accepting', 'adorable', 'caring', 'curvy', 'cute', 'darling', 'dramatic', 'dreaming', 'dreamy',
'kind', 'likable', 'lovable', 'loved', 'loving', 'natural', 'nice', 'petite', 'polite',
'precious', 'pretty', 'sacrificing', 'emotional', 'feminine', 'fit', 'foxy', 'gentle', 'giggly',
'girly', 'hot', 'innocent', 'sensitive', 'sensual', 'soft', 'sweet', 'thin', 'vulnerable',
'womanly', 'shy']
# Seed vocabulary for the "feminist" side of the score, used the same way.
# NOTE(review): 'Full ' carries a trailing space, casing is mixed ('Go',
# 'Strong') and 'leader' is listed twice -- confirm this is intended.
feminist_words = ['here', 'go', 'lets', 'Full ', 'Go', 'secure', 'funny', 'fun', 'extraordinary', 'adventurous', 'leader', 'focused', 'stable', 'curious',
'loud', 'logical', 'ambitious', 'cynical', 'assertive', 'bright', 'fight', 'experienced',
'charismatic', 'resentful', 'liberal', 'free', 'eccentric', 'educated', 'think', 'want', 'brave',
'busy', 'brilliant', 'masculine', 'Strong', 'tough', 'protective', 'hero', 'powerful', 'aggressive',
'smart', 'intelligent', 'books', 'independent', 'leader', 'manager', 'active', 'arrogant', 'dominant']
# Reference terms for each gender; is_disney_corpus_biased ranks the seed
# vocabularies by their mean embedding similarity to these lists.
# Fix: 'man' and 'himself' used to be written as 'man,''himself', which Python
# concatenates into the single bogus entry 'man,himself'.
men_words = ['he', 'men', 'man', 'himself', 'boy', 'son', 'his', 'guy', 'father']
women_words = ['she', 'women', 'woman', 'herself', 'girl', 'daughter', 'gal', 'mother']
def safe_model_similarity(x, y):
    """Return ``model.similarity(x, y)``, or -1 when the lookup raises KeyError.

    The -1 return value is the sentinel the rest of this file uses for
    "similarity not available".
    """
    try:
        score = model.similarity(x, y)
    except KeyError:
        score = -1
    return score
def get_feminist_score(word, lst):
    """Return the strongest similarity between *word* and any entry of *lst*.

    Only similarities above 0.1 are considered; when none qualifies the
    result is 0.
    """
    candidates = (safe_model_similarity(word, reference) for reference in lst)
    # `> 0.1` on its own already rejects NaN and the -1 "missing" sentinel.
    strong = [value for value in candidates if value > 0.1]
    if not strong:
        return 0
    return np.max(strong)
def expand_list(word_list):
    """Return *word_list* plus the 5 nearest embedding neighbours of each word.

    Words for which no similarity can be computed (safe_model_similarity
    returns its -1 sentinel) are dropped first. The result holds each word
    once, in arbitrary order.
    """
    # Fix: the original called model.similarity directly, which raises KeyError
    # for an unknown word instead of returning -1, so the filter never worked.
    known = [w for w in word_list if safe_model_similarity('hello', w) != -1]
    neighbours = itertools.chain.from_iterable(
        (hit[0] for hit in model.similar_by_word(w, 5)) for w in known
    )
    return list(set(known).union(neighbours))
def all_scripts_except_one():
    """Write one leave-one-out corpus file per movie in ``utils.names``.

    For each movie M, the file ``scripts_without_<M>.txt`` (UTF-8) receives one
    line per *other* movie, holding that movie's tokens joined by spaces.
    """
    # Tokenise each script once rather than once per output file.
    script_lines = [
        (name, " ".join(get_formatter(name).get_tokens()) + "\n")
        for name in utils.names
    ]
    for to_remove_movie in utils.names:
        corpus = "".join(line for name, line in script_lines if name != to_remove_movie)
        # `with` closes the file even if the write fails.
        with open("scripts_without_" + to_remove_movie + ".txt", "w", encoding='utf-8') as out_file:
            out_file.write(corpus)
def females():
    """Return the character names this analysis treats as female.

    Names are in mixed case exactly as listed (callers lowercase them before
    matching). A fresh list is built on every call.
    """
    names = (
        'Jessie', 'Bonnie', 'Bonnie', 'Emily', 'HOLLEY', 'SALLY',
        'DORY', 'PEACH', 'Alice', 'Queen', 'Duchess', 'Madame', 'Marie', 'Amelia', 'Abigail',
        'Catty', 'Matriarch', 'Joy', 'Sadness', 'Disgust', 'Mom', 'Riley', 'Sally Carrera', 'Flo',
        'Cruella de Vil', 'Nanny', 'Perdita', 'Anita',
        'Elinor', 'Merinda', 'The Witch', 'Moana', 'Sina', 'Gramma', 'Mulan', 'Granny Fa',
        'Ursula', 'Ariel', 'POCAHONTAS', 'GRANDMOTHER WILLOW', 'POCAHONTAS I', 'NAKOMA',
        'ANNA', 'ELSA', 'YOUNG ANNA', 'MRS. DAVIS', 'BO PEEP', 'HANNAH', 'HOPPS',
        'BELLWETHER', 'BONNIE HOPPS', 'YOUNG JUDY', 'FRU FRU SHREW', 'GAZELLE', 'JASMINE',
        'BELLE', 'MRS.POTTS', 'Megara', 'Akela', 'Flora', 'Maleficent',
        'Merryweather', 'Fauna', 'Briar Rose', 'Fairies',
    )
    return list(names)
def get_number_of_woman_words_in_script(movie):
    """Sum the second field of every character tuple whose name is in females().

    Each item of ``movie.get_chars_tuples()`` is read as (name, value); the
    value is presumably that character's word count -- confirm against the
    formatter. Names are compared case-insensitively.
    """
    female_names = [name.lower() for name in females()]
    counts = []
    for character in movie.get_chars_tuples():
        if character[0].lower() in female_names:
            counts.append(character[1])
    return np.sum(counts)
def get_number_of_words_in_script(movie):
    """Sum the second field of every character tuple listed in females() or males().

    Each item of ``movie.get_chars_tuples()`` is read as (name, value); the
    value is presumably that character's word count -- confirm against the
    formatter. Names are compared case-insensitively; characters in neither
    list are ignored.
    """
    female_names = [name.lower() for name in females()]
    male_names = [name.lower() for name in males()]
    counts = []
    for character in movie.get_chars_tuples():
        speaker = character[0].lower()
        if speaker in female_names or speaker in male_names:
            counts.append(character[1])
    return np.sum(counts)
def change_range(oldValue, old_min=-1, old_max=1, new_min=0, new_max=5):
    """Linearly rescale *oldValue* from [old_min, old_max] to [new_min, new_max].

    With the defaults this maps the interval [-1, 1] onto [0, 5], exactly as
    the previously hard-coded formula did.

    Raises:
        ValueError: if ``old_min`` equals ``old_max`` (the source range is empty).
    """
    if old_max == old_min:
        raise ValueError("old_min and old_max must differ")
    return (((oldValue - old_min) * (new_max - new_min)) / (old_max - old_min)) + new_min
def get_feminism_score(words_lst, character_words):
    """Return the mean per-word score of *character_words* against *words_lst*.

    Each word is scored with get_feminist_score; scores of 0.1 or less are
    discarded. The result is 0 when nothing is left.
    """
    per_word = (get_feminist_score(token, words_lst) for token in character_words)
    # `> 0.1` on its own already rejects NaN.
    kept = [value for value in per_word if value > 0.1]
    if kept:
        return np.mean(kept)
    return 0
# Module-level log: get_meaning_score appends one
# [character, feminist_score - non_feminist_score] pair per scored character.
arr = []


def get_meaning_score(name):
    """Score how "feminist" the vocabulary of a movie's female characters is.

    For every character of the movie whose lowercased name is in females(),
    the stop-word-filtered tokens are scored against ``feminist_words`` and
    ``non_feminist_words``. The difference is weighted by the character's
    token count divided by get_number_of_words_in_script(movie). The weighted
    differences are summed and passed through change_range.

    Side effect: appends [character, difference] to the module-level ``arr``.
    """
    my_movie = get_formatter(name)
    female_names = {female.lower() for female in females()}
    # Loop-invariant, so computed once instead of once per female character.
    total_words = get_number_of_words_in_script(my_movie)
    character_scores = []
    for character in (c.lower() for c in my_movie.get_characters()):
        if character not in female_names:
            continue
        words = my_movie.get_char_tokens(character)
        character_words = [w for w in words if w not in stop_words]
        not_feminist_score = get_feminism_score(non_feminist_words, character_words)
        feminist_score = get_feminism_score(feminist_words, character_words)
        difference = feminist_score - not_feminist_score
        arr.append([character, difference])
        if total_words:
            character_scores.append(difference * len(words) / total_words)
        else:
            # No counted words in the script: contribute nothing rather than inf/NaN.
            character_scores.append(0)
    return change_range(np.sum(character_scores))
def get_k_most_similar_to_list(k, lst, words):
    """Return up to *k* entries of *words* ranked by mean similarity to *lst*.

    Similarities equal to the -1 sentinel from safe_model_similarity are left
    out of the mean, and a word with no usable similarity is dropped. The
    result is ordered from most to least similar; ties keep their order in
    *words*.
    """
    # Fix: the sentinel used to be averaged in, so one out-of-vocabulary
    # reference word in `lst` dragged every score toward -1.
    scored = []
    for word in words:
        similarities = [safe_model_similarity(reference, word) for reference in lst]
        usable = [value for value in similarities if value != -1]
        if usable:
            scored.append((word, np.mean(usable)))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in scored[:k]]
# Are the "feminist" seed words more closely related to men or to women?
def is_disney_corpus_biased():
    """Compare how many feminist seed words rank near women versus near men.

    The feminist and non-feminist seed lists are merged, the 10 words closest
    to ``men_words`` and the 10 closest to ``women_words`` are printed, and the
    function returns the count of feminist words in the women's top 10 minus
    the count in the men's top 10.
    """
    candidates = feminist_words + non_feminist_words
    closest_to_men = get_k_most_similar_to_list(10, men_words, candidates)
    closest_to_women = get_k_most_similar_to_list(10, women_words, candidates)
    print(closest_to_men)
    print(closest_to_women)
    feminist_near_men = sum(1 for word in closest_to_men if word in feminist_words)
    feminist_near_women = sum(1 for word in closest_to_women if word in feminist_words)
    return feminist_near_women - feminist_near_men
def is_movie_feminist(name):
    """Return the feminist score minus the non-feminist score of a whole script.

    Both scores are computed by get_feminism_score over all tokens of the
    movie, so a positive result means the feminist vocabulary scored higher.
    """
    tokens = get_formatter(name).get_tokens()
    against = get_feminism_score(non_feminist_words, tokens)
    in_favour = get_feminism_score(feminist_words, tokens)
    return in_favour - against
def create_word2vec_model_from_scripts():
    """Train and return a Word2Vec model on the lines of the movie scripts.

    Every movie in ``utils.names`` except 'Kung Fu Panda' contributes one
    sentence per ``script_map`` entry: the entry's 'text' with punctuation
    removed, split on single spaces, with the last piece discarded and stop
    words filtered out. The model is trained with ``min_count=1``.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    sentences = []
    for name in utils.names:
        if name == 'Kung Fu Panda':
            continue
        my_movie = get_formatter(name)
        for line in my_movie.script_map:
            pieces = line['text'].translate(strip_punctuation).split(" ")
            sentences.append(pieces[:-1])
    filtered = [[token for token in sentence if token not in stop_words] for sentence in sentences]
    return Word2Vec(filtered, min_count=1)
def tsne_plot(model):
    """Project the model's word vectors to 2-D with t-SNE, then save and show the plot.

    Every word in ``model.wv.vocab`` is plotted as one annotated point on a
    16x16 inch figure. The figure is written to 'testplot.png' and then
    displayed.
    """
    labels = list(model.wv.vocab)
    # Read vectors through `model.wv`; indexing the model itself is deprecated.
    tokens = [model.wv[word] for word in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per point, as before, so each point gets its own colour.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    # Fix: save before show(); once the window is closed the figure is gone and
    # savefig would write a blank image.
    plt.savefig('testplot.png')
    plt.show()
def get_formatter(name):
    """Build the script formatter that ``Utils.formatter(name)`` selects.

    The selector values '1' to '5' pick Formatter1 to Formatter5; any other
    value falls back to Formatter6.
    """
    kind = Utils.formatter(name)
    if kind == '1':
        return FirstFormatter(name)
    if kind == '2':
        return SecondFormatter(name)
    if kind == '3':
        return ThirdFormatter(name)
    if kind == '4':
        return FourthFormatter(name)
    if kind == '5':
        return FifthFormatter(name)
    return SixthFormatter(name)
def get_female_part(chars_map, females, males):
    """Return the share of counted values that belongs to female characters.

    Args:
        chars_map: iterable of (name, value) pairs.
        females: lowercase names counted as female.
        males: lowercase names counted as male.

    Returns:
        female total / (female total + male total), or 0 when no listed
        character appears. Characters in neither list are ignored.
    """
    females_total = 0
    overall_total = 0
    for key, val in chars_map:
        # Fix: lowercase once for both lookups. The male test used the raw
        # key, so mixed-case names never matched the lowercase `males` list.
        speaker = key.lower()
        if speaker in females:
            females_total += val
            overall_total += val
        elif speaker in males:
            # `elif` stops a name present in both lists from being counted twice.
            overall_total += val
    if overall_total == 0:
        return 0
    return females_total / overall_total
def bechdelTest(arr, females, males):
    """Grade a script on a 0-3 Bechdel-style scale.

    Args:
        arr: sequence of dicts, one per spoken line. The value under 'person'
            is the speaker; the value under any other key is taken as the
            line's text.
        females: lowercase names of female characters.
        males: lowercase names of male characters.

    Returns:
        0 -- fewer than two distinct female speakers;
        1 -- at least two female speakers, but never on consecutive lines;
        2 -- two different female speakers on consecutive lines;
        3 -- as 2, and neither of those two lines mentions a male name or
             a male word such as 'he' or 'him'.
    """
    import re  # local import: this file has no module-level `import re`

    male_words = ['him', 'he', 'himself', 'his', 'man', 'male', 'boy', 'guy', 'son']
    # Fix: match whole words only. The old substring test found 'he' inside
    # 'the', 'she' or 'there', which made level 3 practically unreachable.
    # Lookarounds are used instead of \b so names ending in '.' still match.
    terms = [re.escape(term) for term in list(males) + male_words if term]
    male_pattern = re.compile(r'(?<!\w)(?:' + '|'.join(terms) + r')(?!\w)')

    def mentions_male(line):
        # True when the lowercased line contains any male name or word.
        return male_pattern.search(line.lower()) is not None

    speaking_females = []
    two_women_talk = False
    prev_person = None
    prev_text = None
    for entry in arr:
        for key, value in entry.items():
            if key == 'person':
                person = value.lower()
            else:
                text = value
        if person in females and person not in speaking_females:
            speaking_females.append(person)
        # On the first line prev_person is None, so this test is False.
        if person != prev_person and person in females and prev_person in females:
            two_women_talk = True
            if not (mentions_male(text) or mentions_male(prev_text)):
                return 3
        prev_person, prev_text = person, text
    if len(speaking_females) < 2:
        return 0
    return 2 if two_women_talk else 1
def males():
    """Return the character names this analysis treats as male.

    Names are in mixed case exactly as listed (callers lowercase them before
    matching). A fresh list is built on every call.
    """
    names = (
        'MATER', 'MCQUEEN', 'FRANCESCO', 'SARGE', 'BRENT MUSTANGBURGER', 'PROFESSOR ZUNDAPP', 'MILES AXLEROD',
        'TOMBER', 'GREM', 'ACER', 'UNCLE TOPOLINO', 'ROD REDLINE', 'VLADIMIR TRUNKOV', 'CRABBY',
        'Lightning McQueen', 'Tow Mater', 'Doc Hudson', 'Bob Cutlass', 'Luigi', 'Darrell Cartrip',
        'Chick', 'Sheriff', 'Mack', 'Lizzie', 'Ramone', 'The King', 'Cameramen', 'Sarge', 'Van',
        'Bing Bong', 'Fear', 'Anger', 'Dad', 'Timothy Q. Mouse', 'Ringmaster', 'Mr. Stork',
        "O'Malley", 'Edgar', 'Roquefort', 'Napoleon', 'Berlioz', 'Toulouse', 'Georges', 'Lafayette', 'Scat Cat',
        'White Rabbit', 'Mad Hatter', 'Dodo', 'March Hare', 'Cheshire Cat', 'Walrus', 'Caterpillar', 'Doorknob', 'King',
        'Marlin', 'GILL', 'NEMO', 'NIGEL', 'SHERMAN', 'BRUCE', 'CRUSH', 'BLOAT', 'GURGLE', 'MOONFISH', 'Coral', 'CHUM',
        'Pongo', 'Jasper', 'Roger', 'Colonel', 'Sergeant Tibbs', 'Horace', 'Patch', 'Danny',
        'JOHN SMITH', 'RATCLIFFE', 'POWHATAN', 'BEN', 'THOMAS', 'LON', 'WIGGINS',
        'Maui', 'Tamatoa', 'Tui', 'Fergus', 'Mr. Potato Head', 'Wheezy', 'Ken', 'Chuckles', 'Lotso',
        'Mushu', 'Shang', 'Chi Fu', 'Emperor', 'Shan Yu', 'General', 'Yao', 'Ling',
        'Scuttle', 'Eric', 'Triton', 'Grimsby', 'Flounder', 'Sebastian', 'KRISTOFF', 'OLAF', 'HANS', 'DUKE',
        'GRAND PABBIE', 'OAKEN', 'WOODY', 'BUZZ', 'SID', 'Al', 'Stinky Pete', 'Utility Belt Buzz',
        'REX', 'SARGENT', 'ANDY', 'HAMM', 'SLINKY', 'NICK', 'BOGO', 'STU HOPPS', 'CLAWHAUSER',
        'LIONHEART', 'DUKE WEASELTON', 'GIDEON GREY', 'YAX THE HIPPIE YAK', 'YAX',
        'ALADDIN', 'GENIE', 'JAFAR', 'SULTAN', 'IAGO', 'PEDDLER', 'COGSWORTH', 'GASTON',
        'LUMIERE', 'BEAST', 'MAURICE', 'LEFOU', 'CHIP', 'Hercules', 'Hades', 'Phil', 'Zeus', 'Pain',
        'Panic', 'Man', 'Amphitryon', 'Hermes', 'Atropos', 'Baloo', 'Bagheera', 'Mowgli', 'Hathi',
        'Buzzy', 'Kaa', 'Shere Khan', 'Louie', 'Flaps', 'Dizzy', 'Hathi Jr.', 'Hubert', 'Phillip', 'Stefan',
    )
    return list(names)
def get_movie_rate(name):
    """Return ``(0, meaning_score)`` for the movie called *name*.

    A Bechdel component (scaled to 0-2) and a female-share component (scaled
    by 3) are still computed, but their sum is not returned: the first element
    of the result is always 0 and the second is get_meaning_score(name).
    """
    my_movie = get_formatter(name)
    movie_arr = my_movie.script_map
    chars_arr = my_movie.get_chars_tuples()
    female_names = [female.lower() for female in females()]
    male_names = [male.lower() for male in males()]
    bechdel_part = (bechdelTest(movie_arr, female_names, male_names) / 3) * 2
    share_part = get_female_part(chars_arr, female_names, male_names) * 3
    rate = 0 + bechdel_part + share_part  # kept for the disabled return below
    female_meaning = get_meaning_score(name)
    # return rate, female_meaning
    return 0, female_meaning
def get_year(name):
    """Return the year stored for the movie title *name*.

    Raises:
        KeyError: if *name* is not one of the titles listed below.
    """
    release_years = dict((
        ('Frozen', 2013),
        ('Toy Story', 1995),
        ('Zootopia', 2016),
        ('Aladdin', 1992),
        ('Beauty and the Beast', 1991),
        ('Hercules', 1997),
        ('Jungle Book', 1967),
        ('Sleeping Beauty', 1959),
        ('Brave', 2012),
        ('Moana', 2016),
        ('Mulan', 1998),
        ('The Little Mermaid', 1989),
        ('Pocahonta', 1995),
    ))
    return release_years[name]
# Script entry point: score every movie in utils.data['movies'], rescale the
# meaning scores to 0-5, print them as a table and plot score against year
# with a linear trend line.
if __name__ == '__main__':
    # model = create_word2vec_model_from_scripts()
    # for name in utils.names:
    #     print(name)
    #     my_movie = get_formatter(name)
    #     # print(my_movie.get_characters())
    #     print(my_movie.get_chars_tuples())
    #
    movies = utils.data['movies']
    meaning_rates = []
    years = []
    final_names = []
    rates = []
    for movie in movies:
        # get_movie_rate builds the formatter itself, so the extra
        # get_formatter() call that used to sit here was redundant.
        name = movie['name']
        this_rate, meaning = get_movie_rate(name)
        meaning_rates.append(meaning)
        rates.append(this_rate)
        years.append(movie['year'])
        final_names.append(name)
    print(meaning_rates)

    # Min-max normalise the meaning scores onto 0-5.
    min_meaning = np.asarray(meaning_rates).min()
    meaning_rates = [x - min_meaning for x in meaning_rates]
    print(meaning_rates)
    max_meaning = np.asarray(meaning_rates).max()
    if max_meaning:
        meaning_rates = [(x / max_meaning) * 5 for x in meaning_rates]
    # else: all scores are equal, so they stay at 0 instead of dividing by zero.
    print(meaning_rates)

    t = PrettyTable(['Name', 'Meaning'])
    for i, scaled_meaning in enumerate(meaning_rates):
        rates[i] += scaled_meaning
        t.add_row([final_names[i], rates[i]])
    print(t)
    for title, rate in zip(final_names, rates):
        print(title, " - ", rate)

    fig, ax = plt.subplots()
    # Random colour and marker size per movie; the draw order is unchanged.
    colors = np.random.rand(len(years))
    area = (100 * np.random.rand(len(years)))
    ax.scatter(years, rates, s=area, c=colors, alpha=0.5)
    # ax.set_facecolor('xkcd: white')
    ax.grid()
    for i, txt in enumerate(final_names):
        ax.annotate(txt, (years[i], rates[i]))
    # Least-squares straight line through (year, rate).
    z = np.polyfit(years, rates, 1)
    p = np.poly1d(z)
    print(p)
    plt.plot(years, p(years), color=(0.2, 0.4, 0.6, 0.6))
    plt.show()
    # print(my_movie.script_map)
    # print(is_disney_corpus_biased())
    # arr2 = [[name, get_meaning_score(name)] for name in utils.names]
    # print (sorted(arr2, key=lambda x:x[1], reverse=True))
    #
    # print (sorted(arr, key=lambda x:x[1], reverse=True))
    # print (name, is_movie_feminist(name))
    # # model.save("frozen_model.bin")
    # print(is_disney_corpus_biased())