# coding: utf-8
# # The Office Neural Network
#
# ### The goal of this notebook is to create a Recurrent Neural Network that generates new scripts for The Office
#
# ##### The first step is to condense this dataframe down into a single .txt file
# In[153]:
import pandas as pd
import numpy as np
import os
import pickle
import helpers
import warnings
import tensorflow as tf
from datetime import datetime
from tensorflow.keras.callbacks import TensorBoard
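# Note: this notebook uses the TensorFlow 1.x graph API (tf.placeholder,
# tf.contrib.rnn, tf.Session); tf.contrib was removed in TensorFlow 2.x,
# so a 1.x install is assumed. A minimal sketch of a runtime guard:
assert tf.__version__.startswith('1.'), 'the tf.contrib-based code below requires TensorFlow 1.x'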
# In[136]:
df = pd.read_csv('cleaned-data.csv')
# In[137]:
df.head(10)
# Here I condense the dataframe into a single string, one "speaker: line" entry per row
# In[138]:
file = ""
for i in range(len(df['id'])):
file += str(df['speaker'].iloc[i] + ': ' + str(df['line_text'].iloc[i])) + '\n'
# In[139]:
print(file[0:500])
# Now we write this text data to a file so we can reopen it later without having to rerun the code above!
# In[140]:
script = open('./data/script.txt', 'w', encoding='utf-8')
script.write(file)
script.close()
# In[141]:
from collections import Counter
def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
    return (vocab_to_int, int_to_vocab)
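# Quick sanity check of the lookup tables on a toy word list (illustrative
# example with made-up words, not data from the script):
sample_words = ['michael', 'scott', 'that', 'is', 'what', 'she', 'said', 'that']
v2i, i2v = create_lookup_tables(sample_words)
assert all(i2v[v2i[w]] == w for w in sample_words)  # ids round-trip to words
print(v2i['that'])  # 'that' appears twice, so it gets the smallest id: 0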
# In[142]:
def token_lookup():
    """
    Generate a dict to turn punctuation into a token.
    :return: Tokenize dictionary where the key is the punctuation and the value is the token
    """
    tokenized_text = {
        '.': '<PERIOD>',
        ',': '<COMMA>',
        '"': '<QUOTATION_MARK>',
        ';': '<SEMICOLON>',
        '!': '<EXCLAMATION_MARK>',
        '?': '<QUESTION_MARK>',
        '(': '<LEFT_PAREN>',
        ')': '<RIGHT_PAREN>',
        '--': '<DASH>',
        '\n': '<RETURN>'
    }
    return tokenized_text
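# Sketch of how these tokens are applied during preprocessing (an assumption
# about helpers.preprocess_and_save_data's internals, which are not shown in
# this notebook: each punctuation mark is replaced by its padded token so
# punctuation becomes its own "word"):
demo = "No, God! No, God, please no!"
for punct, token in token_lookup().items():
    demo = demo.replace(punct, ' {} '.format(token))
print(demo.split())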
# In[143]:
helpers.preprocess_and_save_data('./data/script.txt', token_lookup, create_lookup_tables)
# In[146]:
int_text, vocab_to_int, int_to_vocab, token_dict = helpers.load_preprocess()
# In[155]:
def get_inputs():
    """
    Create TF Placeholders for input, targets, and learning rate.
    :return: Tuple (input, targets, learning rate)
    """
    input = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    return (input, targets, learning_rate)
# ## RNN TIME!!!
# In[156]:
def get_init_cell(batch_size, rnn_size):
    """
    Create an RNN Cell and initialize it.
    :param batch_size: Size of batches
    :param rnn_size: Size of RNNs
    :return: Tuple (cell, initialize state)
    """
    lstm_layers = 2
    # Stack up multiple LSTM layers, for deep learning. Each layer gets its
    # own BasicLSTMCell instance; reusing a single instance for every layer
    # would make the layers share weights (and errors out on newer TF 1.x)
    cell = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.BasicLSTMCell(rnn_size) for _ in range(lstm_layers)])
    # Getting an initial state of all zeros
    initial_state = tf.identity(cell.zero_state(batch_size, tf.float32), name='initial_state')
    return (cell, initial_state)
# In[157]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Create embedding for <input_data>.
    :param input_data: TF placeholder for text input.
    :param vocab_size: Number of words in vocabulary.
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input.
    """
    embedding_weight = tf.Variable(tf.truncated_normal((vocab_size, embed_dim), stddev=0.01))
    embed_layer = tf.nn.embedding_lookup(embedding_weight, input_data)
    return embed_layer
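# Shape sanity check for the embedding on a throwaway graph (illustrative
# example; the real vocab_size comes from the preprocessed data):
g = tf.Graph()
with g.as_default():
    ids = tf.placeholder(tf.int32, [None, None])
    embedded = get_embed(ids, vocab_size=10, embed_dim=4)
print(embedded.shape)  # (?, ?, 4): [batch, seq_length, embed_dim]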
# In[158]:
def build_rnn(cell, inputs):
    """
    Create an RNN using an RNN Cell
    :param cell: RNN Cell
    :param inputs: Input text data
    :return: Tuple (Outputs, Final State)
    """
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')
    return outputs, final_state
# In[159]:
def build_nn(cell, rnn_size, input_data, vocab_size):
    """
    Build part of the neural network
    :param cell: RNN cell
    :param rnn_size: Size of rnns
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :return: Tuple (Logits, FinalState)
    """
    embed = get_embed(input_data, vocab_size, rnn_size)
    outputs, final_state = build_rnn(cell, embed)
    logits = tf.contrib.layers.fully_connected(outputs,
                                               vocab_size,
                                               weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
                                               activation_fn=None)
    return logits, final_state
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: Batches as a Numpy array
    """
    num_items = len(int_text)
    num_batches = num_items // (batch_size * seq_length)
    batches = np.zeros(shape=(num_batches, 2, batch_size, seq_length), dtype=np.int32)
    for item in range(num_batches):
        for batch in range(batch_size):
            # Each row walks its own contiguous chunk of text: striding by
            # num_batches * seq_length keeps every index in range and lets
            # batch i+1 continue the sequences of batch i
            start_x = batch * num_batches * seq_length + item * seq_length
            end_x = start_x + seq_length
            if end_x < num_items:
                start_y = start_x + 1
                end_y = start_y + seq_length
                batches[item, 0, batch, :] = int_text[start_x:end_x]
                batches[item, 1, batch, :] = int_text[start_y:end_y]
    return batches
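# Quick shape check of get_batches on a toy id sequence (illustrative example):
toy_ids = list(range(1, 21))                      # 20 fake word ids
toy = get_batches(toy_ids, batch_size=2, seq_length=3)
print(toy.shape)   # (3, 2, 2, 3): 3 batches, (input, target), 2 rows, 3 steps
print(toy[0][0])   # inputs of the first batch
print(toy[0][1])   # targets: the same rows shifted one word ahead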
# In[161]:
# Number of Epochs
num_epochs = 125
# Batch Size
batch_size = 512
# RNN Size
rnn_size = 1024
# Sequence Length
seq_length = 30
# Learning Rate
learning_rate = 0.01
# Show stats for every n number of batches
show_every_n_batches = 100
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
save_dir = './save'
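# With these settings each batch consumes batch_size * seq_length
# = 512 * 30 = 15,360 word ids, so the number of batches per epoch is
# len(int_text) // 15360:
print('Batches per epoch:', len(int_text) // (batch_size * seq_length))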
# In[162]:
from tensorflow.contrib import seq2seq
train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size)
    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')
    # Loss function
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))
    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)
    # Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients]
    train_op = optimizer.apply_gradients(capped_gradients)
# In[ ]:
batches = get_batches(int_text, batch_size, seq_length)
now = datetime.now()
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})
        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)
            # This Keras TensorBoard callback is instantiated but never attached
            # to a model, so it only builds a log directory name and has no
            # effect on the raw-session training above
            NAME = "epoch-{}-batch-{}-{}".format(epoch_i, batch_i, now.strftime("%Y%m%d-%H%M%S"))
            tensorboard = TensorBoard(log_dir='logs/main/{}'.format(NAME))
            # Show every <show_every_n_batches> batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))
    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')
# In[ ]:
helpers.save_params((seq_length, save_dir))
# In[ ]:
_, vocab_to_int, int_to_vocab, token_dict = helpers.load_preprocess()
seq_length, load_dir = helpers.load_params()
# In[ ]:
def get_tensors(loaded_graph):
    """
    Get input, initial state, final state, and probabilities tensor from <loaded_graph>
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    InputTensor = loaded_graph.get_tensor_by_name("input:0")
    InitialStateTensor = loaded_graph.get_tensor_by_name("initial_state:0")
    FinalStateTensor = loaded_graph.get_tensor_by_name("final_state:0")
    ProbsTensor = loaded_graph.get_tensor_by_name("probs:0")
    return InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor
# In[ ]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word in the generated text
    :param probabilities: Probabilities of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    word_id = np.random.choice(len(probabilities), p=probabilities)
    return int_to_vocab[word_id]
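# Tiny demonstration of pick_word on a made-up three-word distribution
# (illustrative only; the real probabilities come from the softmax over
# the full vocabulary):
demo_probs = np.array([0.1, 0.2, 0.7])
demo_vocab = {0: 'michael', 1: 'dwight', 2: 'jim'}
print(pick_word(demo_probs, demo_vocab))  # prints 'jim' about 70% of the time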
# In[ ]:
gen_length = 500
# Speaker to prime the script with, e.g. michael, dwight, jim, or pam
prime_word = 'michael'
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)
    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)
    # Sentences generation setup
    gen_sentences = [prime_word + ':']
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})
    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])
        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        pred_word = pick_word(probabilities[0][dyn_seq_length - 1], int_to_vocab)
        gen_sentences.append(pred_word)
    # Remove tokens
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        tv_script = tv_script.replace(' ' + token.lower(), key)
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')
    print(tv_script)
# Save the generated script as plain UTF-8 text
pred_script = open('./data/new_script_08_31_2018.txt', 'w', encoding='utf-8')
pred_script.write(tv_script)
pred_script.close()