RNN.py
"""
My first implemenation of recursive neural net (RNN)
with (hopefully) LSTM architecture for character analysis
to do 07/10/2015
-implement a checking function
-multiprocessing -- scan function is inefficient but does not
need to be done in sequence
-LSTM -- new weight matrices, new scan implementations (blah!)
-before LSTM maybe do multi layer RNN?
"""
import theano
import theano.tensor as T
import numpy as np
from character_mapping import Character_Map
import time
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle
import os
from datetime import datetime


class RNNClass(object):
    def __init__(self, nh, nx, ny):
        """
        This is only set up for a single hidden layer.
        args:
            nh is the size of the hidden layer vector
            nx is the size of the input vector
            ny is the size of the output vector (ny = nx in the character example)
        """
        self.wx = theano.shared(name='wx',
                                value=0.2 * np.random.uniform(-1.0, 1.0,
                                                              (nx, nh))
                                .astype(theano.config.floatX))  # input weights
        self.wh = theano.shared(name='wh',
                                value=0.2 * np.random.uniform(-1.0, 1.0,
                                                              (nh, nh))
                                .astype(theano.config.floatX))  # hidden layer weights
        self.wy = theano.shared(name='wy',
                                value=0.2 * np.random.uniform(-1.0, 1.0,
                                                              (nh, ny))
                                .astype(theano.config.floatX))  # output weights
        self.bh = theano.shared(name='bh',
                                value=np.zeros(nh,
                                               dtype=theano.config.floatX))  # hidden layer bias
        self.by = theano.shared(name='by',
                                value=np.zeros(ny,
                                               dtype=theano.config.floatX))  # output layer bias
        self.h0 = theano.shared(name='h0',
                                value=np.zeros(nh,
                                               dtype=theano.config.floatX))  # initial h vector
        self.sequence_length = 15

    def feed_through(self, x, h_tm1):
        """
        One step of the recurrence: given the current input x and the
        previous hidden state h_tm1, return the new hidden state and the
        softmax output. Basically copied from the Theano tutorial.
        """
        h = T.tanh(T.dot(x, self.wx) + T.dot(h_tm1, self.wh) + self.bh)
        y_hat = self.by + T.dot(h, self.wy)
        y_guess = T.nnet.softmax(y_hat)
        return h, y_guess
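
    # Shape note (added observation, not from the original file): for a
    # single step, x has shape (nx,) and h_tm1 (nh,), so h comes out (nh,).
    # T.nnet.softmax always returns a 2-D result, so y_guess has shape
    # (1, ny) -- which is why the loss functions below index the scan
    # output with s[:, 0, :].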

    # def loss(self, x, y):
    #     """
    #     args:
    #         - x is a vector containing the first character of a sequence
    #         - y is a vector containing the last character of the sequence
    #     ***assuming constant sequence length***
    #     """
    #     [h, s], _ = theano.scan(fn=self.feed_through,
    #                             sequences=x,
    #                             outputs_info=[self.h0, None])
    #     return -T.mean(T.log(s)[T.arange(y.shape[0]), y])

    def cross_entropy_loss(self, x, y):
        """
        Element-wise cross-entropy terms for one sequence (a matrix, not a
        scalar); train_index negates the mean of these across a minibatch.
        """
        [h, s], _ = theano.scan(fn=self.feed_through,
                                sequences=x,
                                outputs_info=[self.h0, None])
        y_guess = s[:, 0, :]  # drop the extra axis that softmax adds
        return y * T.log(y_guess) + (1.0 - y) * T.log(1.0 - y_guess)
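
    # Worked example (illustrative, added): with a one-hot target y = [0, 1]
    # and a prediction y_guess = [0.2, 0.8], the element-wise terms are
    # (1 - 0) * log(1 - 0.2) = log(0.8) and 1 * log(0.8), about -0.223 each;
    # negating their mean in train_index gives the usual positive
    # cross-entropy cost.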

    def sqr_diff_loss(self, x, y):
        """Squared-difference loss over one sequence."""
        [h, s], _ = theano.scan(fn=self.feed_through,
                                sequences=x,
                                outputs_info=[self.h0, None])
        y_guess = s[:, 0, :]
        return T.sum((y - y_guess) ** 2)

    def save_param(self, pickle_file):
        pickle_me = {
            'param': [self.wx, self.wh, self.wy, self.bh, self.by, self.h0]
        }
        with open(pickle_file, 'wb') as f:
            pickle.dump(pickle_me, f)

    def load_param(self, pickle_file):
        with open(pickle_file, 'rb') as f:
            pickle_me = pickle.load(f)
        param = pickle_me['param']
        self.wx, self.wh, self.wy, self.bh, self.by, self.h0 = param
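
    # Usage sketch (illustrative; the file name is hypothetical):
    #
    #     net = RNNClass(nh=100, nx=60, ny=60)
    #     net.save_param('checkpoint.dat')   # pickle the shared variables
    #     net.load_param('checkpoint.dat')   # restore them, in the same order
    #
    # Note that the Theano shared variables themselves are pickled, so
    # load_param rebinds the attributes rather than updating values in place.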

    # def train_no_index(self, training_data, learning_rate, n_epochs, mini_batch_size):
    #     """
    #     Right now using the cross-entropy loss function. This works, albeit very slowly.
    #     args:
    #         - training_data: inputs with ideal outputs
    #         - learning_rate
    #         - n_epochs: the number of epochs to train the NN for
    #         - mini_batch_size: the size of the minibatch to be used for SGD
    #     """
    #     train_x, train_y = training_data
    #     train_size_total = train_x.get_value(borrow=True).shape[0]
    #     n_train_batches = train_size_total / mini_batch_size
    #     x = T.matrix('x')
    #     y = T.matrix('y')
    #     xs = T.tensor3('xs')
    #     ys = T.itensor3('ys')
    #     # index = T.iscalar()
    #     cost = -T.mean(self.cross_entropy_loss(x, y))
    #     params = [self.wx, self.wh, self.wy, self.bh, self.by, self.h0]
    #     grads = T.grad(cost, params)
    #     updates = [(param, param - learning_rate * grad) for param, grad in zip(params, grads)]
    #     train_model = theano.function(
    #         inputs=[x, y],
    #         outputs=cost,
    #         updates=updates
    #     )
    #     train_x_val = train_x.get_value()
    #     train_y_val = train_y.get_value()
    #     print("function compiled\n\n")
    #     for j in xrange(n_epochs):
    #         sum_epoch = 0
    #         t1 = time.time()
    #         for i in xrange(n_train_batches):
    #             sum_mini_batch = 0
    #             t3 = time.time()
    #             x_slice = train_x_val[i * mini_batch_size: (i + 1) * mini_batch_size]
    #             y_slice = train_y_val[i * mini_batch_size: (i + 1) * mini_batch_size]
    #             xy_size = x_slice.shape[0]
    #             for h in xrange(xy_size):
    #                 sum_mini_batch += train_model(x_slice[h], y_slice[h])
    #             print("Time for minibatch: {}".format(time.time() - t3))
    #             # print("Time making sum {}".format(time.time() - t4))
    #             if i % 30 == 0:
    #                 print("Sum for minibatch number {} out of {}: {}".format(i, n_train_batches, sum_mini_batch))
    #             sum_epoch += sum_mini_batch
    #         print("Sum for this epoch: {:.3f}, took {:.3f} sec".format(sum_epoch, time.time() - t1))
    #         if j % 5 == 0:
    #             t2 = time.time()
    #             self.save_param("param_epoch{}.dat".format(j))  # was format(i): i is the minibatch index, j the epoch
    #             print("Pickling epoch number {} took {:.3f} sec".format(j, time.time() - t2))

    def train_index(self, training_data, learning_rate, n_epochs, mini_batch_size):
        """
        Trains with minibatch SGD, indexing into shared training data via
        `givens`, with the negated mean cross-entropy as the cost. (An
        earlier version used the squared-difference loss; I couldn't get
        the log-loss version above to work out for me.)
        args:
            - training_data: inputs with ideal outputs
            - learning_rate
            - n_epochs: the number of epochs to train the NN for
            - mini_batch_size: the size of the minibatch to be used for SGD
        """
        foo = datetime.now()
        param_folder = "param_{}-{}_{}:{}/".format(foo.day, foo.month, foo.hour, foo.minute)
        os.mkdir(param_folder)
        print("Saving initial parameters")
        self.save_param("{}param_epoch{}.dat".format(param_folder, 0))
        # print("Using train function with indices")
        train_x, train_y = training_data
        train_size_total = train_x.get_value(borrow=True).shape[0]
        n_train_batches = train_size_total / mini_batch_size  # integer division (Python 2)
        # x = T.matrix('x')
        # y = T.matrix('y')
        xs = T.tensor3('xs')
        ys = T.tensor3('ys')
        index = T.iscalar()
        # cost = self.cross_entropy_loss(x, y)
        # map the per-sequence loss over the whole minibatch
        results, _ = theano.scan(lambda xi, yi: self.cross_entropy_loss(xi, yi),
                                 sequences=[xs, ys])
        loss_fn = -T.mean(results)  # loss must be a scalar value, not a matrix
        params = [self.wx, self.wh, self.wy, self.bh, self.by, self.h0]
        grads = T.grad(loss_fn, params)
        updates = [(param, param - learning_rate * grad) for param, grad in zip(params, grads)]
        train_model = theano.function(
            inputs=[index],
            outputs=loss_fn,
            updates=updates,
            givens={
                xs: train_x[index * mini_batch_size: (index + 1) * mini_batch_size],
                ys: train_y[index * mini_batch_size: (index + 1) * mini_batch_size]
            }
        )
        print("Function compiled!")
        print("Training model")
        for i in xrange(n_epochs):
            t1 = time.time()
            for index in xrange(n_train_batches):
                t2 = time.time()
                train_model(index)
                if index % 30 == 0:
                    print("{} out of {} minibatches done, took ~{:.3f} sec".format(
                        index, n_train_batches, 30 * (time.time() - t2)))
            print("Epoch number {}, took {:.3f} sec".format(i, time.time() - t1))
            # if i % 2 == 0:
            t2 = time.time()
            self.save_param("{}param_epoch{}.dat".format(param_folder, i))
            print("Pickling epoch number {} took {:.3f} sec".format(i, time.time() - t2))
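
    # Design note (added, not from the original file): the `givens`/index
    # pattern keeps the whole training set in shared variables (on the GPU
    # when one is available) and transfers only an integer per call. This is
    # the standard Theano minibatch idiom, and presumably why train_index is
    # much faster than the commented-out train_no_index above, which fed each
    # example in as a function argument.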

    def sequence_guess(self, x_init, sequence_length):
        """
        Given some x_init vector, generate a sequence of characters
        'sequence_length' long by running x_init through the RNN and
        feeding each output back in as the next input.
        Returns a list of argmax indices for the generated sequence.
        """
        x0 = T.vector('x0')
        h0 = T.vector('h0')
        h, y_intermediate = self.feed_through(x0, h0)
        f1 = theano.function(inputs=[x0, h0],
                             outputs=[h, y_intermediate])
        f2 = theano.function([x0], T.argmax(x0))
        hi, yi = f1(x_init, self.h0.get_value())
        # ys = [yi[0]]
        y_argmax = [f2(yi[0])]
        for i in xrange(1, sequence_length):
            hi, yi = f1(yi[0], hi)
            # ys.append(yi[0])
            y_argmax.append(f2(yi[0]))
        return y_argmax
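
    # Note (added observation): f1 and f2 are recompiled on every call to
    # sequence_guess, which is slow; for repeated sampling it would likely
    # be cheaper to compile them once (e.g. in __init__) and reuse them.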

    # def gen_random_sentence(self, x_init):
    #     """
    #     Run 'x_init' through the RNN, saving the y values at
    #     each 'time step'.
    #     """
    #     ys = []
    #     h, y = self.feed_through(x_init, self.h0)  # feed_through returns (h, y)
    #     ys.append(y)
    #     for i in xrange(1, self.sequence_length):
    #         h, y = self.feed_through(y, h)
    #         ys.append(y)
    #     # ys = [y.eval() for y in ys]
    #     # ys_arg_max = [np.argmax(y) for y in ys]
    #     return ys

    # def compile_gen_sentence(self):
    #     """
    #     Compile a Theano function that takes the initial x value
    #     and returns y vectors for each of the subsequent positions.
    #     """
    #     x = T.vector('x')
    #     y = self.gen_random_sentence(x)
    #     f = theano.function([x], y)
    #     return f


if __name__ == '__main__':
    text_test = './../texts/melville.txt'
    char_map_obj = Character_Map(text_test, 'mapping.dat', overwrite=True, break_line=None)
    char_map_obj.k_map()
    x, y, shared_x, shared_y = char_map_obj.gen_x_and_y(filename=None)
    # print(shared_x, shared_y.get_value().shape[0])
    nh = 100
    nx = len(char_map_obj.unique_char)
    ny = nx
    trainer = RNNClass(nh, nx, ny)
    # jobs = []
    # for i in xrange(2):
    #     p = multiprocessing.Process(target=trainer.train,
    #                                 args=((shared_x, shared_y), 0.03, 1000, 10,))
    #     jobs.append(p)
    #     p.start()
    # trainer.load_param('param_epoch95.dat')
    trainer.train_index(training_data=(shared_x, shared_y),
                        learning_rate=0.01,
                        n_epochs=100,
                        mini_batch_size=1000)
    # f = trainer.compile_gen_sentence()
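    # After training, one could sample from the net along these lines
    # (illustrative sketch; `int_to_char`, a mapping from indices back to
    # characters, is hypothetical -- the real name depends on Character_Map):
    #
    #     seed = np.zeros(nx, dtype=theano.config.floatX)
    #     seed[0] = 1.0                      # one-hot seed character
    #     indices = trainer.sequence_guess(seed, 50)
    #     print(''.join(int_to_char[i] for i in indices))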