# lstm_dec.py
import numpy as np
import theano.tensor as T
from theano import function, shared, scan, pp, scan_module
from theano.tensor.nnet import sigmoid, softmax
from misc import random_weight_matrix
# # For debugging:
# from theano import config
# config.exception_verbosity = 'high'
rng = np.random
class LSTMDec:
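    """Standalone LSTM decoder.

    The gate and candidate activations are computed from the previous hidden
    state only (there are no input-side W matrices in this decoder); U and b
    map the hidden state to a softmax distribution over output tokens.
    """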
    def __init__(self, hdim, outdim, alpha=.005, rho=.0001, rseed=10):
        # Dimensions
        self.hdim = hdim
        self.outdim = outdim
        self.out_end = outdim # the end token
        # Parameters
        np.random.seed(rseed)
        # Learning rate
        self.alpha = alpha
        # Regularization
        self.rho = rho
        ## Theano stuff
        # Params as theano.shared matrices
        # U*: applied to the previous hidden vector
        # (this decoder has no input-side W matrices)
        # i: input, f: forget, o: output, c: new-cell
        self.Ui = shared(random_weight_matrix(hdim, hdim), name='Ui')
        self.Uf = shared(random_weight_matrix(hdim, hdim), name='Uf')
        self.Uo = shared(random_weight_matrix(hdim, hdim), name='Uo')
        self.Uc = shared(random_weight_matrix(hdim, hdim), name='Uc')
        self.U = shared(random_weight_matrix(outdim, hdim), name='U')
        self.b = shared(np.zeros([outdim, 1]), name='b', broadcastable=(False, True))
        self.params = [self.Ui, self.Uf, self.Uo, self.Uc, self.U, self.b]
        self.vparams = [0.0*param.get_value() for param in self.params]
        # symbolic generate, compiled here so that generate() below works
        ch_prev = T.vector('ch_prev')
        self.generate_function = function([ch_prev], self.symbolic_generate(ch_prev))
        print('done compiling')

    def reset_grads(self):
        """Resets all grads to zero (maintaining shape!).

        Assumes self.dparams (a list of shared gradient accumulators) has been
        created elsewhere; it is not defined in this file."""
        for dparam in self.dparams:
            dparam.set_value(0.0 * dparam.get_value())

    def lstm_timestep(self, y_t, old_cost, ch_prev):
        """Calculates info to pass to the next time step.

        y_t holds the labels for this time step across the batch (padded with
        -1); ch_prev stacks the cell and hidden states, so its first axis has
        size 2*hdim."""
        # Keep only the positions whose label is >= 0 (i.e. not padding)
        y_filtered_ind = T.ge(y_t, 0).nonzero()[0]
        y_filtered = y_t[y_filtered_ind]
        # break up into c and h
        c_prev = ch_prev[:self.hdim]
        h_prev = ch_prev[self.hdim:]
        # gates (input, forget, output)
        i_t = sigmoid(T.dot(self.Ui, h_prev))
        f_t = sigmoid(T.dot(self.Uf, h_prev))
        o_t = sigmoid(T.dot(self.Uo, h_prev))
        # new memory cell
        c_new_t = T.tanh(T.dot(self.Uc, h_prev))
        # final memory cell
        c_t = f_t * c_prev + i_t * c_new_t
        # final hidden state
        h_t = o_t * T.tanh(c_t)
        # Input to the softmax
        theta_t = T.dot(self.U, h_t) + self.b
        # Softmax prob vector (Theano's softmax works row-wise on 2-d input,
        # hence the transposes: each column of y_hat_t is a distribution)
        y_hat_t = softmax(theta_t.T).T
        # Compute new cost: negative log-likelihood of the correct labels
        cost = T.sum(-T.log(y_hat_t[y_filtered, y_filtered_ind]))
        new_cost = old_cost + cost
        # final joint state
        ch_t = T.concatenate([c_t, h_t])
        return new_cost, ch_t

    def reg_updates_cost(self):
        """Returns the regularization contribution to the parameter updates and
        to the cost (weight decay on the matrices only, not the bias).

        Note: unlike the symbolic_* methods, this works on the numpy values of
        the parameters, so its outputs are plain arrays and a float."""
        param_values = [param.get_value() for param in self.params]
        updates = [self.rho * param if len(param.shape) > 1 else 0 * param for param in param_values]
        reg_cost = 0.5 * self.rho * sum(np.sum(param**2) for param in param_values if len(param.shape) > 1)
        return (updates, reg_cost)

    def symbolic_f_prop(self, ys, h_prev):
        """Returns the symbolic total cost of the sequences ys given the
        initial cell/hidden state h_prev."""
        # Make sure all the examples in ys have the same length by this point
        # (by padding with -1)
        results, updates = scan(fn=self.lstm_timestep,
                                outputs_info=[np.float64(0.0), h_prev],
                                sequences=ys)
        # Return the cost (index 0) at the most recent timestep (-1)
        return results[0][-1]

    def symbolic_b_prop(self, cost_final):
        """Returns symbolic gradients of cost_final with respect to each
        parameter, in the same order as self.params."""
        new_dparams = []
        for param in self.params:
            new_dparams.append(T.grad(cost_final, param))
        return new_dparams

    def lstm_output(self, y_prev, ch_prev):
        """One step of greedy decoding: emits the argmax label and the info to
        pass to the next time step. ch_prev is a vector of size 2*hdim
        (a single example)."""
        # break up into c and h
        c_prev = ch_prev[:self.hdim]
        h_prev = ch_prev[self.hdim:]
        # gates (input, forget, output)
        i_t = sigmoid(T.dot(self.Ui, h_prev))
        f_t = sigmoid(T.dot(self.Uf, h_prev))
        o_t = sigmoid(T.dot(self.Uo, h_prev))
        # new memory cell
        c_new_t = T.tanh(T.dot(self.Uc, h_prev))
        # final memory cell
        c_t = f_t * c_prev + i_t * c_new_t
        # final hidden state
        h_t = o_t * T.tanh(c_t)
        # Input to the softmax; add a trailing axis so that b (a column)
        # broadcasts per class instead of blowing theta_t up to a square matrix
        theta_t = T.dot(self.U, h_t).dimshuffle(0, 'x') + self.b
        # Softmax prob vector (Theano's softmax works row-wise on 2-d input,
        # hence the transposes)
        y_hat_t = softmax(theta_t.T).T
        # Greedily pick the most likely output label
        out_label = T.argmax(y_hat_t)
        # final joint state
        ch_t = T.concatenate([c_t, h_t])
        return (out_label, ch_t), scan_module.until(T.eq(out_label, self.out_end))

    def symbolic_generate(self, h_prev):
        """Generate ys from a given h_prev by greedy argmax decoding; stops
        after at most 50 steps or when the end token is produced."""
        results, updates = scan(fn=self.lstm_output,
                                outputs_info=[np.int64(0), h_prev],
                                n_steps=50)
        return results[0]

    def generate(self, ch_prev):
        return self.generate_function(ch_prev)

if __name__ == '__main__':
    print('Sanity check')
    ld = LSTMDec(10, 10, rseed=10)
    # ys = [1,2,3,4]
    # ch_prev = np.ones(2*ld.hdim)
    # cost_final = ld.f_prop(ys, ch_prev)
    # print cost_final
    # ld.b_prop(ys, ch_prev)
    # print 'printing dparams'
    # for dparam in ld.dparams:
    #     print dparam.get_value()
    print('testing generate')
    print(ld.generate(np.ones(2 * ld.hdim)))
    print('done testing')
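
    # The block below is a minimal sketch (not part of the original file) of
    # how the symbolic f_prop / b_prop pieces might be compiled and exercised.
    # It assumes ys is a (timesteps, batch) int32 matrix padded with -1 and
    # that the initial cell/hidden state is a (2*hdim, batch) float matrix;
    # the variable names here (ys_sym, ch0_sym, ...) are illustrative only.
    print('testing symbolic_f_prop / symbolic_b_prop')
    ys_sym = T.imatrix('ys')
    ch0_sym = T.matrix('ch0')
    cost_sym = ld.symbolic_f_prop(ys_sym, ch0_sym)
    grad_syms = ld.symbolic_b_prop(cost_sym)
    fprop_bprop = function([ys_sym, ch0_sym], [cost_sym] + grad_syms)
    ys_ex = np.array([[1], [2], [3], [-1]], dtype=np.int32)  # one example, padded
    ch0_ex = np.ones((2 * ld.hdim, 1))
    outs = fprop_bprop(ys_ex, ch0_ex)
    print('cost:', outs[0])
    print('done testing f_prop/b_prop')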