class Decoder:
    """Seq2seq decoder: embedding -> stateful LSTM -> affine vocabulary scores.

    The encoder's final hidden state is injected once via ``lstm.set_state``;
    after that the LSTM carries its own state across time steps.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn

        # Parameter initialisation: small embeddings, 1/sqrt(fan-in) scaling
        # for the weight matrices, zero biases.  float32 throughout.
        embed_w = (randn(V, D) / 100).astype('f')
        lstm_Wx = (randn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (randn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (randn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_w)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        # Aggregate every sub-layer's params/grads into flat lists.
        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, h):
        """Return vocabulary scores for each position of xs, with the LSTM
        seeded from the encoder state h."""
        self.lstm.set_state(h)
        hidden = self.lstm.forward(self.embed.forward(xs))
        return self.affine.forward(hidden)

    def backward(self, dscore):
        """Backprop through affine, LSTM and embedding; return the gradient
        flowing back into the encoder's hidden state."""
        grad = self.affine.backward(dscore)
        grad = self.lstm.backward(grad)
        self.embed.backward(grad)
        return self.lstm.dh

    def generate(self, h, start_id, sample_size):
        """Greedily decode ``sample_size`` token ids starting from start_id."""
        self.lstm.set_state(h)
        sampled = []
        cur_id = start_id
        for _ in range(sample_size):
            # Shape the current id as a (1, 1) mini-batch of one time step.
            x = np.array(cur_id).reshape((1, 1))
            hidden = self.lstm.forward(self.embed.forward(x))
            score = self.affine.forward(hidden)
            cur_id = np.argmax(score.flatten())
            sampled.append(int(cur_id))
        return sampled
class TestTimeLSTM(unittest.TestCase):
    """Tests for TimeLSTM: state set/reset, forward pass, and backward pass.

    Fixture dimensions (read off the literals below): input size D = 3,
    hidden size H = 3, so the stacked gate weights are (D, 4H) = (3, 12)
    and (H, 4H) = (3, 12); the input batch xs is (N, T, D) = (3, 12, 3).
    The expected hs/dxs arrays are golden values — presumably captured from
    a trusted reference run of this implementation (TODO confirm origin).
    """

    def setUp(self):
        # Input-to-gates weight matrix, shape (D, 4H) = (3, 12).
        Wx = np.array([
            [9.72009451e-01, -4.97642862e-01, 6.45448952e-01, 8.10387855e-01,
             1.13757673e+00, -5.27114694e-01, -9.08624540e-01, 1.61896844e+00,
             -1.16690977e+00, 3.93476226e-01, -6.04018422e-01, 5.67830817e-01],
            [6.68131790e-01, 6.40157016e-01, 6.90200961e-01, -1.39750585e+00,
             -4.89624070e-01, 8.99789953e-01, 3.97067428e-04, 1.47459503e+00,
             -4.95030269e-01, -9.22541855e-01, -1.57352198e-01, -1.67160494e+00],
            [6.93508859e-01, -9.23177216e-01, -4.83511551e-01, -1.18675890e+00,
             -7.35505045e-01, -1.61403611e+00, -2.76067694e-02, -2.48294747e-01,
             1.14474446e+00, 1.86354309e-01, -1.73018002e+00, -4.82520536e-03]
        ])
        # Hidden-to-gates weight matrix, shape (H, 4H) = (3, 12).
        Wh = np.array([
            [-0.88480318, 1.09509583, 0.55657863, -0.35096014, 0.18572107,
             0.83823659, -0.44063768, -0.80897913, 0.35752315, 1.65812611,
             1.40425671, 1.52519905],
            [-0.22279229, 1.16363656, -0.47632291, -0.16436909, -2.16120359,
             0.28362134, 0.01817155, 0.04836914, -0.30831619, -2.0992645,
             -0.07302497, -0.72868125],
            [-1.40551611, 2.12755955, 1.76232202, 2.15703084, -1.87387492,
             1.22755896, -0.84271588, 1.07860737, -0.35473314, -0.86293879,
             1.67287773, 0.41575087]
        ])
        # Gate bias vector, shape (4H,) = (12,).
        b = np.array([
            0.46861655, 0.15954682, 0.38782221, 1.00791178, -0.38322573,
            0.83138721, 0.98675017, -0.83388618, 1.14392808, 0.37846653,
            0.47617248, -1.8035631
        ])
        # Layer under test (default, non-stateful construction).
        self.time_lstm = TimeLSTM(Wx, Wh, b)
        # Input batch, shape (N, T, D) = (3, 12, 3).
        self.xs = np.array([
            [
                [0.71755849, 0.60697008, -0.62888378],
                [-0.49626568, -0.4748135, 1.75968249],
                [-0.10438423, 0.28487314, 0.63474513],
                [-0.9923244, 0.45072551, -1.64868359],
                [1.46760434, 0.35565694, -0.66870418],
                [0.35348356, -0.93987496, -2.87130379],
                [-0.29246176, -1.37729218, -0.67958982],
                [-1.4001965, 0.55946231, 0.69675162],
                [-1.79238525, 1.57951988, 1.19779083],
                [0.87291494, 0.78168426, -0.78577742],
                [0.07307044, -1.61895973, 0.9379243],
                [1.99015425, 0.68183783, 1.77750001]
            ],
            [
                [-0.17596061, 0.81663486, 0.04359994],
                [-0.92350641, 0.45340969, 1.43348315],
                [2.33693572, 1.7515518, -1.3666055],
                [0.41772987, 0.24850998, 0.03112925],
                [-0.22745121, -0.22542461, -0.31234374],
                [-0.49538611, -0.44607479, -0.0899601],
                [-0.94219443, 1.00697691, -0.15114066],
                [-0.08454425, -0.13979634, -0.3159493],
                [-0.73268381, -0.15586239, 2.03090773],
                [-1.20719972, 0.2390794, 1.85456855],
                [0.67798489, -1.25981493, 0.7688309],
                [-1.02414315, 1.65732408, -0.29402155]
            ],
            [
                [0.07146087, -0.62179875, 0.3156025],
                [0.44587887, 0.71639604, 0.20851427],
                [1.06707963, 1.05846152, 1.49543497],
                [0.30010103, 1.20631821, 0.39232967],
                [-0.25315554, -0.32391953, -0.75328256],
                [-0.62199252, -1.39301922, 1.83188775],
                [-0.89011615, -0.5340496, 0.93040961],
                [0.42915033, -0.17455902, 0.29048757],
                [-1.15432513, -0.29427616, -0.37391368],
                [0.97202347, -2.12827099, 1.22032467],
                [1.48975681, -1.05964565, -0.64436522],
                [-0.81431589, -0.93004337, -0.10522209]
            ]
        ])

    def test_state(self):
        # set_state stores h verbatim (array compares equal, shape unchecked);
        # reset_state clears both the hidden and the cell state to None.
        h = np.random.randn(7, 7)
        self.time_lstm.set_state(h)
        assert_array_equal(h, self.time_lstm.h)
        self.time_lstm.reset_state()
        self.assertEqual(None, self.time_lstm.h)
        self.assertEqual(None, self.time_lstm.c)

    def test_forward(self):
        # Forward pass over the fixture batch must reproduce the golden hs,
        # shape (N, T, H) = (3, 12, 3).
        hs = self.time_lstm.forward(self.xs)
        assert_almost_equal(np.array([
            [
                [0.5301528, 0.4281083, 0.31726667],
                [-0.15125458, 0.11109135, -0.81322926],
                [-0.9154287, 0.03073275, -0.7118544],
                [0.01903052, 0.10519677, 0.51448697],
                [0.31846583, 0.43916756, 0.15142874],
                [0.6307919, 0.56641424, 0.12767577],
                [0.74492615, 0.41010588, 0.781526],
                [-0.21825846, 0.2604803, 1.3048488],
                [-0.7141188, -0.24568337, 1.0162368],
                [0.40743738, -0.7485301, 0.15660143],
                [0.6559933, -0.04994468, -0.91404927],
                [0.16290103, 0.7862598, -1.0801914]
            ],
            [
                [-0.24008924, -0.39829323, 0.67160046],
                [-0.40160158, -0.37408042, -0.06064677],
                [0.12641826, 0.9246878, 0.01439308],
                [0.45478454, -0.16197163, 0.47862712],
                [0.6539599, -0.21339986, 0.7782586],
                [0.66772085, -0.29024592, 1.0482595],
                [0.58890504, -0.7002888, 1.2171594],
                [0.4728623, -0.85064125, 0.78665596],
                [0.09176779, -0.38400438, -0.5185094],
                [-0.8482741, -0.03784448, -0.98402894],
                [-0.09456029, 0.10074405, -0.8739411],
                [-1.0006421, -0.1893259, 0.72118574]
            ],
            [
                [0.6532587, -0.03446204, -0.22798873],
                [0.00191322, 0.11583798, 0.51861256],
                [-0.10508967, -0.8653865, -0.27910316],
                [-0.6636366, 0.6512917, 0.29458234],
                [0.60504967, -0.2187419, 0.5291746],
                [0.59398663, -0.07645915, -0.9416629],
                [-0.5550622, -0.00184366, -0.91676426],
                [-0.84544504, 0.24613576, -0.74686164],
                [-0.6791576, -0.01744348, 0.19078271],
                [0.4424824, 0.04942419, -0.8943097],
                [0.83585304, 0.24170044, -0.44091615],
                [0.53841674, 0.06517641, 0.5635664]
            ]
        ]), hs)

    def test_backward(self):
        # Feed the forward output back in as the upstream gradient dhs
        # (a convenient, well-shaped gradient) and check dxs against the
        # golden values, shape (N, T, D) = (3, 12, 3).
        dhs = self.time_lstm.forward(self.xs)
        dxs = self.time_lstm.backward(dhs)
        assert_almost_equal(np.array([
            [
                [0.13398582, -0.14692113, -0.14036195],
                [0.07588957, -0.23084015, 0.13396417],
                [0.01643903, -0.13937211, 0.0455663],
                [0.01059901, -0.25425726, -0.11526247],
                [0.41907835, -0.21496703, -0.29487285],
                [0.10036876, -0.02304271, -0.00336768],
                [-0.03362638, -0.25355554, -0.00466897],
                [0.0426382, -0.17100458, 0.0088986],
                [0.15800337, -0.03393397, -0.05257138],
                [-0.38284487, 0.12754066, 0.06260979],
                [-0.11011545, -0.14179969, 0.05320745],
                [0.11040838, -0.26075205, 0.00355089]
            ],
            [
                [-0.00721848, 0.10863397, -0.02290008],
                [0.02448554, 0.42779642, -0.6901682],
                [-0.13468881, -0.00852101, -0.24757166],
                [0.46165678, 0.28746065, -0.08837437],
                [-0.061256, -0.01779122, 0.16727453],
                [0.21336646, -0.27075866, 0.00212137],
                [0.4284778, -0.04371689, -0.09592394],
                [-0.03456894, -0.07942928, 0.07711951],
                [0.06563383, -0.18600363, -0.01489557],
                [0.031198, -0.13482623, 0.01405822],
                [0.05913948, -0.24915774, 0.00096152],
                [0.04424267, -0.10381597, 0.01898143]
            ],
            [
                [0.06298973, -0.22444938, 0.13032934],
                [-0.04759881, 0.0338732, 0.05039414],
                [-0.105564, 0.07625255, 0.03889947],
                [-0.146558, -0.05223962, 0.02283608],
                [0.11453921, 0.03403844, -0.05411878],
                [0.07570819, -0.3297297, -0.05588694],
                [0.06549456, -0.29991058, 0.0459515],
                [0.05651642, -0.13871697, 0.01838844],
                [0.33854613, -0.6947379, -0.37949783],
                [-0.14612219, -0.2425579, 0.01918346],
                [-0.07906907, 0.05820642, -0.09164417],
                [0.19223884, -0.36523458, -0.4367506]
            ]
        ]), dxs)
class PeekyDecoder:
    """Decoder that "peeks" at the encoder state: the fixed encoder hidden
    state h is concatenated onto the input of BOTH the LSTM and the affine
    layer at every time step (so the LSTM sees H + D inputs and the affine
    layer sees H + H inputs).
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_w = (rn(V, D) / 100).astype('f')
        # LSTM consumes [h ; embedding] -> input width H + D.
        lstm_Wx = (rn(H + D, 4 * H) / np.sqrt(H + D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        # Affine consumes [h ; lstm_out] -> input width H + H.
        affine_W = (rn(H + H, V) / np.sqrt(H + H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_w)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        # Aggregate every sub-layer's params/grads into flat lists.
        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        self.cache = None  # stores H for slicing in backward()

    def forward(self, xs, h):
        """Score xs (shape (N, T)) given encoder state h (shape (N, H))."""
        N, T = xs.shape
        N, H = h.shape
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        # Tile h across the T time steps: (N, H) -> (N, T, H).
        hs = np.repeat(h, T, axis=0).reshape(N, T, H)
        out = np.concatenate((hs, out), axis=2)
        out = self.lstm.forward(out)
        out = np.concatenate((hs, out), axis=2)
        score = self.affine.forward(out)
        self.cache = H
        return score

    def backward(self, dscore):
        """Backprop; returns dh, the gradient w.r.t. the encoder state.

        dh accumulates three paths: through the LSTM's initial state and
        through the peeked copies of h fed to the affine and LSTM inputs.
        """
        H = self.cache

        dout = self.affine.backward(dscore)
        # BUG FIX: both slices must come from the SAME pre-slice array.
        # The previous code reassigned `dout = dout[:, :, H:]` first and
        # then took `dhs0 = dout[:, :, :H]` from the already-narrowed
        # array, so dhs0 wrongly received the LSTM-output gradient instead
        # of the peeky-h gradient.
        dhs0, dout = dout[:, :, :H], dout[:, :, H:]
        dout = self.lstm.backward(dout)
        dhs1, dembed = dout[:, :, :H], dout[:, :, H:]
        self.embed.backward(dembed)

        dhs = dhs0 + dhs1
        dh = self.lstm.dh + np.sum(dhs, axis=1)
        return dh

    def generate(self, h, start_id, sample_size):
        """Greedily decode ``sample_size`` ids, peeking at h each step."""
        sampled = []
        char_id = start_id
        self.lstm.set_state(h)
        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array([char_id]).reshape((1, 1))
            out = self.embed.forward(x)
            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)
            char_id = np.argmax(score.flatten())
            # Cast to a plain int for consistency with Decoder.generate.
            sampled.append(int(char_id))
        return sampled