import numpy as np
from typing import List

# TimeEmbedding, TimeLSTM, TimeAffine, TimeDropout, TimeAttention and
# TimeSoftmaxWithLoss are the time-series layer classes used throughout this
# code; they are assumed to be defined (or imported) elsewhere in the codebase.


class Encoder:
    def __init__(self, vocab_size: int, wordvec_size: int, hidden_size: int) -> None:
        embed_W = (np.random.randn(vocab_size, wordvec_size) / 100).astype(float)
        lstm_Wx = (np.random.randn(wordvec_size, 4 * hidden_size)
                   / np.sqrt(wordvec_size)).astype(float)
        lstm_Wh = (np.random.randn(hidden_size, 4 * hidden_size)
                   / np.sqrt(hidden_size)).astype(float)
        lstm_b = np.zeros(4 * hidden_size).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        self.hs = None

    def forward(self, xs: np.ndarray) -> np.ndarray:
        xs = self.embed.forward(xs)
        hs = self.lstm.forward(xs)
        self.hs = hs
        return hs[:, -1, :]   # hand only the last hidden state to the decoder

    def backward(self, dh: np.ndarray) -> None:
        # dh is the gradient w.r.t. the last hidden state; expand it to the
        # full (N, T, H) shape expected by the TimeLSTM.
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh

        dout = self.lstm.backward(dhs)
        dout = self.embed.backward(dout)
        return dout
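# Not part of the original text: a minimal shape-check sketch for the Encoder
# above. The sizes N, T, vocab_size, D and H are illustrative assumptions;
# xs is a batch of integer word IDs.
N, T = 4, 7                                     # batch size, sequence length
vocab_size, D, H = 50, 16, 32                   # vocabulary, embedding, hidden sizes

encoder = Encoder(vocab_size, D, H)
xs = np.random.randint(0, vocab_size, (N, T))   # (N, T) word IDs
h = encoder.forward(xs)                         # last hidden state, shape (N, H)
assert h.shape == (N, H)

dh = np.ones_like(h)                            # gradient arriving from a decoder
encoder.backward(dh)                            # accumulates gradients into encoder.grads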
# Constructor of the regularized LSTM language model (two stacked LSTM layers,
# dropout between layers, and embedding/output weight tying). The enclosing
# class definition and its other methods are not shown here; a sketch of
# matching predict/forward/backward methods follows this fragment.
def __init__(self, vocab_size: int = 10000, wordvec_size: int = 100,
             hidden_size: int = 100, dropout_ratio: float = 1.0) -> None:
    embed_W = (np.random.randn(vocab_size, wordvec_size) / 100).astype(float)
    lstm_Wx1 = (np.random.randn(wordvec_size, 4 * hidden_size)
                / np.sqrt(wordvec_size)).astype(float)
    lstm_Wh1 = (np.random.randn(hidden_size, 4 * hidden_size)
                / np.sqrt(hidden_size)).astype(float)
    lstm_b1 = np.zeros(4 * hidden_size).astype(float)
    # The second LSTM receives the first LSTM's hidden states (size hidden_size),
    # so its input-to-hidden weight must be (hidden_size, 4 * hidden_size).
    lstm_Wx2 = (np.random.randn(hidden_size, 4 * hidden_size)
                / np.sqrt(hidden_size)).astype(float)
    lstm_Wh2 = (np.random.randn(hidden_size, 4 * hidden_size)
                / np.sqrt(hidden_size)).astype(float)
    lstm_b2 = np.zeros(4 * hidden_size).astype(float)
    affine_b = np.zeros(vocab_size).astype(float)

    self.layers = [
        TimeEmbedding(embed_W),
        TimeDropout(dropout_ratio),
        TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
        TimeDropout(dropout_ratio),
        TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
        TimeDropout(dropout_ratio),
        TimeAffine(embed_W.T, affine_b)  # weight tying: reuse the embedding matrix
    ]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.lstm_layers = [self.layers[2], self.layers[4]]
    self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

    self.params = []
    self.grads = []
    for layer in self.layers:
        self.params += layer.params
        self.grads += layer.grads
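# Not part of the original text: a minimal sketch of the predict/forward/
# backward methods that would accompany the constructor above. It assumes
# each Time* layer exposes forward/backward and that TimeDropout switches
# between train and test mode via a train_flg attribute; adjust to whatever
# API the dropout layer in this codebase actually provides.
def predict(self, xs: np.ndarray, train_flg: bool = False) -> np.ndarray:
    for layer in self.drop_layers:
        layer.train_flg = train_flg      # assumed TimeDropout attribute
    for layer in self.layers:
        xs = layer.forward(xs)
    return xs

def forward(self, xs: np.ndarray, ts: np.ndarray, train_flg: bool = True) -> float:
    score = self.predict(xs, train_flg)
    return self.loss_layer.forward(score, ts)

def backward(self, dout: float = 1.0) -> np.ndarray:
    dout = self.loss_layer.backward(dout)
    for layer in reversed(self.layers):
        dout = layer.backward(dout)
    return dout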
class Decoder:
    def __init__(self, vocab_size: int, wordvec_size: int, hidden_size: int) -> None:
        embed_W = (np.random.randn(vocab_size, wordvec_size) / 100).astype(float)
        lstm_Wx = (np.random.randn(wordvec_size, 4 * hidden_size)
                   / np.sqrt(wordvec_size)).astype(float)
        lstm_Wh = (np.random.randn(hidden_size, 4 * hidden_size)
                   / np.sqrt(hidden_size)).astype(float)
        lstm_b = np.zeros(4 * hidden_size).astype(float)
        affine_W = (np.random.randn(hidden_size, vocab_size)
                    / np.sqrt(hidden_size)).astype(float)
        affine_b = np.zeros(vocab_size).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params = []
        self.grads = []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs: np.ndarray, h: np.ndarray) -> np.ndarray:
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        out = self.lstm.forward(out)
        score = self.affine.forward(out)
        return score

    def backward(self, dscore: np.ndarray) -> np.ndarray:
        dout = self.affine.backward(dscore)
        dout = self.lstm.backward(dout)
        dout = self.embed.backward(dout)
        dh = self.lstm.dh
        return dh

    def generate(self, h: np.ndarray, start_id: int, sample_size: int) -> List[int]:
        sampled = []
        sample_id = start_id
        self.lstm.set_state(h)

        for _ in range(sample_size):
            x = np.array(sample_id).reshape((1, 1))
            out = self.embed.forward(x)
            out = self.lstm.forward(out)
            score = self.affine.forward(out)

            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))

        return sampled
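# Not part of the original text: a minimal sketch of how the Encoder and
# Decoder above are typically wired together for training and greedy
# generation. The class name Seq2seq is illustrative; it assumes
# TimeSoftmaxWithLoss from the same codebase and target batches ts whose
# first column is the start symbol.
class Seq2seq:
    def __init__(self, vocab_size: int, wordvec_size: int, hidden_size: int) -> None:
        self.encoder = Encoder(vocab_size, wordvec_size, hidden_size)
        self.decoder = Decoder(vocab_size, wordvec_size, hidden_size)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, xs: np.ndarray, ts: np.ndarray) -> float:
        # Teacher forcing: the decoder input is ts without its last step,
        # and the target is ts shifted by one step.
        decoder_xs, decoder_ts = ts[:, :-1], ts[:, 1:]
        h = self.encoder.forward(xs)
        score = self.decoder.forward(decoder_xs, h)
        loss = self.softmax.forward(score, decoder_ts)
        return loss

    def backward(self, dout: float = 1.0) -> None:
        dout = self.softmax.backward(dout)
        dh = self.decoder.backward(dout)
        self.encoder.backward(dh)

    def generate(self, xs: np.ndarray, start_id: int, sample_size: int) -> List[int]:
        h = self.encoder.forward(xs)
        return self.decoder.generate(h, start_id, sample_size)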
class AttentionDecoder:
    def __init__(self, vocab_size: int, wordvec_size: int, hidden_size: int) -> None:
        embed_W = (np.random.randn(vocab_size, wordvec_size) / 100).astype(float)
        lstm_Wx = (np.random.randn(wordvec_size, 4 * hidden_size)
                   / np.sqrt(wordvec_size)).astype(float)
        lstm_Wh = (np.random.randn(hidden_size, 4 * hidden_size)
                   / np.sqrt(hidden_size)).astype(float)
        lstm_b = np.zeros(4 * hidden_size).astype(float)
        # The affine layer sees the attention context vector concatenated with
        # the decoder hidden state, hence the 2 * hidden_size input width.
        affine_W = (np.random.randn(2 * hidden_size, vocab_size)
                    / np.sqrt(2 * hidden_size)).astype(float)
        affine_b = np.zeros(vocab_size).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        layers = [self.embed, self.lstm, self.attention, self.affine]

        self.params = []
        self.grads = []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs: np.ndarray, enc_hs: np.ndarray) -> np.ndarray:
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        dec_hs = self.lstm.forward(out)
        c = self.attention.forward(enc_hs, dec_hs)
        out = np.concatenate((c, dec_hs), axis=2)
        score = self.affine.forward(out)
        return score

    def backward(self, dscore: np.ndarray) -> np.ndarray:
        dout = self.affine.backward(dscore)
        N, T, H2 = dout.shape
        H = H2 // 2

        dc, ddec_hs0 = dout[:, :, :H], dout[:, :, H:]
        denc_hs, ddec_hs1 = self.attention.backward(dc)
        ddec_hs = ddec_hs0 + ddec_hs1
        dout = self.lstm.backward(ddec_hs)
        denc_hs[:, -1] += self.lstm.dh
        self.embed.backward(dout)
        return denc_hs

    def generate(self, enc_hs: np.ndarray, start_id: int, sample_size: int) -> List[int]:
        sampled = []
        sample_id = start_id
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        for _ in range(sample_size):
            x = np.array([sample_id]).reshape((1, 1))

            out = self.embed.forward(x)
            dec_hs = self.lstm.forward(out)
            c = self.attention.forward(enc_hs, dec_hs)
            out = np.concatenate((c, dec_hs), axis=2)
            score = self.affine.forward(out)

            sample_id = int(np.argmax(score.flatten()))
            sampled.append(sample_id)

        return sampled
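# Not part of the original text: the AttentionDecoder consumes the full
# sequence of encoder hidden states enc_hs with shape (N, T, H), and its
# backward pass returns denc_hs of the same shape, so the plain Encoder
# above (which returns only the last hidden state) is not a direct fit.
# A minimal sketch of a matching encoder variant, subclassing the Encoder
# defined earlier; the name AttentionEncoder is illustrative.
class AttentionEncoder(Encoder):
    def forward(self, xs: np.ndarray) -> np.ndarray:
        xs = self.embed.forward(xs)
        hs = self.lstm.forward(xs)
        return hs                      # all hidden states, shape (N, T, H)

    def backward(self, dhs: np.ndarray) -> None:
        dout = self.lstm.backward(dhs)
        dout = self.embed.backward(dout)
        return dout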
class PeekyDecoder:
    def __init__(self, vocab_size: int, wordvec_size: int, hidden_size: int) -> None:
        embed_W = (np.random.randn(vocab_size, wordvec_size) / 100).astype(float)
        # The LSTM and the affine layer both receive the encoder's final hidden
        # state concatenated onto their normal input ("peeking"), hence the
        # widened input dimensions below.
        lstm_Wx = (np.random.randn(wordvec_size + hidden_size, 4 * hidden_size)
                   / np.sqrt(wordvec_size)).astype(float)
        lstm_Wh = (np.random.randn(hidden_size, 4 * hidden_size)
                   / np.sqrt(hidden_size)).astype(float)
        lstm_b = np.zeros(4 * hidden_size).astype(float)
        affine_W = (np.random.randn(hidden_size + hidden_size, vocab_size)
                    / np.sqrt(hidden_size)).astype(float)
        affine_b = np.zeros(vocab_size).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params = []
        self.grads = []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        self.cache = None

    def forward(self, xs: np.ndarray, h: np.ndarray) -> np.ndarray:
        N, T = xs.shape
        N, H = h.shape

        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        hs = np.repeat(h, T, axis=0).reshape(N, T, H)
        out = np.concatenate((hs, out), axis=2)

        out = self.lstm.forward(out)
        out = np.concatenate((hs, out), axis=2)

        score = self.affine.forward(out)
        self.cache = H
        return score

    def backward(self, dscore: np.ndarray) -> np.ndarray:
        H = self.cache

        dout = self.affine.backward(dscore)
        dout, dhs0 = dout[:, :, H:], dout[:, :, :H]
        dout = self.lstm.backward(dout)
        dembed, dhs1 = dout[:, :, H:], dout[:, :, :H]
        self.embed.backward(dembed)

        dhs = dhs0 + dhs1
        dh = self.lstm.dh + np.sum(dhs, axis=1)
        return dh

    def generate(self, h: np.ndarray, start_id: int, sample_size: int) -> List[int]:
        sampled = []
        sample_id = start_id
        self.lstm.set_state(h)

        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array([sample_id]).reshape((1, 1))
            out = self.embed.forward(x)

            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)

            sample_id = int(np.argmax(score.flatten()))
            sampled.append(sample_id)

        return sampled
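# Not part of the original text: the PeekyDecoder keeps the Decoder interface
# (forward(xs, h), backward(dscore), generate(h, start_id, sample_size)), so
# the Seq2seq wrapper sketched after the Decoder can be reused by swapping in
# the decoder. The class name PeekySeq2seq is illustrative and assumes that
# wrapper is defined as sketched above.
class PeekySeq2seq(Seq2seq):
    def __init__(self, vocab_size: int, wordvec_size: int, hidden_size: int) -> None:
        self.encoder = Encoder(vocab_size, wordvec_size, hidden_size)
        self.decoder = PeekyDecoder(vocab_size, wordvec_size, hidden_size)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads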