def _forward(self, x_t, h_tm1, t=None):
    # x_t: (batsize, indim), h_tm1: (batsize, outdim)
    if self.dropout_in:
        x_t = self.dropout_in(x_t)
    if self.dropout_rec:
        h_tm1 = self.dropout_rec(h_tm1)
    if self.shared_dropout_rec:
        if self.shared_dropout_reccer is None:
            ones = q.var(torch.ones(h_tm1.size())).cuda(crit=h_tm1).v
            self.shared_dropout_reccer = [self.shared_dropout_rec(ones)]
        h_tm1 = torch.mul(h_tm1, self.shared_dropout_reccer[0])
    h_t = self.apply_nncell(x_t, h_tm1, t=t)
    if self.zoneout:
        if self.zoner is None:
            self.zoner = q.var(torch.ones(h_t.size())).cuda(crit=h_t).v
        zoner = self.zoneout(self.zoner)
        h_t = torch.mul(1 - zoner, h_tm1) + torch.mul(zoner, h_t)
    if self.shared_zoneout:
        if self.shared_zoneouter is None:
            ones = q.var(torch.ones(h_t.size())).cuda(crit=h_t).v
            self.shared_zoneouter = [self.shared_zoneout(ones)]
        h_t = torch.mul(1 - self.shared_zoneouter[0], h_tm1) \
            + torch.mul(self.shared_zoneouter[0], h_t)
    return h_t, h_t
def _forward(self, x_t, c_tm1, t=None):
    if self.dropout_in:
        x_t = self.dropout_in(x_t)
    if self.dropout_rec:
        c_tm1 = self.dropout_rec(c_tm1)
    if self.shared_dropout_rec:
        if self.shared_dropout_reccer is None:
            ones = q.var(torch.ones(c_tm1.size())).cuda(crit=c_tm1).v
            self.shared_dropout_reccer = [self.shared_dropout_rec(ones)]
        c_tm1 = torch.mul(c_tm1, self.shared_dropout_reccer[0])
    y_t, c_t = self.apply_nncell(x_t, c_tm1, t=t)
    if self.zoneout:
        if self.zoner is None:
            self.zoner = q.var(torch.ones(c_t.size())).cuda(crit=c_t).v
        zoner = self.zoneout(self.zoner)
        c_t = torch.mul(1 - zoner, c_tm1) + torch.mul(zoner, c_t)
    if self.shared_zoneout:
        if self.shared_zoneouter is None:
            ones = q.var(torch.ones(c_t.size())).cuda(crit=c_t).v
            self.shared_zoneouter = [self.shared_zoneout(ones)]
        c_t = torch.mul(1 - self.shared_zoneouter[0], c_tm1) \
            + torch.mul(self.shared_zoneouter[0], c_t)
    return y_t, c_t
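# Conceptual standalone sketch (not part of the cells above) of the zoneout
# mixing rule they implement: per element, keep the freshly computed state with
# probability 1 - p, otherwise carry the old state over. Names (p, h_tm1, h_t)
# are illustrative. Note the cells above derive their mask by applying a
# dropout layer to a tensor of ones, which additionally rescales kept entries.
import torch

def zoneout_update(h_tm1, h_t, p=0.1, training=True):
    # zoner == 1 -> take the new state, zoner == 0 -> keep the old state
    if training:
        zoner = torch.bernoulli(torch.full_like(h_t, 1 - p))
    else:
        zoner = torch.full_like(h_t, 1 - p)    # use the expectation at test time
    return (1 - zoner) * h_tm1 + zoner * h_t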
def test_equivalent_to_qelos_masked(self):
    m = ScaledDotProductAttention(10, attn_dropout=0)
    refm = q.Attention().dot_gen().scale(10 ** 0.5)
    Q = q.var(np.random.random((5, 1, 10)).astype("float32")).v
    K = q.var(np.random.random((5, 6, 10)).astype("float32")).v
    M = q.var(np.asarray([
        [1, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1],
    ])).v
    V = q.var(np.random.random((5, 6, 11)).astype("float32")).v
    ctx, atn = m(Q, K, V, attn_mask=(-1 * M + 1).byte().data.unsqueeze(1))
    refatn = refm.attgen(K, Q, mask=M)
    refctx = refm.attcon(V, refatn)
    print(atn)
    print(refatn)
    self.assertTrue(np.allclose(atn.data.numpy(), refatn.data.numpy()))
    self.assertTrue(np.allclose(ctx.data.numpy(), refctx.data.numpy()))
def test_decoder_shape(self):
    wdic = "<MASK> a b c d e f g h i j k l m n o p".split()
    wdic = dict(zip(wdic, range(len(wdic))))
    emb = q.WordEmb(10, worddic=wdic)
    m = q.AYNDecoder(emb, n_max_seq=7, n_layers=3, n_head=2, d_k=4, d_v=6,
                     d_pos_vec=6, d_model=16, d_inner_hid=20, dropout=0)
    src_seq = q.var(np.random.randint(1, max(wdic.values()), (5, 7))).v
    src_seq_mask_starts = np.random.randint(1, 7, (5,), dtype="int64")
    src_seq_mask = np.ones_like(src_seq.data.numpy())
    for i in range(5):
        src_seq_mask[i, :src_seq_mask_starts[i]] = 0
    src_seq_mask = q.var(src_seq_mask).v
    src_seq.masked_fill_(src_seq_mask.byte(), 0)
    src_pos = q.var(np.arange(0, 7, dtype="int64")).v
    src_pos = src_pos.unsqueeze(0).repeat(5, 1)
    ctx = q.var(np.random.random((5, 8, 16)).astype("float32")).v
    ctx_seq_mask_starts = np.random.randint(1, 8, (5,), dtype="int64")
    ctx_seq_mask = np.ones((5, 8))
    for i in range(5):
        ctx_seq_mask[i, :ctx_seq_mask_starts[i]] = 0
    ctx_seq_mask = -1 * q.var(ctx_seq_mask).v.byte() + 1
    out = m(src_seq, ctx, ctx_seq_mask)
    print(out)
    self.assertEqual(out.size(), (5, 7, 16))
    loss = out.sum()
    loss.backward()
def test_decoder_loss():
    l = DecoderLoss()
    logprobs = -np.random.random((3, 5, 4))
    gold = np.asarray([[1, 2, 3, 0, 0],
                       [1, 1, 0, 0, 0],
                       [3, 3, 3, 3, 3]])
    logprobs = q.var(torch.FloatTensor(logprobs)).v
    gold = q.var(torch.LongTensor(gold)).v
    loss = l(logprobs, gold)
    print(loss)
def test_multigrad(self):
    class Module(nn.Module):
        def __init__(self):
            super(Module, self).__init__()
            self.one = nn.Linear(3, 3)
            self.two = nn.Linear(3, 3)

        def forward(self, x):
            return self.two(self.one(x))

    net = Module()
    inp1 = q.var(torch.randn(3)).v
    inp2 = q.var(torch.randn(3)).v

    lossa = net(inp1).sum()
    lossa.backward()
    agrads = []
    for p in net.parameters():
        print(p.grad)
        agrads.append(p.grad.data.numpy() + 0)  # + 0 forces a copy
    net.zero_grad()

    lossb = net(inp2).sum()
    lossb.backward()
    bgrads = []
    for p in net.parameters():
        print(p.grad)
        bgrads.append(p.grad.data.numpy() + 0)
    net.zero_grad()

    loss = net(inp2).sum() + net(inp1).sum()
    loss.backward()
    grads = []
    for p in net.parameters():
        print(p.grad)
        grads.append(p.grad.data.numpy() + 0)
    net.zero_grad()

    lossa = net(inp1).sum()
    lossa.backward()
    lossb = net(inp2).sum()
    lossb.backward()
    sgrads = []
    for p in net.parameters():
        print(p.grad)
        sgrads.append(p.grad.data.numpy() + 0)

    # gradients of a summed loss equal the sum of the separate gradients,
    # and sequential .backward() calls accumulate to the same result
    for a, b, t, s in zip(agrads, bgrads, grads, sgrads):
        self.assertTrue(np.allclose(a + b, t))
        self.assertTrue(np.allclose(t, s))
def test_sameasbase(self):
    words = "inception earlgrey <MASK>"
    pred, mask = self.emb(
        q.var(torch.LongTensor([self.emb * x for x in words.split()])).v)
    pred = pred.data.numpy()
    gpred, msk = self.baseemb(
        q.var(torch.LongTensor([self.baseemb * x for x in words.split()])).v)
    gpred = gpred.data.numpy()
    self.assertTrue(np.allclose(pred, gpred))
def test_notasbase(self):
    words = "the his monkey key"
    pred, mask = self.emb(
        q.var(torch.LongTensor([self.emb * x for x in words.split()])).v)
    pred = pred.data.numpy()
    gpred, msk = self.baseemb(
        q.var(torch.LongTensor([self.baseemb * x for x in words.split()])).v)
    gpred = gpred.data.numpy()
    self.assertFalse(np.allclose(pred, gpred))
def test_sameasover(self):
    words = "the his monkey key"
    pred, msk = self.emb(
        q.var(torch.LongTensor([self.emb * x for x in words.split()])).v)
    pred = pred.data.numpy()
    gpred, _ = self.overemb(
        q.var(torch.LongTensor([self.overemb * x for x in words.split()])).v)
    gpred = gpred.data.numpy()
    self.assertTrue(np.allclose(pred, gpred))
def test_notasover(self):
    words = "inception earlgrey"
    pred, mask = self.emb(
        q.var(torch.LongTensor([self.emb * x for x in words.split()])).v)
    pred = pred.data.numpy()
    gpred, _ = self.overemb(
        q.var(torch.LongTensor([self.baseemb * x for x in words.split()])).v)
    gpred = gpred.data.numpy()
    self.assertFalse(np.allclose(pred, gpred))
def forward(self, src_seq, src_pos=None):
    # word embedding lookup
    enc_slf_attn_mask = None
    enc_input = self.src_word_emb(src_seq)
    if isinstance(enc_input, tuple) and len(enc_input) == 2:
        enc_input, enc_slf_attn_mask = enc_input
    if src_pos is None:
        src_pos = torch.arange(0, src_seq.size(1)) \
            .unsqueeze(0).repeat(src_seq.size(0), 1).long()
        src_pos = q.var(src_pos).v
    # position encoding addition (the paper adds position encodings to the embeddings)
    pos_input = self.position_enc(src_pos)
    if not self.cat_pos_enc:
        enc_input = enc_input + pos_input
    else:
        enc_input = torch.cat([enc_input, pos_input], 2)
    enc_outputs, enc_slf_attns = [], []
    enc_output = enc_input
    # enc_slf_attn_mask = get_attn_padding_mask(src_seq, src_seq)
    for enc_layer in self.layer_stack:
        enc_output, enc_slf_attn = enc_layer(
            enc_output, slf_attn_mask=enc_slf_attn_mask)
        enc_outputs += [enc_output]
        enc_slf_attns += [enc_slf_attn]
    return enc_output  # enc_outputs, enc_slf_attns
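# Hedged sketch of the sinusoidal position table from "Attention Is All You
# Need" that a position_enc embedding like the one above is typically
# initialized with; n_pos and d_model are illustrative parameter names.
import numpy as np

def sinusoid_table(n_pos, d_model):
    pos = np.arange(n_pos)[:, None]      # (n_pos, 1)
    dim = np.arange(d_model)[None, :]    # (1, d_model)
    angle = pos / np.power(10000., 2 * (dim // 2) / d_model)
    table = np.zeros((n_pos, d_model))
    table[:, 0::2] = np.sin(angle[:, 0::2])    # even dimensions: sine
    table[:, 1::2] = np.cos(angle[:, 1::2])    # odd dimensions: cosine
    return table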
def number2charseq(x):
    dic = {0: "_zero ", 1: "_one ", 2: "_two ", 3: "_three ", 4: "_four ",
           5: "_five ", 6: "_six ", 7: "_seven ", 8: "_eight ", 9: "_nine "}
    acc = []
    tocuda = False
    if x.is_cuda:
        x = x.cpu()
        tocuda = True
    for i in range(x.size(0)):
        word = x[i].data.numpy()[0]
        word = dic[word]
        # map characters to their ordinals, spaces to 0 (padding)
        word = [ord(c) if c != " " else 0 for c in word]
        acc.append(word)
    acc = np.asarray(acc)
    acc = q.var(torch.LongTensor(acc)).cuda(crit=tocuda).v
    return acc
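# Hedged usage sketch for number2charseq (hypothetical values; assumes the
# q.var wrapping used throughout this code): a column of digit ids becomes a
# matrix of character ordinals encoding "_three " and "_seven ", 0 = padding.
digits = q.var(torch.LongTensor([[3], [7]])).v
chars = number2charseq(digits)    # (2, 7) LongTensor of ordinals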
def test_model(encoder, decoder, m, questions, queries, vnt):
    questions = q.var(questions[:10]).v
    queries = q.var(queries[:10]).v
    vnt = q.var(vnt[:10]).v

    # try encoder
    ctx, ctxmask, finalctx = encoder(questions)
    assert ctx.size(0) == finalctx.size(0)
    assert ctx.size(1) == ctxmask.float().size(1)
    assert ctx.size(2) == finalctx.size(1)
    maskedctx = ctx * ctxmask.unsqueeze(2).float()
    assert (ctx.norm(2) == maskedctx.norm(2)).data.numpy()[0]
    loss = finalctx.sum()
    loss.backward()
    encoder.zero_grad()
    ctx, ctxmask, finalctx = encoder(questions)
    loss = finalctx.sum()
    loss.backward()
    print("dry run of encoder didn't throw errors")

    # try decoder cell
    for t in range(3):
        decoder.block.core.reset_state()    # ESSENTIAL!!! otherwise double .backward() error
        decoder.set_init_states(finalctx.detach())
        decoder.block.zero_grad()
        outmaskt = vnt[:, t]
        y_t = decoder.block(queries[:, t], ctx.detach(),
                            ctxmask=ctxmask.detach(), t=t, outmask_t=outmaskt)
        loss = torch.max(y_t)
        print(loss)
        loss.backward()
        print("backward done")
    print("dry run of decoder cell didn't throw errors")
    q.embed()
def test_equivalent_to_qelos(self):
    m = ScaledDotProductAttention(10, attn_dropout=0)
    refm = q.Attention().dot_gen().scale(10 ** 0.5)
    Q = q.var(np.random.random((5, 4, 10)).astype("float32")).v
    K = q.var(np.random.random((5, 6, 10)).astype("float32")).v
    V = q.var(np.random.random((5, 6, 11)).astype("float32")).v
    ctx, atn = m(Q, K, V)
    refatn = refm.attgen(K, Q)
    refctx = refm.attcon(V, refatn)
    print(atn)
    print(refatn)
    self.assertTrue(np.allclose(atn.data.numpy(), refatn.data.numpy()))
    self.assertTrue(np.allclose(ctx.data.numpy(), refctx.data.numpy()))
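# Minimal plain-torch sketch of what the ScaledDotProductAttention modules
# compared above compute: softmax(Q K^T / sqrt(d_k)) V. Illustrative only;
# no masking and no dropout.
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(Q, K, V, d_k):
    # Q: (b, lq, d_k), K: (b, lk, d_k), V: (b, lk, d_v)
    scores = torch.bmm(Q, K.transpose(1, 2)) / (d_k ** 0.5)  # (b, lq, lk)
    attn = F.softmax(scores, dim=-1)
    return torch.bmm(attn, V), attn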
def test_bypass_stack(self):
    data = q.var(np.random.random((3, 5)).astype(dtype="float32")).v
    stack = q.Stack(
        q.Forward(5, 5),
        q.argsave.spec(a=0),
        q.Forward(5, 5),
        q.Forward(5, 5),
        q.argmap.spec(0, ["a"]),
        q.Lambda(lambda x, y: torch.cat([x, y], 1)),
        q.Forward(10, 7))
    out = stack(data)
    print(out)
    self.assertEqual(out.size(), (3, 7))
def getvector(self, word):
    try:
        if isstring(word):
            word = self.D[word]
        wordid = q.var(torch.LongTensor([word])).v
        ret, _ = self(wordid)
        return ret.squeeze(0).data.numpy()
    except Exception:
        return None
def forward(self, tgt_seq, src_enc, src_mask=None, tgt_pos=None):
    # word embedding lookup
    dec_input, dec_mask = self.tgt_word_emb(tgt_seq)
    if dec_mask is not None:
        dec_mask = dec_mask.unsqueeze(1).repeat(1, dec_input.size(1), 1)
    else:
        dec_mask = q.var(
            np.ones((tgt_seq.size(0), tgt_seq.size(1), tgt_seq.size(1)))).v
    if tgt_pos is None:
        tgt_pos = torch.arange(0, tgt_seq.size(1)) \
            .unsqueeze(0).repeat(tgt_seq.size(0), 1).long()
        tgt_pos = q.var(tgt_pos).v
    # position encoding addition
    pos_input = self.position_enc(tgt_pos)
    if not self.cat_pos_enc:
        dec_input = dec_input + pos_input
    else:
        dec_input = torch.cat([dec_input, pos_input], 2)
    dec_outputs, dec_slf_attns, dec_enc_attns = [], [], []
    # decode: combine the padding mask with the inverted subsequent-position mask
    # dec_slf_attn_pad_mask = get_attn_padding_mask(tgt_seq, tgt_seq)
    dec_slf_attn_sub_mask = -1 * get_attn_subsequent_mask(tgt_seq) + 1
    dec_slf_attn_mask = q.var(dec_mask.data.byte() * dec_slf_attn_sub_mask).v
    dec_output = dec_input
    for dec_layer in self.layer_stack:
        dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
            dec_output, src_enc,
            slf_attn_mask=dec_slf_attn_mask,
            dec_enc_attn_mask=src_mask)
        dec_outputs += [dec_output]
        dec_slf_attns += [dec_slf_attn]
        dec_enc_attns += [dec_enc_attn]
    return dec_output  # dec_outputs, dec_slf_attns, dec_enc_attns
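# Hedged sketch of what a helper like get_attn_subsequent_mask presumably
# computes for the decoder above: a (batsize, len, len) byte mask that is 1 at
# strictly-future positions a query may NOT attend to; inverting it as above
# (-1 * m + 1) yields the "allowed" mask that gets combined with padding.
import numpy as np

def subsequent_mask(batsize, seqlen):
    m = np.triu(np.ones((seqlen, seqlen), dtype="uint8"), k=1)  # 1 above the diagonal
    return np.tile(m[None, :, :], (batsize, 1, 1))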
def _reverse_seq(x, mask=None):
    if mask is None:
        cum = q.var(torch.arange(0, x.size(1))
                    .unsqueeze(0).repeat(x.size(0), 1)).cuda(x).v
    else:
        cum = torch.cumsum(mask, 1)
    idx = torch.max(cum, 1, keepdim=True)[0] - cum
    idx = idx.long().unsqueeze(2).repeat(1, 1, x.size(2))
    retx = torch.gather(x, 1, idx)
    return retx
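# Standalone plain-torch demo of the cumsum/gather reversal trick above: only
# the masked-in prefix of each row is reversed; positions past the mask end up
# pointing at index 0 and are expected to be re-masked by the caller.
import torch

x = torch.arange(1., 7.).view(1, 3, 2)      # one sequence of length 3
mask = torch.tensor([[1., 1., 0.]])         # last step is padding
cum = torch.cumsum(mask, 1)                 # [[1., 2., 2.]]
idx = (torch.max(cum, 1, keepdim=True)[0] - cum).long()  # [[1, 0, 0]]
idx = idx.unsqueeze(2).repeat(1, 1, x.size(2))
print(torch.gather(x, 1, idx))              # steps 1 and 2 swapped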
def _forward(self, x_t, h_tm1, t=None):
    if self.dropout_in:
        x_t = self.dropout_in(x_t)
    if self.dropout_rec:
        h_tm1 = self.dropout_rec(h_tm1)
    h_t = self.main_gate(x_t, h_tm1)
    if self.zoneout:
        zoner = q.var(torch.ones(h_t.size())).cuda(crit=h_t).v
        zoner = self.zoneout(zoner)
        h_t = torch.mul(1 - zoner, h_tm1) + torch.mul(zoner, h_t)
    return h_t, h_t
def test_equivalent_to_qelos(self):
    m = OriginalMultiHeadAttention(4, 16, 10, 12, 0)
    mym = q.MultiHeadAttention(4, 16, 10, 12, 0)
    mym.w_qs.data = m.w_qs.permute(1, 0, 2).contiguous().view(16, -1).data
    mym.w_ks.data = m.w_ks.permute(1, 0, 2).contiguous().view(16, -1).data
    mym.w_vs.data = m.w_vs.permute(1, 0, 2).contiguous().view(16, -1).data
    mym.proj, mym.layer_norm = m.proj, m.layer_norm
    Q = q.var(np.random.random((5, 2, 16)).astype("float32")).v
    K = q.var(np.random.random((5, 6, 16)).astype("float32")).v
    M = q.var(np.asarray([
        [1, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1],
    ])).v
    V = q.var(np.random.random((5, 6, 16)).astype("float32")).v
    outs, atts = m(Q, K, V,
                   (-1 * M + 1).byte().data.unsqueeze(1).repeat(1, 2, 1))
    self.assertEqual(outs.size(), (5, 2, 16))
    self.assertEqual(atts.size(), (20, 2, 6))
    myouts, myatts = mym(Q, K, V, M)
    self.assertTrue(np.allclose(myatts.data.numpy(), atts.data.numpy()))
    print(myouts[0])
    print(outs[0])
    self.assertTrue(
        np.allclose(myouts.data.numpy(), outs.data.numpy(), atol=1e-7))
def forward(self, x, mask=None):
    # x: (batsize, indim), mask: (batsize, outdim)
    if mask is not None:
        mask = mask.long()
        # select data and compute weight rows only for outputs used anywhere in the batch
        msk = mask.sum(0)           # --> (outdim,)
        msk = (msk > 0).long()
        compute_ids = msk.data.nonzero()
        if len(compute_ids.size()) > 0:     # not all zeros
            compute_ids = compute_ids.squeeze(1)
            data_select = self.data[compute_ids]
            comp_weight = self.computer(data_select)    # (num_data_select, indim)
            comp_weight = comp_weight.contiguous()
            indim = comp_weight.size(1)
            if self.base_weight is None or self.base_weight.size(1) != indim:
                self.base_weight = q.var(torch.zeros(1, indim)).cuda(x).v
            weight = torch.cat([self.base_weight, comp_weight], 0)
            # scatter computed rows back to their output positions;
            # masked-out outputs point at the zero base row
            index_transform = (torch.cumsum(msk, 0) * msk).long()
            weight = weight.index_select(0, index_transform)
        else:
            data_select = self.data[0:1]
            comp_weight = self.computer(data_select)    # (num_data_select, indim)
            comp_weight = comp_weight.contiguous()
            indim = comp_weight.size(1)
            weight = q.var(torch.zeros(mask.size(1), indim)).cuda(x).v
    else:
        weight = self.computer(self.data)
        weight = weight.contiguous()
    out = torch.mm(x, weight.t())
    if self.bias:
        # mask the bias as well when a mask is given
        bias = self.bias if mask is None else self.bias * mask
        out += bias
    if mask is not None:
        out = out * mask.float()
    return out  # , mask ?
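# Standalone demo of the cumsum scatter trick above: weight rows are computed
# only for used outputs and routed back into place via index_select, with row 0
# reserved as the zero row for masked-out outputs. Values are illustrative.
import torch

msk = torch.tensor([0, 1, 0, 1, 1])             # which of 5 outputs are used
computed = torch.tensor([[1.], [2.], [3.]])     # rows for the 3 used outputs
weight = torch.cat([torch.zeros(1, 1), computed], 0)
index_transform = torch.cumsum(msk, 0) * msk    # [0, 1, 0, 2, 3]
print(weight.index_select(0, index_transform))  # [[0.], [1.], [0.], [2.], [3.]]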
def forward(self, x):
    # set initial states
    h0 = q.var(torch.zeros(self.num_layers, x.size(0),
                           self.hidden_size)).cuda(crit=x).v
    # forward propagate RNN
    if self.mode == "qrnn":
        out = self.rnn(x)
    else:
        out, _ = self.rnn(x, h0)
    # return final state
    out = out[:, -1, :]
    return out
def forward(self, x):
    # set initial states
    h0 = q.var(torch.zeros(self.num_layers, x.size(0),
                           self.hidden_size)).cuda(crit=x).v
    # forward propagate RNN ("mode" comes from the enclosing scope, see main() below)
    if mode == "qrnn" or mode == "stack":
        out = self.rnn(x)
    else:
        out, _ = self.rnn(x, h0)
    # decode hidden state of last time step
    out = nn.LogSoftmax()(self.fc(out[:, -1, :]))
    return out
def forward(self, x, mask=None):
    self.reset_state()
    h_0 = self._get_init_states(x.size(0))
    if self._reverse:
        x = _reverse_seq(x, mask=mask)
        if mask is not None:
            x = x * mask.unsqueeze(2).float()
    y, s_t = self.nnlayer(x, h_0)
    self.set_states(s_t)    # DON'T TRUST FINAL STATES WHEN MASK IS NOT NONE
    if mask is None:
        y_t = y[:, -1, :]
    else:
        last = (torch.sum(mask, 1) - 1).long()  # (batsize,): sequence lengths - 1
        rng = q.var(torch.arange(0, x.size(0)).long()).cuda(x).v  # (batsize,)
        y_t = y[rng.data, last.data, :]
    ret = tuple()
    if self._return_final:
        ret += (y_t,)
    if self._return_all:
        if self._reverse:
            y = _reverse_seq(y, mask=mask)
            if mask is not None:
                y = y * mask.unsqueeze(2).float()
        ret += (y,)
    if self._return_mask:
        ret += (mask,)
    if len(ret) == 1:
        return ret[0]
    elif len(ret) == 0:
        print("no output specified")
        return
    else:
        return ret
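# Standalone demo of the last-valid-step gather used above: for every sequence
# in the batch, pick the output at position length - 1. Values are illustrative.
import torch

y = torch.arange(12.).view(2, 3, 2)                 # (batsize=2, seqlen=3, dim=2)
mask = torch.tensor([[1., 1., 0.], [1., 1., 1.]])
last = (mask.sum(1) - 1).long()                     # [1, 2]
rng = torch.arange(0, y.size(0)).long()             # [0, 1]
print(y[rng, last, :])                              # rows y[0, 1] and y[1, 2]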
def get_init_states(self, arg):
    """
    :param arg: batch size (will generate and return compatible init states)
                or None (will return what is stored)
    :return: initial states: states that have previously been set, or newly
             generated zero states based on the given batch size
    """
    if arg is None:
        return self._init_states
    assert q.isnumber(arg)      # arg is a batch size
    if self._init_states is None:   # no states have been set using .set_init_states()
        _init_states = [None] * self.numstates
    else:
        _init_states = self._init_states
    # fill up with zeros and expand where necessary
    assert self.numstates == len(_init_states)
    for i in range(len(_init_states)):
        statespec = self.state_spec[i]
        initstate = _init_states[i]
        if initstate is None:
            state_0 = q.var(torch.zeros(statespec)).cuda(
                next(self.parameters()).is_cuda).v
            _init_states[i] = state_0
            initstate = state_0
        if initstate.dim() == 2:    # init state set differently for different batches
            if arg > initstate.size(0):
                raise Exception("asked for a bigger batch size than init states")
            elif arg == initstate.size(0):
                pass
            else:
                if arg == 0:
                    return initstate[0]
                else:
                    return initstate[:arg]
        elif initstate.dim() == 1 and arg > 0:
            _init_states[i] = initstate.unsqueeze(0).expand(
                arg, initstate.size(-1))
        else:
            raise Exception("initial states set to wrong dimensional values. "
                            "Must be 1D (will be repeated) or 2D.")
    return _init_states
def forward(self, x_t, t=None, mask_t=None):
    batsize = x_t.size(0)
    states = self.get_states(batsize)
    ret = self._forward(x_t, *states, t=t)
    y_t = ret[0]
    newstates = ret[1:]
    if mask_t is not None:
        mask_t = mask_t.float()
        # carry old states (and the previous output) over where the mask is 0
        st = []
        for newstate, oldstate in zip(newstates, states):
            newstate = newstate * mask_t + oldstate * (1 - mask_t)
            st.append(newstate)
        if self._y_tm1 is None:     # moved from RNNLayer
            self._y_tm1 = q.var(torch.zeros(y_t.size())).cuda(crit=y_t).v
        y_t = y_t * mask_t + self._y_tm1 * (1 - mask_t)
        self._y_tm1 = y_t
    else:
        st = newstates
    self.set_states(*st)
    return y_t
def test_dynamic_bypass_stack(self):
    data = q.var(np.random.random((3, 5)).astype(dtype="float32")).v
    stack = q.Stack()
    nlayers = 5
    for i in range(nlayers):
        stack.add(q.argsave.spec(a=0),
                  q.Forward(5, 5),
                  q.Forward(5, 5),
                  q.argmap.spec(0, ["a"]),
                  q.Lambda(lambda x, y: x + y))
    out = stack(data)
    print(out)
    self.assertEqual(out.size(), (3, 5))
    out.sum().backward()
    forwards = []
    for layer in stack.layers:
        if isinstance(layer, q.Forward):
            self.assertTrue(layer.lin.weight.grad is not None)
            self.assertTrue(layer.lin.bias.grad is not None)
            print(layer.lin.weight.grad.norm(2))
            self.assertTrue(layer.lin.weight.grad.norm(2).data[0] > 0)
            self.assertTrue(layer.lin.bias.grad.norm(2).data[0] > 0)
            forwards.append(layer)
    self.assertEqual(len(forwards), nlayers * 2)
def main(lr=0.5,
         epochs=30,
         batsize=32,
         embdim=90,
         encdim=90,
         mode="cell",       # "fast" or "cell"
         wreg=0.0001,
         cuda=False,
         gpu=1,
         ):
    if cuda:
        torch.cuda.set_device(gpu)
    usecuda = cuda
    vocsize = 50

    # create datasets tensor
    tt.tick("loading data")
    sequences = np.random.randint(0, vocsize, (batsize * 100, 16))
    # wrap in dataset
    dataset = q.TensorDataset(sequences[:batsize * 80], sequences[:batsize * 80])
    validdataset = q.TensorDataset(sequences[batsize * 80:], sequences[batsize * 80:])
    dataloader = DataLoader(dataset=dataset, batch_size=batsize, shuffle=True)
    validdataloader = DataLoader(dataset=validdataset, batch_size=batsize, shuffle=False)
    tt.tock("data loaded")

    # model
    tt.tick("building model")
    embedder = nn.Embedding(vocsize, embdim)
    encoder = q.RecurrentStack(
        embedder,
        q.SRUCell(encdim).to_layer(),
        q.SRUCell(encdim).to_layer(),
        q.SRUCell(encdim).to_layer(),
        q.SRUCell(encdim).to_layer().return_final(),
    )
    if mode == "fast":
        decoder = q.AttentionDecoder(
            attention=q.Attention().forward_gen(encdim, encdim, encdim),
            embedder=embedder,
            core=q.RecurrentStack(q.GRULayer(embdim, encdim)),
            smo=q.Stack(nn.Linear(encdim + encdim, vocsize), q.LogSoftmax()),
            return_att=True)
    else:
        decoder = q.AttentionDecoderCell(
            attention=q.Attention().forward_gen(encdim, encdim + embdim, encdim),
            embedder=embedder,
            core=q.RecStack(
                q.GRUCell(embdim + encdim, encdim,
                          use_cudnn_cell=False,
                          rec_batch_norm=None,
                          activation="crelu")),
            smo=q.Stack(nn.Linear(encdim + encdim, vocsize), q.LogSoftmax()),
            att_after_update=False,
            ctx_to_decinp=True,
            decinp_to_att=True,
            return_att=True,
        ).to_decoder()
    m = EncDec(encoder, decoder, mode=mode)

    losses = q.lossarray(q.SeqNLLLoss(ignore_index=None),
                         q.SeqAccuracy(ignore_index=None),
                         q.SeqElemAccuracy(ignore_index=None))
    validlosses = q.lossarray(q.SeqNLLLoss(ignore_index=None),
                              q.SeqAccuracy(ignore_index=None),
                              q.SeqElemAccuracy(ignore_index=None))
    optimizer = torch.optim.Adadelta(m.parameters(), lr=lr, weight_decay=wreg)
    tt.tock("model built")

    q.train(m).cuda(usecuda).train_on(dataloader, losses)\
        .set_batch_transformer(lambda x, y: (x, y[:, :-1], y[:, 1:]))\
        .valid_on(validdataloader, validlosses)\
        .optimizer(optimizer).clip_grad_norm(2.)\
        .train(epochs)

    testdat = np.random.randint(0, vocsize, (batsize, 20))
    testdata = q.var(torch.from_numpy(testdat)).cuda(usecuda).v
    testdata_out = q.var(torch.from_numpy(testdat)).cuda(usecuda).v
    if mode == "cell" and False:
        inv_idx = torch.arange(testdata.size(1) - 1, -1, -1).long()
        testdata = testdata.index_select(1, inv_idx)
    probs, attw = m(testdata, testdata_out[:, :-1])

    def plot(x):
        sns.heatmap(x)
        plt.show()

    embed()
def main(
        # hyperparameters
        sequence_length=28,
        input_size=28,
        hidden_size=128,
        num_layers=2,
        num_classes=10,
        batch_size=100,
        num_epochs=2,
        learning_rate=0.01,
        gpu=False,
        mode="qrnn",    # "nn" or "qrnn" or "stack"
        ):
    tt = ticktock("script")
    tt.msg("using q: {}".format(mode))

    # MNIST dataset
    train_dataset = dsets.MNIST(root='../../../datasets/mnist/',
                                train=True,
                                transform=transforms.ToTensor(),
                                download=True)
    test_dataset = dsets.MNIST(root='../../../datasets/mnist/',
                               train=False,
                               transform=transforms.ToTensor())

    # data loaders (input pipeline)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False)

    # RNN model (many-to-one)
    class RNN(nn.Module):
        def __init__(self, input_size, hidden_size, num_layers, num_classes):
            super(RNN, self).__init__()
            self.hidden_size = hidden_size
            self.num_layers = num_layers
            if mode == "qrnn":
                tt.msg("using q.RNN")
                self.rnn = RecStack(
                    *[GRUCell(input_size, hidden_size,
                              use_cudnn_cell=False,
                              rec_batch_norm="main")]
                    + [GRUCell(hidden_size, hidden_size)
                       for i in range(num_layers - 1)])\
                    .to_layer().return_all()
            elif mode == "nn":
                tt.msg("using nn.RNN")
                self.rnn = nn.GRU(input_size, hidden_size, num_layers,
                                  batch_first=True)
            elif mode == "stack":
                self.rnn = q.RecurrentStack(
                    *([q.GRULayer(input_size, hidden_size)]
                      + [q.GRULayer(hidden_size, hidden_size)
                         for i in range(num_layers - 1)]))
            self.fc = nn.Linear(hidden_size, num_classes)

        def forward(self, x):
            # set initial states
            h0 = q.var(torch.zeros(self.num_layers, x.size(0),
                                   self.hidden_size)).cuda(crit=x).v
            # forward propagate RNN
            if mode == "qrnn" or mode == "stack":
                out = self.rnn(x)
            else:
                out, _ = self.rnn(x, h0)
            # decode hidden state of last time step
            out = nn.LogSoftmax()(self.fc(out[:, -1, :]))
            return out

    if gpu:
        q.var.all_cuda = True
    rnn = RNN(input_size, hidden_size, num_layers, num_classes)
    if gpu:
        rnn.cuda()

    # loss and optimizer
    criterion = q.lossarray(nn.NLLLoss())
    if gpu:
        criterion.cuda()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

    # train the model
    q.train(rnn).train_on(train_loader, criterion).cuda(gpu)\
        .optimizer(optimizer)\
        .set_batch_transformer(
            lambda x, y: (x.view(-1, sequence_length, input_size), y))\
        .train(num_epochs)

    # test the model
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = q.var(images.view(-1, sequence_length, input_size)).cuda(crit=gpu).v
        labels = q.var(labels).cuda(crit=gpu).v
        outputs = rnn(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.data).sum()
    print('Test Accuracy of the model on the 10000 test images: %d %%'
          % (100 * correct / total))

    # save the model
    torch.save(rnn.state_dict(), 'rnn.pkl')
def trainloop(self):
    stop = False
    self.tt.tick("training")
    tt = ticktock("-")
    current_epoch = 0
    totaltrainbats = len(self.traindataloader)
    while not stop:
        self.current_epoch = current_epoch
        stop = self.current_epoch + 1 == self.epochs
        self.trainlosses.push_and_reset()
        tt.tick()
        self.model.train()
        for i, batch in enumerate(self.traindataloader):
            self.optim.zero_grad()
            batch = [q.var(batch_e).cuda(self.usecuda).v for batch_e in batch]
            if self.transform_batch is not None:
                batch = self.transform_batch(*batch)
            modelouts = self.model(*batch[:-1])
            if not issequence(modelouts):
                modelouts = [modelouts]
            trainlosses = self.trainlosses(modelouts[0], batch[-1])
            trainlosses[0].backward()
            # total gradient norm: taken from clipping if enabled, else computed here
            tgn0 = None
            if self._clip_grad_norm is not None:
                tgn0 = nn.utils.clip_grad_norm(self.model.parameters(),
                                               self._clip_grad_norm)
            if tgn0 is not None:
                tgn = tgn0
            else:
                tgn = 0
                for param in self.model.parameters():
                    tgn += param.grad.pow(2).sum() if param.grad is not None else 0
                tgn = tgn.pow(1. / 2)
                tgn = tgn.data[0]
            self.optim.step()
            tt.live("train - Epoch {}/{} - [{}/{}]: {} - TGN: {:.4f}".format(
                self.current_epoch + 1, self.epochs, i + 1, totaltrainbats,
                self.trainlosses.pp(), tgn))
        ttmsg = "Epoch {}/{} -- train: {}".format(
            self.current_epoch + 1, self.epochs, self.trainlosses.pp())
        train_epoch_losses = self.trainlosses.get_agg_errors()
        valid_epoch_losses = []
        if self.validlosses is not None:
            self.model.eval()
            self.validlosses.push_and_reset()
            totalvalidbats = len(self.validdataloader)
            for i, batch in enumerate(self.validdataloader):
                batch = [q.var(batch_e).cuda(self.usecuda).v for batch_e in batch]
                if self.transform_batch is not None:
                    batch = self.transform_batch(*batch)
                modelouts = self.model(*batch[:-1])
                if not issequence(modelouts):
                    modelouts = [modelouts]
                validlosses = self.validlosses(modelouts[0], batch[-1])
                tt.live("valid - Epoch {}/{} - [{}/{}]: {}".format(
                    self.current_epoch + 1, self.epochs, i + 1,
                    totalvalidbats, self.validlosses.pp()))
            ttmsg += " -- valid: {}".format(self.validlosses.pp())
            valid_epoch_losses = self.validlosses.get_agg_errors()
        tt.stoplive()
        tt.tock(ttmsg)
        if self._earlystop:
            doearlystop = self.earlystop_eval(train_epoch_losses,
                                              valid_epoch_losses)
            if doearlystop:
                tt.msg("stopping early")
            stop = stop or doearlystop
        current_epoch += 1
    self.tt.tock("trained")