Exemplo n.º 1
0
    def _forward(self,
                 x_t,
                 h_tm1,
                 t=None):  # (batsize, indim), (batsize, outdim)
        """Single recurrent-cell step: wraps ``self.apply_nncell`` with input
        dropout, (shared) recurrent dropout and (shared) zoneout.

        :param x_t: input at this timestep (batsize, indim)
        :param h_tm1: previous hidden state (batsize, outdim)
        :param t: optional timestep index, forwarded to apply_nncell
        :return: (y_t, state) -- both are the new hidden state h_t
        """
        if self.dropout_in:
            x_t = self.dropout_in(x_t)
        if self.dropout_rec:
            h_tm1 = self.dropout_rec(h_tm1)
        if self.shared_dropout_rec:
            # shared (variational) recurrent dropout: sample one mask lazily
            # and reuse it for every timestep of the sequence
            if self.shared_dropout_reccer is None:
                ones = q.var(torch.ones(h_tm1.size())).cuda(crit=h_tm1).v
                self.shared_dropout_reccer = [self.shared_dropout_rec(ones)]
            h_tm1 = torch.mul(h_tm1, self.shared_dropout_reccer[0])

        h_t = self.apply_nncell(x_t, h_tm1, t=t)

        if self.zoneout:
            # zoneout: randomly mix previous and new state per unit;
            # the all-ones template is cached, the mask is resampled each call
            if self.zoner is None:
                self.zoner = q.var(torch.ones(h_t.size())).cuda(crit=h_t).v
            zoner = self.zoneout(self.zoner)
            h_t = torch.mul(1 - zoner, h_tm1) + torch.mul(zoner, h_t)
        if self.shared_zoneout:
            # shared zoneout: one mask sampled lazily, reused across timesteps
            if self.shared_zoneouter is None:
                ones = q.var(torch.ones(h_t.size())).cuda(crit=h_t).v
                self.shared_zoneouter = [self.shared_zoneout(ones)]
            h_t = torch.mul(1 - self.shared_zoneouter[0], h_tm1) + torch.mul(
                self.shared_zoneouter[0], h_t)
        return h_t, h_t
Exemplo n.º 2
0
    def _forward(self, x_t, c_tm1, t=None):
        """Single cell step for a cell with a separate output and carried
        state (the nncell returns ``(y_t, c_t)``); wraps it with dropout,
        shared recurrent dropout and (shared) zoneout on the carried state.

        :param x_t: input at this timestep (batsize, indim)
        :param c_tm1: previous carried state (batsize, outdim)
        :param t: optional timestep index, forwarded to apply_nncell
        :return: (y_t, c_t) -- output and new carried state
        """
        if self.dropout_in:
            x_t = self.dropout_in(x_t)
        if self.dropout_rec:
            c_tm1 = self.dropout_rec(c_tm1)
        if self.shared_dropout_rec:
            # shared (variational) recurrent dropout: one mask sampled
            # lazily, reused for every timestep of the sequence
            if self.shared_dropout_reccer is None:
                ones = q.var(torch.ones(c_tm1.size())).cuda(crit=c_tm1).v
                self.shared_dropout_reccer = [self.shared_dropout_rec(ones)]
            c_tm1 = torch.mul(c_tm1, self.shared_dropout_reccer[0])

        y_t, c_t = self.apply_nncell(x_t, c_tm1, t=t)

        if self.zoneout:
            # zoneout applies to the carried state only, not to y_t
            if self.zoner is None:
                self.zoner = q.var(torch.ones(c_t.size())).cuda(crit=c_t).v
            zoner = self.zoneout(self.zoner)
            c_t = torch.mul(1 - zoner, c_tm1) + torch.mul(zoner, c_t)
        if self.shared_zoneout:
            # shared zoneout: one mask sampled lazily, reused across timesteps
            if self.shared_zoneouter is None:
                ones = q.var(torch.ones(c_t.size())).cuda(crit=c_t).v
                self.shared_zoneouter = [self.shared_zoneout(ones)]
            c_t = torch.mul(1 - self.shared_zoneouter[0], c_tm1) + torch.mul(
                self.shared_zoneouter[0], c_t)
        return y_t, c_t
Exemplo n.º 3
0
    def test_equivalent_to_qelos_masked(self):
        """Masked scaled dot-product attention must agree with the qelos
        reference implementation on both weights and context."""
        mine = ScaledDotProductAttention(10, attn_dropout=0)
        ref = q.Attention().dot_gen().scale(10**0.5)

        queries = q.var(np.random.random((5, 1, 10)).astype("float32")).v
        keys = q.var(np.random.random((5, 6, 10)).astype("float32")).v
        mask = q.var(
            np.asarray([
                [1, 0, 0, 0, 0, 0],
                [1, 1, 1, 0, 0, 0],
                [1, 1, 1, 0, 0, 0],
                [1, 1, 1, 1, 0, 0],
                [1, 1, 1, 1, 1, 1],
            ])).v
        values = q.var(np.random.random((5, 6, 11)).astype("float32")).v

        # the module under test expects an inverted byte mask with a head axis
        inv_mask = (-1 * mask + 1).byte().data.unsqueeze(1)
        ctx, attn = mine(queries, keys, values, attn_mask=inv_mask)
        ref_attn = ref.attgen(keys, queries, mask=mask)
        ref_ctx = ref.attcon(values, ref_attn)

        print(attn)
        print(ref_attn)

        self.assertTrue(np.allclose(attn.data.numpy(), ref_attn.data.numpy()))
        self.assertTrue(np.allclose(ctx.data.numpy(), ref_ctx.data.numpy()))
Exemplo n.º 4
0
    def test_decoder_shape(self):
        """AYNDecoder must emit (batsize, seqlen, d_model) and support
        backward() through the whole stack."""
        vocab = "<MASK> a b c d e f g h i j k l m n o p".split()
        vocab = dict(zip(vocab, range(len(vocab))))
        emb = q.WordEmb(10, worddic=vocab)
        dec = q.AYNDecoder(emb, n_max_seq=7, n_layers=3, n_head=2,
                           d_k=4, d_v=6, d_pos_vec=6, d_model=16,
                           d_inner_hid=20, dropout=0)
        src_seq = q.var(np.random.randint(1, max(vocab.values()), (5, 7))).v
        src_mask_starts = np.random.randint(1, 7, (5,), dtype="int64")
        src_seq_mask = np.ones_like(src_seq.data.numpy())
        for row, start in enumerate(src_mask_starts):
            src_seq_mask[row, :start] = 0
        src_seq_mask = q.var(src_seq_mask).v
        src_seq.masked_fill_(src_seq_mask.byte(), 0)
        src_pos = q.var(np.arange(0, 7, dtype="int64")).v
        src_pos = src_pos.unsqueeze(0).repeat(5, 1)

        ctx = q.var(np.random.random((5, 8, 16)).astype("float32")).v

        ctx_mask_starts = np.random.randint(1, 8, (5,), dtype="int64")
        ctx_seq_mask = np.ones((5, 8))
        for row, start in enumerate(ctx_mask_starts):
            ctx_seq_mask[row, :start] = 0
        # invert to the byte-mask convention the decoder expects
        ctx_seq_mask = -1*q.var(ctx_seq_mask).v.byte()+1

        out = dec(src_seq, ctx, ctx_seq_mask)

        print(out)
        self.assertEqual(out.size(), (5, 7, 16))

        out.sum().backward()
Exemplo n.º 5
0
def test_decoder_loss():
    """Smoke-test DecoderLoss on a small batch with zero-padded gold ids."""
    crit = DecoderLoss()
    logprobs = -np.random.random((3, 5, 4))
    gold = np.asarray([[1, 2, 3, 0, 0], [1, 1, 0, 0, 0], [3, 3, 3, 3, 3]])
    logprobs = q.var(torch.FloatTensor(logprobs)).v
    gold = q.var(torch.LongTensor(gold)).v
    loss = crit(logprobs, gold)
    # bug fix: `print loss` was Python 2 statement syntax; the rest of the
    # file uses the print() function
    print(loss)
Exemplo n.º 6
0
    def test_multigrad(self):
        """Gradients of a summed loss must equal the sum of per-loss
        gradients, and accumulating two backward() calls must match too."""
        class Net(nn.Module):
            def __init__(self):
                super(Net, self).__init__()
                self.one = nn.Linear(3, 3)
                self.two = nn.Linear(3, 3)

            def forward(self, x):
                return self.two(self.one(x))

        net = Net()
        inp1 = q.var(torch.randn(3)).v
        inp2 = q.var(torch.randn(3)).v

        def grab_grads():
            # snapshot current parameter gradients as fresh numpy arrays
            snap = []
            for p in net.parameters():
                print(p.grad)
                snap.append(p.grad.data.numpy() + 0)
            return snap

        net(inp1).sum().backward()
        agrads = grab_grads()
        net.zero_grad()

        net(inp2).sum().backward()
        bgrads = grab_grads()
        net.zero_grad()

        # single backward through the sum of both losses
        (net(inp2).sum() + net(inp1).sum()).backward()
        grads = grab_grads()
        net.zero_grad()

        # two consecutive backwards accumulate into .grad
        net(inp1).sum().backward()
        net(inp2).sum().backward()
        sgrads = grab_grads()

        for a, b, t, s in zip(agrads, bgrads, grads, sgrads):
            self.assertTrue(np.allclose(a + b, t))
            self.assertTrue(np.allclose(t, s))

        print("qsdf")
Exemplo n.º 7
0
 def test_sameasbase(self):
     """Words covered by the base embedder must embed identically."""
     tokens = "inception earlgrey <MASK>".split()
     ids = q.var(torch.LongTensor([self.emb * w for w in tokens])).v
     pred, mask = self.emb(ids)
     base_ids = q.var(torch.LongTensor([self.baseemb * w
                                        for w in tokens])).v
     gpred, msk = self.baseemb(base_ids)
     self.assertTrue(
         np.allclose(pred.data.numpy(), gpred.data.numpy()))
Exemplo n.º 8
0
 def test_notasbase(self):
     """Overridden words must differ from the base embedder's output."""
     tokens = "the his monkey key".split()
     ids = q.var(torch.LongTensor([self.emb * w for w in tokens])).v
     pred, mask = self.emb(ids)
     base_ids = q.var(torch.LongTensor([self.baseemb * w
                                        for w in tokens])).v
     gpred, msk = self.baseemb(base_ids)
     self.assertFalse(
         np.allclose(pred.data.numpy(), gpred.data.numpy()))
Exemplo n.º 9
0
 def test_sameasover(self):
     """Overridden words must embed identically to the over-embedder."""
     tokens = "the his monkey key".split()
     ids = q.var(torch.LongTensor([self.emb * w for w in tokens])).v
     pred, msk = self.emb(ids)
     over_ids = q.var(torch.LongTensor([self.overemb * w
                                        for w in tokens])).v
     gpred, _ = self.overemb(over_ids)
     self.assertTrue(
         np.allclose(pred.data.numpy(), gpred.data.numpy()))
Exemplo n.º 10
0
 def test_notasover(self):
     """Base-only words must NOT match the over-embedder's output."""
     words = "inception earlgrey"
     pred, mask = self.emb(
         q.var(torch.LongTensor([self.emb * x for x in words.split()])).v)
     pred = pred.data.numpy()
     # NOTE(review): ids come from self.baseemb's dictionary but are fed to
     # self.overemb (test_sameasover uses self.overemb for both) -- possibly
     # intentional for this inequality check, but confirm it's not a typo.
     gpred, _ = self.overemb(
         q.var(torch.LongTensor([self.baseemb * x
                                 for x in words.split()])).v)
     gpred = gpred.data.numpy()
     self.assertFalse(np.allclose(pred, gpred))
Exemplo n.º 11
0
    def forward(self, src_seq, src_pos=None):
        """Transformer encoder forward pass.

        :param src_seq: (batsize, seqlen) source token ids
        :param src_pos: optional (batsize, seqlen) position ids; generated
            as 0..seqlen-1 per row when omitted
        :return: the final layer's encoder output
        """
        # Word embedding look up; the embedder may also return an attn mask
        enc_slf_attn_mask = None
        enc_input = self.src_word_emb(src_seq)
        if isinstance(enc_input, tuple) and len(enc_input) == 2:
            enc_input, enc_slf_attn_mask = enc_input

        if src_pos is None:
            src_pos = torch.arange(0, src_seq.size(1))\
                .unsqueeze(0).repeat(src_seq.size(0), 1).long()
            src_pos = q.var(src_pos).v

        # Position Encoding addition (or concatenation when cat_pos_enc)
        pos_input = self.position_enc(src_pos)
        if not self.cat_pos_enc:
            enc_input = enc_input + pos_input  # does the paper add position encodings? --> yes
        else:
            enc_input = torch.cat([enc_input, pos_input], 2)

        # per-layer outputs/attentions are collected but not returned
        enc_outputs, enc_slf_attns = [], []

        enc_output = enc_input
        # enc_slf_attn_mask = get_attn_padding_mask(src_seq, src_seq)
        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(
                enc_output, slf_attn_mask=enc_slf_attn_mask)
            enc_outputs += [enc_output]
            enc_slf_attns += [enc_slf_attn]

        return enc_output  # enc_outputs, enc_slf_attns
Exemplo n.º 12
0
def number2charseq(x):
    """Map a column of digit ids (0-9) to fixed-width character-id rows.

    Each digit expands to the character codes (ord) of its 7-character word
    (e.g. ``"_three "``), with padding spaces encoded as 0.

    :param x: (batsize, 1) long tensor/variable of digits 0..9; may be on GPU
    :return: (batsize, 7) long variable of character codes, moved back to
        GPU if the input was on GPU
    """
    dic = {
        0: "_zero  ",
        1: "_one   ",
        2: "_two   ",
        3: "_three ",
        4: "_four  ",
        5: "_five  ",
        6: "_six   ",
        7: "_seven ",
        8: "_eight ",
        9: "_nine  "
    }
    acc = []
    tocuda = False
    if x.is_cuda:
        x = x.cpu()
        tocuda = True
    for i in range(x.size(0)):
        digit = x[i].data.numpy()[0]
        word = dic[digit]
        # bug fix: was `map(lambda x: ord(x) if x is not " " else 0, word)`
        # -- `is not` on a string literal is an unreliable identity check,
        # and under Python 3 the map object (not a list) broke np.asarray
        acc.append([ord(c) if c != " " else 0 for c in word])
    acc = np.asarray(acc)
    acc = q.var(torch.LongTensor(acc)).cuda(crit=tocuda).v
    return acc
Exemplo n.º 13
0
def test_model(encoder, decoder, m, questions, queries, vnt):
    """Dry-run smoke test: feed 10 examples through the encoder and three
    decoder-cell steps, checking shapes/masking and that backward() works.

    :param encoder: question encoder returning (ctx, ctxmask, finalctx)
    :param decoder: decoder whose .block is a single-step cell
    :param m: full model (unused here)
    :param questions: question data, sliced to the first 10 rows
    :param queries: gold query token ids per timestep
    :param vnt: per-timestep output-vocabulary masks
    """
    questions = q.var(questions[:10]).v
    queries = q.var(queries[:10]).v
    vnt = q.var(vnt[:10]).v

    # try encoder
    ctx, ctxmask, finalctx = encoder(questions)
    # print(ctx.size())
    assert(ctx.size(0) == finalctx.size(0))
    assert(ctx.size(1) == ctxmask.float().size(1))
    assert(ctx.size(2) == finalctx.size(1))
    # masked positions of ctx must already be zero: applying the mask again
    # may not change the norm
    maskedctx = ctx * ctxmask.unsqueeze(2).float()
    assert((ctx.norm(2) == maskedctx.norm(2)).data.numpy()[0])
    # print(ctx.norm(2) - maskedctx.norm(2))
    loss = finalctx.sum()
    loss.backward()
    encoder.zero_grad()
    # re-run to make sure a fresh forward/backward also works
    ctx, ctxmask, finalctx = encoder(questions)
    loss = finalctx.sum()
    loss.backward()

    print("dry run of encoder didn't throw errors")
    # decoder.block.embedder = nn.Embedding(100000, 200, padding_idx=0)
    # decoder.block.smo = q.Stack(
    #                                      q.argsave.spec(mask={"mask"}),
    #                                      q.argmap.spec(0),
    #                                      nn.Linear(200, 11075),
    #                                      q.argmap.spec(0, mask=["mask"]),
    #                                      q.LogSoftmax(),
    #                                      q.argmap.spec(0),
    #                     )
    # decoder.block.smo = None
    # try decoder cell
    for t in range(3):
        # ctx, ctxmask, finalctx = encoder(questions)
        decoder.block.core.reset_state()        # ESSENTIAL !!! otherwise double .backward() error
        decoder.set_init_states(finalctx.detach())
        decoder.block.zero_grad()
        outmaskt=vnt[:, t]
        # outmaskt=q.var(np.ones_like(vnt[:, t].data.numpy()).astype("int64")).v
        y_t = decoder.block(queries[:, t], ctx.detach(), ctxmask=ctxmask.detach(), t=t, outmask_t=outmaskt)
        loss = torch.max(y_t)
        print(loss)
        loss.backward()
        print("backward done")
    print("dry run of decoder cell didn't throw errors")
    q.embed()
Exemplo n.º 14
0
    def test_equivalent_to_qelos(self):
        """Unmasked scaled dot-product attention must agree with the qelos
        reference implementation on both weights and context."""
        mine = ScaledDotProductAttention(10, attn_dropout=0)
        ref = q.Attention().dot_gen().scale(10**0.5)

        queries = q.var(np.random.random((5, 4, 10)).astype("float32")).v
        keys = q.var(np.random.random((5, 6, 10)).astype("float32")).v
        values = q.var(np.random.random((5, 6, 11)).astype("float32")).v

        ctx, attn = mine(queries, keys, values)
        ref_attn = ref.attgen(keys, queries)
        ref_ctx = ref.attcon(values, ref_attn)

        print(attn)
        print(ref_attn)

        self.assertTrue(np.allclose(attn.data.numpy(), ref_attn.data.numpy()))
        self.assertTrue(np.allclose(ctx.data.numpy(), ref_ctx.data.numpy()))
Exemplo n.º 15
0
 def test_bypass_stack(self):
     """A stack with a saved-input bypass concatenation outputs (3, 7)."""
     inp = q.var(np.random.random((3, 5)).astype(dtype="float32")).v
     layers = (q.Forward(5, 5), q.argsave.spec(a=0), q.Forward(5, 5),
               q.Forward(5, 5), q.argmap.spec(0, ["a"]),
               q.Lambda(lambda u, v: torch.cat([u, v], 1)),
               q.Forward(10, 7))
     stack = q.Stack(*layers)
     out = stack(inp)
     print(out)
     self.assertEqual(out.size(), (3, 7))
Exemplo n.º 16
0
 def getvector(self, word):
     """Return the embedding for *word* (string or id) as a 1-D numpy
     array, or None when the lookup fails for any reason."""
     try:
         idx = self.D[word] if isstring(word) else word
         wordid = q.var(torch.LongTensor([idx])).v
         vec, _ = self(wordid)
         return vec.squeeze(0).data.numpy()
     except Exception:
         # deliberate best-effort: unknown/invalid words yield None
         return None
Exemplo n.º 17
0
    def forward(self, tgt_seq, src_enc, src_mask=None, tgt_pos=None):
        """Transformer decoder forward pass.

        :param tgt_seq: (batsize, seqlen) target token ids
        :param src_enc: encoder outputs attended over by each layer
        :param src_mask: optional mask for decoder-to-encoder attention
        :param tgt_pos: optional (batsize, seqlen) position ids; generated
            as 0..seqlen-1 per row when omitted
        :return: the final layer's decoder output
        """
        # Word embedding look up (embedder also returns a padding mask)
        dec_input, dec_mask = self.tgt_word_emb(tgt_seq)
        if dec_mask is not None:
            # broadcast the (batsize, seqlen) padding mask to the
            # (batsize, seqlen, seqlen) attention shape
            dec_mask = dec_mask.unsqueeze(1).repeat(1, dec_input.size(1), 1)
        else:
            dec_mask = q.var(
                np.ones((tgt_seq.size(0), tgt_seq.size(1), tgt_seq.size(1)))).v

        if tgt_pos is None:
            tgt_pos = torch.arange(0, tgt_seq.size(1)) \
                .unsqueeze(0).repeat(tgt_seq.size(0), 1).long()
            tgt_pos = q.var(tgt_pos).v

        # Position Encoding addition (or concatenation when cat_pos_enc)
        pos_input = self.position_enc(tgt_pos)
        if not self.cat_pos_enc:
            dec_input = dec_input + pos_input
        else:
            dec_input = torch.cat([dec_input, pos_input], 2)

        # per-layer outputs/attentions are collected but not returned
        dec_outputs, dec_slf_attns, dec_enc_attns = [], [], []

        # Decode
        # dec_slf_attn_pad_mask = get_attn_padding_mask(tgt_seq, tgt_seq)
        # combine the padding mask with the inverted subsequent-position
        # mask so self-attention cannot look ahead
        dec_slf_attn_sub_mask = -1 * get_attn_subsequent_mask(tgt_seq) + 1
        dec_slf_attn_mask = q.var(dec_mask.data.byte() *
                                  dec_slf_attn_sub_mask).v

        dec_output = dec_input
        for dec_layer in self.layer_stack:
            dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
                dec_output,
                src_enc,
                slf_attn_mask=dec_slf_attn_mask,
                dec_enc_attn_mask=src_mask)

            dec_outputs += [dec_output]
            dec_slf_attns += [dec_slf_attn]
            dec_enc_attns += [dec_enc_attn]

        return dec_output  #dec_outputs, dec_slf_attns, dec_enc_attns
Exemplo n.º 18
0
def _reverse_seq(x, mask=None):
    """Reverse each (batsize, seqlen, dim) sequence along dim 1; with a
    mask, only the unmasked prefix of each row is reversed."""
    if mask is None:
        # no mask: positions 0..seqlen-1 for every batch row
        cum = q.var(
            torch.arange(0, x.size(1)).unsqueeze(0).repeat(x.size(0),
                                                           1)).cuda(x).v
    else:
        cum = torch.cumsum(mask, 1)
    # per-row max minus the running count gives the mirrored position
    pos = torch.max(cum, 1, keepdim=True)[0] - cum
    pos = pos.long().unsqueeze(2).repeat(1, 1, x.size(2))
    return torch.gather(x, 1, pos)
Exemplo n.º 19
0
 def _forward(self, x_t, h_tm1, t=None):
     """One recurrent step: optional input/recurrent dropout, the main
     gate, then optional zoneout mixing of old and new hidden states."""
     if self.dropout_in:
         x_t = self.dropout_in(x_t)
     if self.dropout_rec:
         h_tm1 = self.dropout_rec(h_tm1)
     h_t = self.main_gate(x_t, h_tm1)
     if self.zoneout:
         # zoneout: a fresh random keep-mask every timestep
         keep = self.zoneout(q.var(torch.ones(h_t.size())).cuda(crit=h_t).v)
         h_t = torch.mul(1 - keep, h_tm1) + torch.mul(keep, h_t)
     return h_t, h_t
Exemplo n.º 20
0
    def test_equivalent_to_qelos(self):
        """q.MultiHeadAttention must reproduce the original implementation
        when initialized with reshaped copies of its projection weights."""
        orig = OriginalMultiHeadAttention(4, 16, 10, 12, 0)
        mine = q.MultiHeadAttention(4, 16, 10, 12, 0)
        # fold the original's (n_head, d_model, d_x) weights into the 2-D
        # layout the qelos module uses
        for mw, ow in ((mine.w_qs, orig.w_qs), (mine.w_ks, orig.w_ks),
                       (mine.w_vs, orig.w_vs)):
            mw.data = ow.permute(1, 0, 2).contiguous().view(16, -1).data
        mine.proj, mine.layer_norm = orig.proj, orig.layer_norm

        Q = q.var(np.random.random((5, 2, 16)).astype("float32")).v
        K = q.var(np.random.random((5, 6, 16)).astype("float32")).v
        M = q.var(
            np.asarray([
                [1, 0, 0, 0, 0, 0],
                [1, 1, 1, 0, 0, 0],
                [1, 1, 1, 0, 0, 0],
                [1, 1, 1, 1, 0, 0],
                [1, 1, 1, 1, 1, 1],
            ])).v
        V = q.var(np.random.random((5, 6, 16)).astype("float32")).v

        # the original expects an inverted byte mask repeated per query step
        inv_mask = (-1 * M + 1).byte().data.unsqueeze(1).repeat(1, 2, 1)
        outs, atts = orig(Q, K, V, inv_mask)

        self.assertEqual(outs.size(), (5, 2, 16))
        self.assertEqual(atts.size(), (20, 2, 6))

        myouts, myatts = mine(Q, K, V, M)

        m_em = q.get_emitted("mha")
        mym_em = q.get_emitted("mymha")

        self.assertTrue(np.allclose(myatts.data.numpy(), atts.data.numpy()))
        print(myouts[0])
        print(outs[0])
        self.assertTrue(
            np.allclose(myouts.data.numpy(), outs.data.numpy(), atol=1e-7))
Exemplo n.º 21
0
 def forward(self, x, mask=None):  # (batsize, indim), (batsize, outdim)
     """Linear layer whose weight rows are computed on the fly from
     ``self.data`` by ``self.computer``; with a mask, only the output ids
     some batch row actually needs are computed.

     :param x: (batsize, indim) input
     :param mask: optional (batsize, outdim) 0/1 mask of needed outputs
     :return: (batsize, outdim) scores, zeroed where mask is 0
     """
     if mask is not None:
         mask = mask.long()
         # which output ids are needed by at least one example in the batch
         msk = mask.sum(0)  # --> (outdim,)
         msk = (msk > 0).long()
         compute_ids = msk.data.nonzero()
         if len(compute_ids.size()) > 0:  # not all zeros
             compute_ids = compute_ids.squeeze(1)
             data_select = self.data[compute_ids]
             comp_weight = self.computer(
                 data_select)  # (num_data_select, indim)
             comp_weight = comp_weight.contiguous()
             indim = comp_weight.size(1)
             # row 0 of the assembled weight matrix is an all-zero fallback
             # for uncomputed ids; (re)allocate lazily when indim changes
             if self.base_weight is None or self.base_weight.size(
                     1) != indim:
                 self.base_weight = q.var(torch.zeros(1, indim)).cuda(x).v
             weight = torch.cat([self.base_weight, comp_weight], 0)
             # map every output id to its row in [zero row | computed rows]
             index_transform = (torch.cumsum(msk, 0) * msk).long()
             weight = weight.index_select(0, index_transform)
         else:
             # nothing requested: produce an all-zero weight matrix
             data_select = self.data[0:1]
             comp_weight = self.computer(
                 data_select)  # (num_data_select, indim)
             comp_weight = comp_weight.contiguous()
             indim = comp_weight.size(1)
             weight = q.var(torch.zeros(mask.size(1), indim)).cuda(x).v
     else:
         weight = self.computer(self.data)
         weight = weight.contiguous()
     out = torch.mm(x, weight.t())
     if self.bias:
         # bug fix: the ternary's branches were swapped -- the original did
         # `self.bias if mask is not None else self.bias * mask`, which
         # multiplied by None when no mask was given and skipped masking
         # when one was; mask is long here, hence the .float()
         bias = self.bias if mask is None else self.bias * mask.float()
         out += bias
     if mask is not None:
         out = out * mask.float()
     return out  #, mask ?
Exemplo n.º 22
0
    def forward(self, x):
        """Run the RNN over x and return the output of the last timestep."""
        # zero initial hidden state, placed on the same device as the input
        h0 = q.var(torch.zeros(self.num_layers, x.size(0),
                               self.hidden_size)).cuda(crit=x).v

        # qrnn-style modules take no explicit initial state
        if self.mode == "qrnn":
            out = self.rnn(x)
        else:
            out, _ = self.rnn(x, h0)

        # keep only the final timestep
        return out[:, -1, :]
Exemplo n.º 23
0
        def forward(self, x):
            """Run the RNN over x and log-softmax the classification scores
            of the last timestep."""
            # zero initial hidden state, placed on the input's device
            h0 = q.var(
                torch.zeros(self.num_layers, x.size(0),
                            self.hidden_size)).cuda(crit=x).v

            # qrnn/stack modules take no explicit initial state
            if mode in ("qrnn", "stack"):
                out = self.rnn(x)
            else:
                out, _ = self.rnn(x, h0)

            # classify the hidden state of the last time step
            return nn.LogSoftmax()(self.fc(out[:, -1, :]))
Exemplo n.º 24
0
    def forward(self, x, mask=None):
        """Run the wrapped cudnn RNN layer over a full sequence.

        :param x: (batsize, seqlen, indim) input sequence
        :param mask: optional (batsize, seqlen) 0/1 mask
        :return: any combination of (final output, all outputs, mask)
            according to the _return_final/_return_all/_return_mask flags
        """
        self.reset_state()
        h_0 = self._get_init_states(x.size(0))
        if self._reverse:
            # process the sequence right-to-left by reversing the input
            x = _reverse_seq(x, mask=mask)
            if mask is not None:
                x = x * mask.unsqueeze(2).float()
        y, s_t = self.nnlayer(x, h_0)
        self.set_states(s_t)  # DON'T TRUST FINAL STATES WHEN MASK IS NOT NONE
        #return y

        if mask is None:
            y_t = y[:, -1, :]
        else:
            # pick each example's last unmasked timestep as its final output
            last = (torch.sum(mask, 1) -
                    1).long()  # (batsize): lengths of sequences - 1
            rng = q.var(torch.arange(0,
                                     x.size(0)).long()).cuda(x).v  # (batsize)
            y_t = y[rng.data, last.data, :]

        # if mask is not None:
        #     self.set_states(y_t)

        ret = tuple()
        if self._return_final:
            ret += (y_t, )
        if self._return_all:
            if self._reverse:
                # un-reverse the outputs so they line up with the input order
                y = _reverse_seq(y, mask=mask)
            if mask is not None:
                y = y * mask.unsqueeze(2).float()
            ret += (y, )
        if self._return_mask:
            ret += (mask, )

        # unwrap single results; warn (print) when nothing was requested
        if len(ret) == 1:
            return ret[0]
        elif len(ret) == 0:
            print("no output specified")
            return
        else:
            return ret
Exemplo n.º 25
0
 def get_init_states(self, arg):
     """
     :param arg: batch size (will generate and return compatible init states) or None (will return what is stored)
     :return: initial states, states that have previously been set or newly generated zero states based on given batch size
     """
     if arg is None:
         return self._init_states
     assert (q.isnumber(arg) or arg is None)  # is batch size
     if self._init_states is None:  # no states have been set using .set_init_states()
         _init_states = [None] * self.numstates
     else:
         _init_states = self._init_states
     # fill up with zeros and expand where necessary
     assert (self.numstates == len(_init_states))
     for i in range(len(_init_states)):
         statespec = self.state_spec[i]
         initstate = _init_states[i]
         if initstate is None:
             # no stored state: create a zero state of the declared size,
             # on GPU if this module's parameters are on GPU
             state_0 = q.var(torch.zeros(statespec)).cuda(
                 next(self.parameters()).is_cuda).v
             _init_states[i] = state_0
             initstate = state_0
         if initstate.dim(
         ) == 2:  # init state set differently for different batches
             if arg > initstate.size(0):
                 raise Exception(
                     "asked for a bigger batch size than init states")
             elif arg == initstate.size(0):
                 pass
             else:
                 # NOTE(review): returns a slice of just this one state and
                 # abandons the loop over the remaining states -- confirm
                 # this early return is intended.
                 if arg == 0:
                     return initstate[0]
                 else:
                     return initstate[:arg]
         elif initstate.dim() == 1 and arg > 0:
             # 1-D stored state: repeat it across the batch dimension
             _init_states[i] = initstate.unsqueeze(0).expand(
                 arg, initstate.size(-1))
         else:
             raise Exception(
                 "initial states set to wrong dimensional values. Must be 1D (will be repeated) or 2D."
             )
     return _init_states
Exemplo n.º 26
0
 def forward(self, x_t, t=None, mask_t=None):
     """One timestep of the recurrent cell with optional per-example
     masking: masked examples (mask_t == 0) keep their previous states and
     previous output instead of the freshly computed ones.

     :param x_t: (batsize, ...) input at this timestep
     :param t: optional timestep index, forwarded to _forward
     :param mask_t: optional (batsize, 1) 0/1 mask
     :return: y_t, the (possibly mask-mixed) output at this timestep
     """
     batsize = x_t.size(0)
     states = self.get_states(batsize)
     ret = self._forward(x_t, *states, t=t)
     y_t = ret[0]
     newstates = ret[1:]
     st = []
     if mask_t is not None:
         mask_t = mask_t.float()
         # mix new and old states so masked examples carry state through
         for newstate, oldstate in zip(newstates, states):
             newstate = newstate * mask_t + oldstate * (1 - mask_t)
             st.append(newstate)
         # also carry the previous output through for masked examples
         # (moved from RNNLayer); a redundant inner `if mask_t is not None`
         # check was removed -- it was already guaranteed true here
         if self._y_tm1 is None:
             self._y_tm1 = q.var(torch.zeros(
                 y_t.size())).cuda(crit=y_t).v
         y_t = y_t * mask_t + self._y_tm1 * (1 - mask_t)
         self._y_tm1 = y_t
     else:
         st = newstates
     self.set_states(*st)
     return y_t
Exemplo n.º 27
0
    def test_dynamic_bypass_stack(self):
        """An incrementally built residual stack keeps its shape and every
        Forward layer receives a nonzero gradient."""
        inp = q.var(np.random.random((3, 5)).astype(dtype="float32")).v
        stack = q.Stack()
        nlayers = 5
        for _ in range(nlayers):
            stack.add(q.argsave.spec(a=0), q.Forward(5, 5), q.Forward(5, 5),
                      q.argmap.spec(0, ["a"]), q.Lambda(lambda u, v: u + v))
        out = stack(inp)
        print(out)
        self.assertEqual(out.size(), (3, 5))

        out.sum().backward()

        forwards = [l for l in stack.layers if isinstance(l, q.Forward)]
        for layer in forwards:
            self.assertTrue(layer.lin.weight.grad is not None)
            self.assertTrue(layer.lin.bias.grad is not None)
            print(layer.lin.weight.grad.norm(2))
            self.assertTrue(layer.lin.weight.grad.norm(2).data[0] > 0)
            self.assertTrue(layer.lin.bias.grad.norm(2).data[0] > 0)

        self.assertEqual(len(forwards), nlayers * 2)
Exemplo n.º 28
0
def main(
    lr=0.5,
    epochs=30,
    batsize=32,
    embdim=90,
    encdim=90,
    mode="cell",  # "fast" or "cell"
    wreg=0.0001,
    cuda=False,
    gpu=1,
):
    """Train a small sequence autoencoder on random token sequences,
    comparing a fast whole-sequence attention decoder ("fast") with a
    stepwise attention decoder cell ("cell").

    :param lr: Adadelta learning rate
    :param epochs: number of training epochs
    :param batsize: batch size
    :param embdim: embedding dimension
    :param encdim: encoder/decoder hidden dimension
    :param mode: "fast" or "cell" decoder variant
    :param wreg: weight decay
    :param cuda: whether to train on GPU
    :param gpu: GPU device index used when cuda is True
    """
    if cuda:
        torch.cuda.set_device(gpu)
    usecuda = cuda
    vocsize = 50
    # create datasets tensor
    tt.tick("loading data")
    sequences = np.random.randint(0, vocsize, (batsize * 100, 16))
    # wrap in dataset; the task is autoencoding, so input == target,
    # with an 80/20 train/valid split
    dataset = q.TensorDataset(sequences[:batsize * 80],
                              sequences[:batsize * 80])
    validdataset = q.TensorDataset(sequences[batsize * 80:],
                                   sequences[batsize * 80:])
    dataloader = DataLoader(dataset=dataset, batch_size=batsize, shuffle=True)
    validdataloader = DataLoader(dataset=validdataset,
                                 batch_size=batsize,
                                 shuffle=False)
    tt.tock("data loaded")
    # model
    tt.tick("building model")
    embedder = nn.Embedding(vocsize, embdim)

    # 4-layer SRU encoder; the last layer also returns its final state
    encoder = q.RecurrentStack(
        embedder,
        q.SRUCell(encdim).to_layer(),
        q.SRUCell(encdim).to_layer(),
        q.SRUCell(encdim).to_layer(),
        q.SRUCell(encdim).to_layer().return_final(),
    )
    if mode == "fast":
        decoder = q.AttentionDecoder(
            attention=q.Attention().forward_gen(encdim, encdim, encdim),
            embedder=embedder,
            core=q.RecurrentStack(q.GRULayer(embdim, encdim)),
            smo=q.Stack(nn.Linear(encdim + encdim, vocsize), q.LogSoftmax()),
            return_att=True)
    else:
        decoder = q.AttentionDecoderCell(
            attention=q.Attention().forward_gen(encdim, encdim + embdim,
                                                encdim),
            embedder=embedder,
            core=q.RecStack(
                q.GRUCell(embdim + encdim,
                          encdim,
                          use_cudnn_cell=False,
                          rec_batch_norm=None,
                          activation="crelu")),
            smo=q.Stack(nn.Linear(encdim + encdim, vocsize), q.LogSoftmax()),
            att_after_update=False,
            ctx_to_decinp=True,
            decinp_to_att=True,
            return_att=True,
        ).to_decoder()

    m = EncDec(encoder, decoder, mode=mode)

    losses = q.lossarray(q.SeqNLLLoss(ignore_index=None),
                         q.SeqAccuracy(ignore_index=None),
                         q.SeqElemAccuracy(ignore_index=None))
    validlosses = q.lossarray(q.SeqNLLLoss(ignore_index=None),
                              q.SeqAccuracy(ignore_index=None),
                              q.SeqElemAccuracy(ignore_index=None))

    optimizer = torch.optim.Adadelta(m.parameters(), lr=lr, weight_decay=wreg)
    tt.tock("model built")

    # teacher forcing: decoder input is y[:, :-1], gold is y[:, 1:]
    q.train(m).cuda(usecuda).train_on(dataloader, losses)\
        .set_batch_transformer(lambda x, y: (x, y[:, :-1], y[:, 1:]))\
        .valid_on(validdataloader, validlosses)\
        .optimizer(optimizer).clip_grad_norm(2.)\
        .train(epochs)

    testdat = np.random.randint(0, vocsize, (batsize, 20))
    testdata = q.var(torch.from_numpy(testdat)).cuda(usecuda).v
    testdata_out = q.var(torch.from_numpy(testdat)).cuda(usecuda).v
    # disabled branch (`and False`): optionally feed a reversed sequence
    # to the cell decoder
    if mode == "cell" and False:
        inv_idx = torch.arange(testdata.size(1) - 1, -1, -1).long()
        testdata = testdata.index_select(1, inv_idx)
    probs, attw = m(testdata, testdata_out[:, :-1])

    def plot(x):
        # helper for inspecting attention weights in the embed() shell
        sns.heatmap(x)
        plt.show()

    embed()
Exemplo n.º 29
0
def main(
        # Hyper Parameters
        sequence_length=28,
        input_size=28,
        hidden_size=128,
        num_layers=2,
        num_classes=10,
        batch_size=100,
        num_epochs=2,
        learning_rate=0.01,
        gpu=False,
        mode="qrnn"  # "nn" or "qrnn" or "stack"
):
    """Train and evaluate a GRU-based many-to-one classifier on MNIST.

    Each 28x28 image is fed to the recurrent net as a sequence of
    ``sequence_length`` rows of ``input_size`` pixels; the hidden state at
    the last time step is decoded into ``num_classes`` log-probabilities.

    ``mode`` selects the recurrent implementation:
      * ``"qrnn"``  -- q's ``RecStack`` of ``GRUCell``s turned into a layer
      * ``"nn"``    -- plain ``torch.nn.GRU``
      * ``"stack"`` -- ``q.RecurrentStack`` of ``q.GRULayer``s

    Side effects: downloads MNIST if missing, prints test accuracy, and
    saves the trained weights to ``rnn.pkl``.
    """
    tt = ticktock("script")
    tt.msg("using q: {}".format(mode))

    # MNIST Dataset (train split is downloaded on first run)
    train_dataset = dsets.MNIST(root='../../../datasets/mnist/',
                                train=True,
                                transform=transforms.ToTensor(),
                                download=True)

    test_dataset = dsets.MNIST(root='../../../datasets/mnist/',
                               train=False,
                               transform=transforms.ToTensor())

    # Data Loader (Input Pipeline)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True)

    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False)

    # RNN Model (Many-to-One). Closes over `mode` and `tt` from main().
    class RNN(nn.Module):
        def __init__(self, input_size, hidden_size, num_layers, num_classes):
            super(RNN, self).__init__()
            self.hidden_size = hidden_size
            self.num_layers = num_layers
            if mode == "qrnn":
                tt.msg("using q.RNN")
                # First cell maps input_size -> hidden_size; the remaining
                # num_layers-1 cells map hidden_size -> hidden_size.
                # NOTE(review): rec_batch_norm="main" is applied only to the
                # first cell -- presumably intentional; confirm against q docs.
                self.rnn = RecStack(*[GRUCell(input_size, hidden_size, use_cudnn_cell=False, rec_batch_norm="main")] +
                                     [GRUCell(hidden_size, hidden_size) for i in range(num_layers - 1)])\
                            .to_layer().return_all()
            elif mode == "nn":
                tt.msg("using nn.RNN")
                self.rnn = nn.GRU(input_size,
                                  hidden_size,
                                  num_layers,
                                  batch_first=True)
            elif mode == "stack":
                self.rnn = q.RecurrentStack(
                    *([q.GRULayer(input_size, hidden_size)] + [
                        q.GRULayer(hidden_size, hidden_size)
                        for i in range(num_layers - 1)
                    ]))
            self.fc = nn.Linear(hidden_size, num_classes)

        def forward(self, x):
            # Initial hidden state: only nn.GRU takes it explicitly; the
            # q-based variants initialize their own state internally, so h0
            # is unused in the "qrnn"/"stack" branches.
            h0 = q.var(
                torch.zeros(self.num_layers, x.size(0),
                            self.hidden_size)).cuda(crit=x).v

            # Forward propagate RNN
            if mode == "qrnn" or mode == "stack":
                out = self.rnn(x)
            else:
                out, _ = self.rnn(x, h0)

            # Decode hidden state of last time step into log-probabilities
            out = nn.LogSoftmax()(self.fc(out[:, -1, :]))
            return out

    if gpu:
        q.var.all_cuda = True
    rnn = RNN(input_size, hidden_size, num_layers, num_classes)
    if gpu:
        rnn.cuda()

    # Loss and Optimizer: NLLLoss pairs with the model's LogSoftmax output.
    criterion = q.lossarray(nn.NLLLoss())
    if gpu:
        criterion.cuda()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
    # The batch transformer reshapes each image batch into
    # (batch, sequence_length, input_size) rows-as-timesteps.
    q.train(rnn).train_on(train_loader, criterion).cuda(gpu)\
        .optimizer(optimizer).set_batch_transformer(lambda x, y: (x.view(-1, sequence_length, input_size), y))\
        .train(num_epochs)

    # Test the Model
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = q.var(images.view(-1, sequence_length,
                                   input_size)).cuda(crit=gpu).v
        labels = q.var(labels).cuda(crit=gpu).v
        outputs = rnn(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.data).sum()

    print('Test Accuracy of the model on the 10000 test images: %d %%' %
          (100 * correct / total))

    # Save the Model
    torch.save(rnn.state_dict(), 'rnn.pkl')
Exemplo n.º 30
0
    def trainloop(self):
        """Run the full training loop.

        Performs one optimization pass per epoch over ``self.traindataloader``
        (optionally followed by a validation pass over
        ``self.validdataloader``) until ``self.epochs`` epochs have run or
        early stopping triggers. Batches are moved to CUDA per
        ``self.usecuda``, optionally transformed by ``self.transform_batch``,
        and the last batch element is treated as the target.
        """
        stop = False
        self.tt.tick("training")
        tt = ticktock("-")
        current_epoch = 0
        totaltrainbats = len(self.traindataloader)
        while not stop:
            self.current_epoch = current_epoch
            # Mark the final scheduled epoch up front; early stopping below
            # may also set `stop` before this becomes True.
            stop = self.current_epoch + 1 == self.epochs
            self.trainlosses.push_and_reset()
            tt.tick()
            self.model.train()
            for i, batch in enumerate(self.traindataloader):
                self.optim.zero_grad()
                batch = [
                    q.var(batch_e).cuda(self.usecuda).v for batch_e in batch
                ]
                if self.transform_batch is not None:
                    batch = self.transform_batch(*batch)
                # Convention: all but the last batch element are model inputs,
                # the last element is the gold target.
                modelouts = self.model(*batch[:-1])
                if not issequence(modelouts):
                    modelouts = [modelouts]
                # Backprop only through the first (primary) loss.
                trainlosses = self.trainlosses(modelouts[0], batch[-1])
                trainlosses[0].backward()
                # Total gradient norm (TGN): either returned by clipping, or
                # computed manually when clipping is disabled.
                tgn0 = None
                if self._clip_grad_norm is not None:
                    tgn0 = nn.utils.clip_grad_norm(self.model.parameters(),
                                                   self._clip_grad_norm)
                if tgn0 is not None:
                    tgn = tgn0
                else:
                    # NOTE(review): if every param.grad is None, tgn stays a
                    # plain int and tgn.pow() would raise -- confirm this
                    # can't happen after backward() in practice.
                    tgn = 0
                    for param in self.model.parameters():
                        tgn += param.grad.pow(
                            2).sum() if param.grad is not None else 0
                    tgn = tgn.pow(1. / 2)
                    tgn = tgn.data[0]

                self.optim.step()

                tt.live(
                    "train - Epoch {}/{} - [{}/{}]: {} - TGN: {:.4f}".format(
                        self.current_epoch + 1, self.epochs, i + 1,
                        totaltrainbats, self.trainlosses.pp(), tgn))
            ttmsg = "Epoch {}/{} -- train: {}"\
                .format(
                    self.current_epoch+1,
                    self.epochs,
                    self.trainlosses.pp()
                )
            train_epoch_losses = self.trainlosses.get_agg_errors()
            valid_epoch_losses = []
            if self.validlosses is not None:
                # Validation pass: eval mode, no backward/step.
                self.model.eval()
                self.validlosses.push_and_reset()
                totalvalidbats = len(self.validdataloader)
                for i, batch in enumerate(self.validdataloader):
                    batch = [
                        q.var(batch_e).cuda(self.usecuda).v
                        for batch_e in batch
                    ]
                    if self.transform_batch is not None:
                        batch = self.transform_batch(*batch)
                    modelouts = self.model(*batch[:-1])
                    if not issequence(modelouts):
                        modelouts = [modelouts]
                    validlosses = self.validlosses(modelouts[0], batch[-1])
                    tt.live("valid - Epoch {}/{} - [{}/{}]: {}".format(
                        self.current_epoch + 1, self.epochs, i + 1,
                        totalvalidbats, self.validlosses.pp()))
                ttmsg += " -- valid: {}".format(self.validlosses.pp())
                valid_epoch_losses = self.validlosses.get_agg_errors()
            tt.stoplive()
            tt.tock(ttmsg)
            if self._earlystop:
                doearlystop = self.earlystop_eval(train_epoch_losses,
                                                  valid_epoch_losses)
                if doearlystop:
                    tt.msg("stopping early")
                stop = stop or doearlystop
            current_epoch += 1
        self.tt.tock("trained")