Example #1
    def __init__(self, args, params=None, attention=False, bidir=False, subset_grad=True, pyramid=False):
        self.rnn_dim = args.rnn_dim
        self.rlayers = args.rlayers
        self.attention = attention

        lr = T.scalar(dtype=floatX)
        pdrop = T.scalar(dtype=floatX)
        max_norm = T.scalar(dtype=floatX)

        # initialize input tensors

        src_sent = T.imatrix("src_sent")
        rev_src_sent = T.imatrix("rev_src_sent")
        src_mask = T.bmatrix("src_mask")
        tgt_sent = T.imatrix("tgt_sent")
        tgt_mask = T.bmatrix("tgt_mask")
        space_mask = T.bmatrix("space_mask")

        # build up model
        # https://groups.google.com/forum/#!topic/torch7/-NBrFw8Q6_s
        # NOTE: can't use a one-hot input here because it would require a huge matrix multiply
        self.L_enc = theano.shared(uniform_init(args.src_vocab_size, args.rnn_dim, scale=0.1), "L_enc", borrow=True)
        self.L_dec = theano.shared(uniform_init(args.tgt_vocab_size, args.rnn_dim, scale=0.1), "L_dec", borrow=True)
        enc_input = src_sent if not args.reverse else rev_src_sent
        if bidir:
            print("Using bidirectional encoder")
            self.encoder = BiRNNEncoder(src_sent.T, rev_src_sent.T, src_mask.T, space_mask.T, self.L_enc, pdrop, args)
        elif pyramid:
            print("Using pyramid encoder")
            self.encoder = BiPyrRNNEncoder(src_sent.T, rev_src_sent.T, src_mask.T, self.L_enc, pdrop, args)
        else:
            self.encoder = RNNEncoder(enc_input.T, src_mask.T, space_mask.T, self.L_enc, pdrop, args)
        if attention:
            self.decoder = RNNDecoderAttention(self.encoder, tgt_sent.T, tgt_mask.T, self.L_dec, pdrop, args)
            hs = self.decoder.hs
        else:
            self.decoder = RNNDecoder(self.encoder.out, tgt_sent.T, tgt_mask.T, self.L_dec, pdrop, args)

        # cost, parameters, grads, updates

        self.cost = self.decoder.cost
        self.params = self.encoder.params + self.decoder.params + [self.L_enc, self.L_dec]
        if subset_grad:
            self.grad_params = self.encoder.params + self.decoder.params + [self.encoder.subset, self.decoder.subset]
            self.updates, self.grad_norm, self.param_norm = get_opt_fn(args.optimizer)(
                self.cost, self.grad_params, lr, max_norm=max_norm
            )
            # instead of updating all of L_enc and L_dec, we only want to update the embedding rows that were indexed, so use inc_subtensor/set_subtensor
            # http://deeplearning.net/software/theano/tutorial/faq_tutorial.html
            self.updates[-2] = (self.L_enc, T.set_subtensor(self.updates[-2][0], self.updates[-2][1]))
            self.updates[-1] = (self.L_dec, T.set_subtensor(self.updates[-1][0], self.updates[-1][1]))
        else:
            self.grad_params = self.params
            self.updates, self.grad_norm, self.param_norm = get_opt_fn(args.optimizer)(
                self.cost, self.grad_params, lr, max_norm=max_norm
            )

        self.nparams = np.sum([np.prod(p.shape.eval()) for p in self.params])

        # functions

        self.train = theano.function(
            inputs=[src_sent, src_mask, rev_src_sent, tgt_sent, tgt_mask, space_mask, pdrop, lr, max_norm],
            outputs=[self.cost, self.grad_norm, self.param_norm],
            updates=self.updates,
            on_unused_input="warn",
            allow_input_downcast=True,
        )
        self.test = theano.function(
            inputs=[src_sent, src_mask, rev_src_sent, tgt_sent, tgt_mask, space_mask, theano.In(pdrop, value=0.0)],
            outputs=self.cost,
            updates=None,
            on_unused_input="warn",
        )
        outputs = self.encoder.out
        if attention:
            outputs = self.encoder.out + [hs]
        self.encode = theano.function(
            inputs=[src_sent, rev_src_sent, src_mask, space_mask, theano.In(pdrop, value=0.0)],
            outputs=outputs,
            on_unused_input="warn",
            updates=None,
        )

        # function for decoding step by step

        i_t = T.ivector()
        x_t = self.L_dec[i_t, :]
        h_ps = list()  # previous hidden states
        for k in xrange(args.rlayers):
            h_ps.append(T.matrix())
        h_ts = list()
        dmask = T.ones_like(h_ps[0]).astype(floatX)
        if attention and args.rlayers == 1:
            h_t, align = self.decoder.rlayers[0]._step(x_t, dmask, h_ps[0], hs)
        else:
            h_t = self.decoder.rlayers[0]._step(x_t, dmask, h_ps[0])
        h_ts.append(h_t)
        # NOTE no more dropout nodes here
        for k in xrange(1, args.rlayers):
            if attention and args.rlayers == k + 1:
                h_t, align = self.decoder.rlayers[k]._step(h_t, dmask, h_ps[k], hs)
            else:
                h_t = self.decoder.rlayers[k]._step(h_t, dmask, h_ps[k])
            h_ts.append(h_t)
        E_t = T.dot(h_t, self.decoder.olayer.W) + self.decoder.olayer.b
        E_t = T.exp(E_t - T.max(E_t, axis=1, keepdims=True))
        p_t = E_t / E_t.sum(axis=1, keepdims=True)
        inputs = [i_t] + h_ps
        outputs = [p_t] + h_ts
        if attention:
            inputs = inputs + [hs]
            outputs = outputs + [align]
        self.decode_step = theano.function(inputs=inputs, outputs=outputs, updates=None)
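
A minimal usage sketch (not part of the original project): assuming this __init__ belongs to the encoder-decoder class above, that encoder.out yields one final state matrix per decoder layer, and that a single source sentence (batch size 1) is being decoded, greedy decoding can loop over decode_step. The names greedy_decode, model, src, rev_src, src_mask, space_mask, bos_idx, and eos_idx below are hypothetical.

import numpy as np

def greedy_decode(model, src, rev_src, src_mask, space_mask, bos_idx, eos_idx, max_len=100):
    # run the encoder once; its outputs seed the decoder layer states
    enc_out = model.encode(src, rev_src, src_mask, space_mask)
    h_prev = list(enc_out[:model.rlayers])
    tokens = [bos_idx]
    for _ in range(max_len):
        step_in = [np.array([tokens[-1]], dtype=np.int32)] + h_prev
        if model.attention:
            step_in = step_in + [enc_out[-1]]   # hs from the attention encoder
        out = model.decode_step(*step_in)
        p_t = out[0]
        h_prev = list(out[1:1 + model.rlayers])
        tokens.append(int(np.argmax(p_t[0])))
        if tokens[-1] == eos_idx:
            break
    return tokens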
Example #2
File: clm.py  Project: BinbinBian/Parable
    def __init__(self, args):
        self.args = args
        x = T.imatrix('x')
        y = T.imatrix('y')
        mask = T.ones_like(x).astype(floatX)
        # FIXME TODO resume from the last state of the previous sequence instead of
        # resetting the first hidden state to 0s
        self.unit = args.unit
        if args.unit == 'gru':
            init_states = [T.matrix(dtype=floatX) for k in xrange(args.rlayers)]
        elif args.unit == 'lstm':
            init_states = [(T.matrix(dtype=floatX), T.matrix(dtype=floatX)) for k in xrange(args.rlayers)]
        else:
            assert False, 'unsupported unit: %s' % args.unit
        lr = T.scalar(dtype=floatX)
        pdrop = T.scalar(dtype=floatX)

        rlayers = list()
        inp = theano.tensor.extra_ops.to_one_hot(x.flatten(), args.vocab_size).astype(floatX).reshape((x.shape[0], x.shape[1], args.vocab_size))
        seqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop, stocdrop=args.stocdrop)
        # exclude last prediction
        inplayer = UnitInit[args.unit](inp.astype(floatX), mask, seqmask, args.vocab_size, init_states[0], args, suffix='0')
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            seqmask = get_sequence_dropout_mask((inp.shape[0], inp.shape[1], args.rnn_dim), pdrop, stocdrop=args.stocdrop)
            rlayer = UnitInit[args.unit](Dropout(rlayers[-1].out, pdrop).out, mask, seqmask, args.rnn_dim, init_states[k], args, suffix='%d' % k)
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim,
                args.vocab_size)
        self.cost = seq_cat_crossent(olayer.out, y, mask, normalize=False)
        super(RNNLM, self).__init__(rlayers, olayer, cost=self.cost)
        shapes = [p.shape.eval() for p in self.params]
        sizes = [np.prod(s) for s in shapes]
        self.nparams = np.sum(sizes)
        self.updates, self.grad_norm, self.param_norm = get_opt_fn(args.optimizer)(self.cost, self.params, lr, max_norm=args.max_norm)

        # functions

        if args.unit == 'lstm':
            init_states = flatten(init_states)
            final_states = list()
            for r in rlayers:
                final_states.append(r.out[-1])
                final_states.append(r.cell[-1])
        else:
            final_states = [r.out[-1] for r in rlayers]

        self.train = theano.function(
            inputs=[x, y, pdrop, lr] + init_states,
            outputs=[self.cost, self.grad_norm, self.param_norm] + final_states,
            updates=self.updates,
            on_unused_input='warn'
        )

        self.test = theano.function(
            # at test time should pass in pdrop=0
            inputs=[x, y, pdrop] + init_states,
            outputs=[self.cost] + final_states,
            updates=None,
            on_unused_input='warn'
        )

        # function for sampling

        i_t = T.ivector()
        x_t = theano.tensor.extra_ops.to_one_hot(i_t, args.vocab_size)[0]
        h_ps = list()  # previous hidden states
        for k in xrange(args.rlayers):
            if args.unit == 'gru':
                h_ps.append(T.vector())
                dmask = T.ones_like(h_ps[0]).astype(floatX)
            else:
                h_ps.append((T.vector(), T.vector()))
                dmask = T.ones_like(h_ps[0][0]).astype(floatX)
        h_ts = list()
        if args.unit == 'lstm':
            h_t = self.rlayers[0]._step(x_t, dmask, *h_ps[0])
        else:
            h_t = self.rlayers[0]._step(x_t, dmask, h_ps[0])
        h_ts.append(h_t)
        for k in xrange(1, args.rlayers):
            if args.unit == 'lstm':
                h_t = self.rlayers[k]._step(h_t[0], dmask, *h_ps[k])
            else:
                h_t = self.rlayers[k]._step(h_t, dmask, h_ps[k])
            h_ts.append(h_t)
        if args.unit == 'lstm':
            h_t = h_t[0]
        E_t = T.dot(h_t, self.olayer.W) + self.olayer.b
        E_t = T.exp(E_t - T.max(E_t))
        p_t = E_t / E_t.sum()
        if args.unit == 'lstm':
            h_ps = flatten(h_ps)
            h_ts = flatten(h_ts)
        self.decode_step = theano.function(
            inputs=[i_t] + h_ps,
            outputs=[p_t] + h_ts,
            updates=None,
            on_unused_input='warn'
        )
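
A hedged sampling sketch for the language model above: decode_step takes the previous token index plus one state vector per layer (two per layer for LSTM, flattened), and returns the next-token distribution plus the new states. The names sample, model, args, and start_idx are placeholders, and the initial states are simply zero vectors.

import numpy as np

def sample(model, args, start_idx, length=100):
    n_states = 2 * args.rlayers if args.unit == 'lstm' else args.rlayers
    states = [np.zeros(args.rnn_dim, dtype=np.float32) for _ in range(n_states)]
    seq = [start_idx]
    for _ in range(length):
        out = model.decode_step(np.array([seq[-1]], dtype=np.int32), *states)
        p_t, states = out[0], list(out[1:])
        # renormalize in float64 so np.random.choice accepts the distribution
        p = np.asarray(p_t, dtype=np.float64)
        p /= p.sum()
        seq.append(int(np.random.choice(len(p), p=p)))
    return seq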
Example #3
    def __init__(self, args):
        self.args = args
        x = T.imatrix('x')
        y = T.imatrix('y')
        mask = T.ones_like(x).astype(floatX)
        # FIXME TODO resume from the last state of the previous sequence instead of
        # resetting the first hidden state to 0s
        self.unit = args.unit
        if args.unit == 'gru':
            init_states = [
                T.matrix(dtype=floatX) for k in xrange(args.rlayers)
            ]
        elif args.unit == 'lstm':
            init_states = [(T.matrix(dtype=floatX), T.matrix(dtype=floatX))
                           for k in xrange(args.rlayers)]
        else:
            assert False, 'unsupported unit: %s' % args.unit
        lr = T.scalar(dtype=floatX)
        pdrop = T.scalar(dtype=floatX)

        rlayers = list()
        inp = theano.tensor.extra_ops.to_one_hot(
            x.flatten(), args.vocab_size).astype(floatX).reshape(
                (x.shape[0], x.shape[1], args.vocab_size))
        seqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim),
            pdrop,
            stocdrop=args.stocdrop)
        # exclude last prediction
        inplayer = UnitInit[args.unit](inp.astype(floatX),
                                       mask,
                                       seqmask,
                                       args.vocab_size,
                                       init_states[0],
                                       args,
                                       suffix='0')
        rlayers.append(inplayer)
        for k in xrange(1, args.rlayers):
            seqmask = get_sequence_dropout_mask(
                (inp.shape[0], inp.shape[1], args.rnn_dim),
                pdrop,
                stocdrop=args.stocdrop)
            rlayer = UnitInit[args.unit](Dropout(rlayers[-1].out, pdrop).out,
                                         mask,
                                         seqmask,
                                         args.rnn_dim,
                                         init_states[k],
                                         args,
                                         suffix='%d' % k)
            rlayers.append(rlayer)
        olayer = SequenceLogisticRegression(
            Dropout(rlayers[-1].out, pdrop).out, args.rnn_dim, args.vocab_size)
        self.cost = seq_cat_crossent(olayer.out, y, mask, normalize=False)
        super(RNNLM, self).__init__(rlayers, olayer, cost=self.cost)
        shapes = [p.shape.eval() for p in self.params]
        sizes = [np.prod(s) for s in shapes]
        self.nparams = np.sum(sizes)
        self.updates, self.grad_norm, self.param_norm = get_opt_fn(
            args.optimizer)(self.cost, self.params, lr, max_norm=args.max_norm)

        # functions

        if args.unit == 'lstm':
            init_states = flatten(init_states)
            final_states = list()
            for r in rlayers:
                final_states.append(r.out[-1])
                final_states.append(r.cell[-1])
        else:
            final_states = [r.out[-1] for r in rlayers]

        self.train = theano.function(
            inputs=[x, y, pdrop, lr] + init_states,
            outputs=[self.cost, self.grad_norm, self.param_norm] +
            final_states,
            updates=self.updates,
            on_unused_input='warn')

        self.test = theano.function(
            # at test time should pass in pdrop=0
            inputs=[x, y, pdrop] + init_states,
            outputs=[self.cost] + final_states,
            updates=None,
            on_unused_input='warn')

        # function for sampling

        i_t = T.ivector()
        x_t = theano.tensor.extra_ops.to_one_hot(i_t, args.vocab_size)[0]
        h_ps = list()  # previous hidden states
        for k in xrange(args.rlayers):
            if args.unit == 'gru':
                h_ps.append(T.vector())
                dmask = T.ones_like(h_ps[0]).astype(floatX)
            else:
                h_ps.append((T.vector(), T.vector()))
                dmask = T.ones_like(h_ps[0][0]).astype(floatX)
        h_ts = list()
        if args.unit == 'lstm':
            h_t = self.rlayers[0]._step(x_t, dmask, *h_ps[0])
        else:
            h_t = self.rlayers[0]._step(x_t, dmask, h_ps[0])
        h_ts.append(h_t)
        for k in xrange(1, args.rlayers):
            if args.unit == 'lstm':
                h_t = self.rlayers[k]._step(h_t[0], dmask, *h_ps[k])
            else:
                h_t = self.rlayers[k]._step(h_t, dmask, h_ps[k])
            h_ts.append(h_t)
        if args.unit == 'lstm':
            h_t = h_t[0]
        E_t = T.dot(h_t, self.olayer.W) + self.olayer.b
        E_t = T.exp(E_t - T.max(E_t))
        p_t = E_t / E_t.sum()
        if args.unit == 'lstm':
            h_ps = flatten(h_ps)
            h_ts = flatten(h_ts)
        self.decode_step = theano.function(inputs=[i_t] + h_ps,
                                           outputs=[p_t] + h_ts,
                                           updates=None,
                                           on_unused_input='warn')
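
One detail worth calling out in the sampling code above: subtracting the maximum before exponentiating (E_t = T.exp(E_t - T.max(E_t))) is the standard numerically stable softmax; it leaves the probabilities unchanged but prevents overflow. A quick NumPy check of the identity:

import numpy as np

def softmax_stable(z):
    # subtracting the max changes nothing mathematically but avoids overflow in exp
    e = np.exp(z - np.max(z))
    return e / e.sum()

z = np.array([1000.0, 1001.0, 1002.0])  # naive np.exp(z) would overflow
print(softmax_stable(z))                # ~[0.090, 0.245, 0.665]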
Example #4
    def __init__(self,
                 args,
                 params=None,
                 attention=False,
                 bidir=False,
                 subset_grad=True,
                 pyramid=False):
        self.rnn_dim = args.rnn_dim
        self.rlayers = args.rlayers
        self.attention = attention

        lr = T.scalar(dtype=floatX)
        pdrop = T.scalar(dtype=floatX)
        max_norm = T.scalar(dtype=floatX)

        # initialize input tensors

        src_sent = T.imatrix('src_sent')
        rev_src_sent = T.imatrix('rev_src_sent')
        src_mask = T.bmatrix('src_mask')
        tgt_sent = T.imatrix('tgt_sent')
        tgt_mask = T.bmatrix('tgt_mask')
        space_mask = T.bmatrix('space_mask')

        # build up model
        # https://groups.google.com/forum/#!topic/torch7/-NBrFw8Q6_s
        # NOTE: can't use a one-hot input here because it would require a huge matrix multiply
        self.L_enc = theano.shared(uniform_init(args.src_vocab_size,
                                                args.rnn_dim,
                                                scale=0.1),
                                   'L_enc',
                                   borrow=True)
        self.L_dec = theano.shared(uniform_init(args.tgt_vocab_size,
                                                args.rnn_dim,
                                                scale=0.1),
                                   'L_dec',
                                   borrow=True)
        enc_input = src_sent if not args.reverse else rev_src_sent
        if bidir:
            print('Using bidirectional encoder')
            self.encoder = BiRNNEncoder(src_sent.T, rev_src_sent.T, src_mask.T,
                                        space_mask.T, self.L_enc, pdrop, args)
        elif pyramid:
            print('Using pyramid encoder')
            self.encoder = BiPyrRNNEncoder(src_sent.T, rev_src_sent.T,
                                           src_mask.T, self.L_enc, pdrop, args)
        else:
            self.encoder = RNNEncoder(enc_input.T, src_mask.T, space_mask.T,
                                      self.L_enc, pdrop, args)
        if attention:
            self.decoder = RNNDecoderAttention(self.encoder, tgt_sent.T,
                                               tgt_mask.T, self.L_dec, pdrop,
                                               args)
            hs = self.decoder.hs
        else:
            self.decoder = RNNDecoder(self.encoder.out, tgt_sent.T, tgt_mask.T,
                                      self.L_dec, pdrop, args)

        # cost, parameters, grads, updates

        self.cost = self.decoder.cost
        self.params = self.encoder.params + self.decoder.params + [
            self.L_enc, self.L_dec
        ]
        if subset_grad:
            self.grad_params = self.encoder.params + self.decoder.params + [
                self.encoder.subset, self.decoder.subset
            ]
            self.updates, self.grad_norm, self.param_norm = get_opt_fn(
                args.optimizer)(self.cost,
                                self.grad_params,
                                lr,
                                max_norm=max_norm)
            # instead of updating all of L_enc and L_dec, we only want to update the embedding rows that were indexed, so use inc_subtensor/set_subtensor
            # http://deeplearning.net/software/theano/tutorial/faq_tutorial.html
            self.updates[-2] = (self.L_enc,
                                T.set_subtensor(self.updates[-2][0],
                                                self.updates[-2][1]))
            self.updates[-1] = (self.L_dec,
                                T.set_subtensor(self.updates[-1][0],
                                                self.updates[-1][1]))
        else:
            self.grad_params = self.params
            self.updates, self.grad_norm, self.param_norm = get_opt_fn(
                args.optimizer)(self.cost,
                                self.grad_params,
                                lr,
                                max_norm=max_norm)

        self.nparams = np.sum([np.prod(p.shape.eval()) for p in self.params])

        # functions

        self.train = theano.function(
            inputs=[
                src_sent, src_mask, rev_src_sent, tgt_sent, tgt_mask,
                space_mask, pdrop, lr, max_norm
            ],
            outputs=[self.cost, self.grad_norm, self.param_norm],
            updates=self.updates,
            on_unused_input='warn',
            allow_input_downcast=True)
        self.test = theano.function(inputs=[
            src_sent, src_mask, rev_src_sent, tgt_sent, tgt_mask, space_mask,
            theano.In(pdrop, value=0.0)
        ],
                                    outputs=self.cost,
                                    updates=None,
                                    on_unused_input='warn')
        outputs = self.encoder.out
        if attention:
            outputs = self.encoder.out + [hs]
        self.encode = theano.function(inputs=[
            src_sent, rev_src_sent, src_mask, space_mask,
            theano.In(pdrop, value=0.0)
        ],
                                      outputs=outputs,
                                      on_unused_input='warn',
                                      updates=None)

        # function for decoding step by step

        i_t = T.ivector()
        x_t = self.L_dec[i_t, :]
        h_ps = list()  # previous hidden states
        for k in xrange(args.rlayers):
            h_ps.append(T.matrix())
        h_ts = list()
        dmask = T.ones_like(h_ps[0]).astype(floatX)
        if attention and args.rlayers == 1:
            h_t, align = self.decoder.rlayers[0]._step(x_t, dmask, h_ps[0], hs)
        else:
            h_t = self.decoder.rlayers[0]._step(x_t, dmask, h_ps[0])
        h_ts.append(h_t)
        # NOTE no more dropout nodes here
        for k in xrange(1, args.rlayers):
            if attention and args.rlayers == k + 1:
                h_t, align = self.decoder.rlayers[k]._step(
                    h_t, dmask, h_ps[k], hs)
            else:
                h_t = self.decoder.rlayers[k]._step(h_t, dmask, h_ps[k])
            h_ts.append(h_t)
        E_t = T.dot(h_t, self.decoder.olayer.W) + self.decoder.olayer.b
        E_t = T.exp(E_t - T.max(E_t, axis=1, keepdims=True))
        p_t = E_t / E_t.sum(axis=1, keepdims=True)
        inputs = [i_t] + h_ps
        outputs = [p_t] + h_ts
        if attention:
            inputs = inputs + [hs]
            outputs = outputs + [align]
        self.decode_step = theano.function(inputs=inputs,
                                           outputs=outputs,
                                           updates=None)
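
A standalone sketch of the inc_subtensor/set_subtensor trick referenced in the comments above (and in the linked Theano FAQ): when only a few rows of an embedding matrix are indexed in a batch, taking the gradient with respect to the indexed subtensor and writing the updated rows back with set_subtensor avoids a dense update over the whole matrix. The sizes and the toy cost below are illustrative only.

import numpy as np
import theano
import theano.tensor as T

L = theano.shared(np.random.randn(10000, 128).astype('float32'), name='L')
idx = T.ivector('idx')
subset = L[idx]                       # only the embedding rows used in this batch
cost = T.sum(subset ** 2)             # stand-in for the real cost
g = T.grad(cost, wrt=subset)          # gradient w.r.t. the subset, not all of L
lr = 0.1
# write the updated rows back in place instead of updating the full matrix
updates = [(L, T.set_subtensor(subset, subset - lr * g))]
f = theano.function([idx], cost, updates=updates)
f(np.array([3, 7, 42], dtype=np.int32))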