# RNNDecoder.__init__
def __init__(self, rep, y, mask, L_dec, pdrop, args):
    self.h0s = rep
    outputs_info = self.h0s
    rlayers = list()
    self.subset = L_dec[y.flatten()]
    inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
    seqmask = get_sequence_dropout_mask(
        (y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
    # exclude last prediction
    inplayer = GRULayer(inp[:-1].astype(floatX), mask[:-1], seqmask[:-1],
                        args.rnn_dim, outputs_info[0], args, suffix='dec0')
    rlayers.append(inplayer)
    for k in xrange(1, args.rlayers):
        seqmask = get_sequence_dropout_mask(
            (y.shape[0], y.shape[1], args.rnn_dim), pdrop)
        rlayer = GRULayer(Dropout(rlayers[-1].out, pdrop).out, mask[:-1],
                          seqmask[:-1], args.rnn_dim, outputs_info[k], args,
                          suffix='dec%d' % k)
        rlayers.append(rlayer)
    olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out,
                                        args.rnn_dim, args.tgt_vocab_size)
    cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
    super(RNNDecoder, self).__init__(rlayers, olayer, cost)
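# --- Editor's note -----------------------------------------------------------
# get_sequence_dropout_mask is called throughout this file but its definition
# is not included in this excerpt. The sketch below is only an assumption of
# what such a helper could look like (a per-timestep inverted-dropout mask
# drawn from Theano's MRG random streams); the repo's actual implementation,
# and its handling of the stocdrop argument used by RNNLM further down, may
# differ.
from theano.sandbox.rng_mrg import MRG_RandomStreams

_srng = MRG_RandomStreams(seed=1234)

def get_sequence_dropout_mask(shape, pdrop, stocdrop=0.0):
    # keep each unit with probability (1 - pdrop) and rescale the survivors so
    # activations have the same expectation as at test time (pdrop = 0);
    # stocdrop is accepted but ignored in this sketch
    keep = 1.0 - pdrop
    return _srng.binomial(shape, p=keep, dtype=floatX) / keep
# -----------------------------------------------------------------------------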
# BiRNNEncoder.__init__
def __init__(self, x, xr, mask, space_mask, L_enc, pdrop, args):
    # NOTE shape[1] is batch size since shape[0] is seq length
    outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
    flayers = list()
    blayers = list()
    fsubset = L_enc[x.flatten()]
    bsubset = L_enc[xr.flatten()]
    finp = fsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
    binp = bsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
    fseqmask = get_sequence_dropout_mask(
        (x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
    bseqmask = get_sequence_dropout_mask(
        (x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
    finplayer = GRULayer(finp.astype(floatX), mask, fseqmask, args.rnn_dim,
                         outputs_info, args, suffix='fenc0')
    binplayer = GRULayer(binp.astype(floatX), mask, bseqmask, args.rnn_dim,
                         outputs_info, args, suffix='benc0', backwards=True)
    flayers.append(finplayer)
    blayers.append(binplayer)
    self.routs = list()  # unlike RNNEncoder, contains hs, not just final h
    self.routs.append(finplayer.out + binplayer.out)
    for k in xrange(1, args.rlayers):
        inp = self.routs[-1]
        fseqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
        bseqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
        flayer = GRULayer(Dropout(inp, pdrop).out, mask, fseqmask,
                          args.rnn_dim, outputs_info, args,
                          suffix='fenc%d' % k)
        blayer = GRULayer(Dropout(inp, pdrop).out, mask, bseqmask,
                          args.rnn_dim, outputs_info, args,
                          suffix='benc%d' % k, backwards=True)
        self.routs.append(flayer.out + blayer.out)
        flayers.append(flayer)
        blayers.append(blayer)
    self.hs = self.routs[-1]  # for attention
    olayer = LayerWrapper(self.routs)
    rlayers = flayers + blayers
    # NOTE careful not to assume rlayers equals the number of layers in all cases
    super(BiRNNEncoder, self).__init__(rlayers, olayer)
# RNNEncoder.__init__
def __init__(self, x, mask, space_mask, L_enc, pdrop, args,
             suffix_prefix='enc', backwards=False):
    # NOTE shape[1] is batch size since shape[0] is seq length
    outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
    rlayers = list()
    self.subset = L_enc[x.flatten()]
    inp = self.subset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
    seqmask = get_sequence_dropout_mask(
        (x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
    inplayer = GRULayer(inp.astype(floatX), mask, seqmask, args.rnn_dim,
                        outputs_info, args, suffix='%s0' % suffix_prefix,
                        backwards=backwards)
    rlayers.append(inplayer)
    for k in xrange(1, args.rlayers):
        inp = rlayers[-1].out
        seqmask = get_sequence_dropout_mask(
            (x.shape[0], x.shape[1], args.rnn_dim), pdrop)
        rlayer = GRULayer(Dropout(inp, pdrop).out, mask, seqmask,
                          args.rnn_dim, outputs_info, args,
                          suffix='%s%d' % (suffix_prefix, k),
                          backwards=backwards)
        rlayers.append(rlayer)
    # extract final outputs according to mask; note we don't know the seq
    # length or the current batch size at graph construction time
    # NOTE this would be used for the decoder's initial hidden states in
    # standard seq2seq but is currently unused
    lens = T.sum(mask, axis=0)
    # will extract A[lens[k] - 1, k, :] for k in [0, batch size)
    self.routs = list()
    for rlayer in rlayers:
        rout = rlayer.out[lens - 1, theano.tensor.arange(x.shape[1]), :].astype(floatX)
        self.routs.append(rout)
    self.hs = rlayers[-1].out  # for attention
    olayer = LayerWrapper(self.routs)
    super(RNNEncoder, self).__init__(rlayers, olayer)
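# A quick NumPy illustration (not part of the model code) of the fancy indexing
# used above to pull each sequence's last valid hidden state out of a (T, B, D)
# array of outputs:
import numpy as np

_A = np.arange(2 * 3 * 2).reshape(2, 3, 2)   # T=2 steps, B=3 examples, D=2 dims
_lens = np.array([2, 1, 2])                  # per-example valid lengths, as from T.sum(mask, axis=0)
_final = _A[_lens - 1, np.arange(3), :]      # row k is taken from timestep lens[k] - 1
# _final == [[6, 7], [2, 3], [10, 11]]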
# BiPyrRNNEncoder.__init__
def __init__(self, x, xr, mask, L_enc, pdrop, args):
    # NOTE shape[1] is batch size since shape[0] is seq length
    outputs_info = [T.zeros((x.shape[1], args.rnn_dim)).astype(floatX)]
    flayers = list()
    blayers = list()
    fsubset = L_enc[x.flatten()]
    bsubset = L_enc[xr.flatten()]
    finp = fsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
    binp = bsubset.reshape((x.shape[0], x.shape[1], L_enc.shape[1]))
    fseqmask = get_sequence_dropout_mask(
        (x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
    bseqmask = get_sequence_dropout_mask(
        (x.shape[0], x.shape[1], L_enc.shape[1]), pdrop)
    finplayer = GRULayer(finp.astype(floatX), mask, fseqmask, args.rnn_dim,
                         outputs_info, args, suffix='fenc0')
    binplayer = GRULayer(binp.astype(floatX), mask, bseqmask, args.rnn_dim,
                         outputs_info, args, suffix='benc0', backwards=True)
    flayers.append(finplayer)
    blayers.append(binplayer)
    self.routs = list()  # unlike RNNEncoder, contains hs, not just final h
    self.routs.append(finplayer.out + binplayer.out)
    downs = []
    for k in xrange(1, args.rlayers):
        # concatenate consecutive steps in the sequence (downscaled to half
        # the length of the previous layer)
        d = Downscale(self.routs[-1], args.rnn_dim, suffix='ds%d' % k)
        downs.append(d)
        inp = d.out
        # halve the mask as well: a downscaled step is valid if either of the
        # two steps it merges was valid
        twocols = mask.T.reshape([-1, 2])
        mask = T.or_(twocols[:, 0], twocols[:, 1]).reshape(
            [mask.shape[1], -1]).T
        fseqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
        bseqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop)
        flayer = GRULayer(Dropout(inp, pdrop).out, mask, fseqmask,
                          args.rnn_dim, outputs_info, args,
                          suffix='fenc%d' % k)
        blayer = GRULayer(Dropout(inp, pdrop).out, mask, bseqmask,
                          args.rnn_dim, outputs_info, args,
                          suffix='benc%d' % k, backwards=True)
        self.routs.append(flayer.out + blayer.out)
        flayers.append(flayer)
        blayers.append(blayer)
    self.hs = self.routs[-1]  # for attention
    olayer = LayerWrapper(self.routs)
    rlayers = flayers + blayers
    # NOTE careful not to assume rlayers equals the number of layers in all cases
    # undo the temporary hack
    super(BiPyrRNNEncoder, self).__init__(rlayers, olayer, downscales=downs)
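# The in-place mask halving in BiPyrRNNEncoder is compact but easy to misread;
# here is the same reshape/OR sequence checked in NumPy (illustrative only,
# time-major mask of shape (T, B)):
_m = np.array([[1, 1],
               [1, 1],
               [1, 1],
               [1, 0]], dtype='int8')        # T=4, B=2; second example padded at the last step
_twocols = _m.T.reshape(-1, 2)               # pair consecutive timesteps within each example
_halved = np.logical_or(_twocols[:, 0], _twocols[:, 1]).reshape(_m.shape[1], -1).T
# _halved == [[1, 1],
#             [1, 1]]   -> a downscaled step stays valid if either parent step was valid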
# RNNDecoderAttention.__init__
def __init__(self, encoder, y, mask, L_dec, pdrop, args):
    self.hs = encoder.hs

    # NOTE just use this so only the last layer uses attention
    def layer_init(attention):
        if not attention:
            return GRULayer
        else:
            return lambda *largs, **kwargs: GRULayerAttention(
                self.hs, *largs, **kwargs)

    # initial states
    outputs_info = [T.zeros_like(self.hs[0])
                    for k in xrange(len(encoder.routs))]
    rlayers = list()
    self.subset = L_dec[y.flatten()]
    inp = self.subset.reshape((y.shape[0], y.shape[1], L_dec.shape[1]))
    attention = args.rlayers == 1
    seqmask = get_sequence_dropout_mask(
        (y.shape[0], y.shape[1], L_dec.shape[1]), pdrop)
    # exclude last prediction
    inplayer = layer_init(attention)(inp[:-1].astype(floatX), mask[:-1],
                                     seqmask[:-1], args.rnn_dim,
                                     outputs_info[0], args, suffix='dec0')
    rlayers.append(inplayer)
    for k in xrange(1, args.rlayers):
        attention = (args.rlayers == k + 1)
        seqmask = get_sequence_dropout_mask(
            (y.shape[0], y.shape[1], args.rnn_dim), pdrop)
        rlayer = layer_init(attention)(Dropout(rlayers[-1].out, pdrop).out,
                                       mask[:-1], seqmask[:-1], args.rnn_dim,
                                       outputs_info[k], args,
                                       suffix='dec%d' % k)
        rlayers.append(rlayer)
    olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out,
                                        args.rnn_dim, args.tgt_vocab_size)
    cost = seq_cat_crossent(olayer.out, y[1:], mask[1:], normalize=False)
    super(RNNDecoderAttention, self).__init__(rlayers, olayer, cost)
# RNNLM.__init__
def __init__(self, args):
    self.args = args
    x = T.imatrix('x')
    y = T.imatrix('y')
    mask = T.ones_like(x).astype(floatX)
    # FIXME TODO resume from last state of previous sequence instead of
    # resetting the first hidden state to 0s
    self.unit = args.unit
    if args.unit == 'gru':
        init_states = [T.matrix(dtype=floatX) for k in xrange(args.rlayers)]
    elif args.unit == 'lstm':
        init_states = [(T.matrix(dtype=floatX), T.matrix(dtype=floatX))
                       for k in xrange(args.rlayers)]
    else:
        assert False
    lr = T.scalar(dtype=floatX)
    pdrop = T.scalar(dtype=floatX)

    rlayers = list()
    inp = theano.tensor.extra_ops.to_one_hot(
        x.flatten(), args.vocab_size).astype(floatX).reshape(
            (x.shape[0], x.shape[1], args.vocab_size))
    seqmask = get_sequence_dropout_mask(
        (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop,
        stocdrop=args.stocdrop)
    inplayer = UnitInit[args.unit](inp.astype(floatX), mask, seqmask,
                                   args.vocab_size, init_states[0], args,
                                   suffix='0')
    rlayers.append(inplayer)
    for k in xrange(1, args.rlayers):
        seqmask = get_sequence_dropout_mask(
            (inp.shape[0], inp.shape[1], args.rnn_dim), pdrop,
            stocdrop=args.stocdrop)
        rlayer = UnitInit[args.unit](Dropout(rlayers[-1].out, pdrop).out,
                                     mask, seqmask, args.rnn_dim,
                                     init_states[k], args, suffix='%d' % k)
        rlayers.append(rlayer)
    olayer = SequenceLogisticRegression(Dropout(rlayers[-1].out, pdrop).out,
                                        args.rnn_dim, args.vocab_size)
    self.cost = seq_cat_crossent(olayer.out, y, mask, normalize=False)
    super(RNNLM, self).__init__(rlayers, olayer, cost=self.cost)

    shapes = [p.shape.eval() for p in self.params]
    sizes = [np.prod(s) for s in shapes]
    self.nparams = np.sum(sizes)
    self.updates, self.grad_norm, self.param_norm = get_opt_fn(
        args.optimizer)(self.cost, self.params, lr, max_norm=args.max_norm)

    # functions
    if args.unit == 'lstm':
        init_states = flatten(init_states)
        final_states = list()
        for r in rlayers:
            final_states.append(r.out[-1])
            final_states.append(r.cell[-1])
    else:
        final_states = [r.out[-1] for r in rlayers]

    self.train = theano.function(
        inputs=[x, y, pdrop, lr] + init_states,
        outputs=[self.cost, self.grad_norm, self.param_norm] + final_states,
        updates=self.updates,
        on_unused_input='warn')
    # at test time should pass in pdrop=0
    self.test = theano.function(
        inputs=[x, y, pdrop] + init_states,
        outputs=[self.cost] + final_states,
        updates=None,
        on_unused_input='warn')

    # function for sampling
    i_t = T.ivector()
    x_t = theano.tensor.extra_ops.to_one_hot(i_t, args.vocab_size)[0]
    h_ps = list()  # previous hidden states
    for k in xrange(args.rlayers):
        if args.unit == 'gru':
            h_ps.append(T.vector())
            dmask = T.ones_like(h_ps[0]).astype(floatX)
        else:
            h_ps.append((T.vector(), T.vector()))
            dmask = T.ones_like(h_ps[0][0]).astype(floatX)
    h_ts = list()
    if args.unit == 'lstm':
        h_t = self.rlayers[0]._step(x_t, dmask, *h_ps[0])
    else:
        h_t = self.rlayers[0]._step(x_t, dmask, h_ps[0])
    h_ts.append(h_t)
    for k in xrange(1, args.rlayers):
        if args.unit == 'lstm':
            h_t = self.rlayers[k]._step(h_t[0], dmask, *h_ps[k])
        else:
            h_t = self.rlayers[k]._step(h_t, dmask, h_ps[k])
        h_ts.append(h_t)
    if args.unit == 'lstm':
        h_t = h_t[0]
    E_t = T.dot(h_t, self.olayer.W) + self.olayer.b
    E_t = T.exp(E_t - T.max(E_t))
    p_t = E_t / E_t.sum()
    if args.unit == 'lstm':
        h_ps = flatten(h_ps)
        h_ts = flatten(h_ts)
    self.decode_step = theano.function(
        inputs=[i_t] + h_ps,
        outputs=[p_t] + h_ts,
        updates=None,
        on_unused_input='warn')
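# Hedged usage sketch (not from the repo): greedy sampling from a GRU RNNLM via
# the decode_step function built above. sample_greedy and its arguments are
# illustrative; np, floatX and xrange come from this module's context.
def sample_greedy(model, args, start_idx, n_steps):
    states = [np.zeros(args.rnn_dim, dtype=floatX) for _ in xrange(args.rlayers)]
    idx = start_idx
    out = [idx]
    for _ in xrange(n_steps):
        ret = model.decode_step(np.array([idx], dtype='int32'), *states)
        p_t, states = ret[0], ret[1:]
        idx = int(np.argmax(p_t))   # greedy; could instead sample from p_t
        out.append(idx)
    return out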