def build_attention(tparams, options, desc, desc_mask, dlen, q, q_mask=None,
                    sfx=None, name=None):
    """Build a soft-attention read of `desc` driven by the query `q`.

    `desc` and `q` are each passed through the linear 'ff' layer (prefixes
    'ff_att_ctx' and 'ff_att_q'), summed, squashed with Tanh and scored
    against tparams['ff_att_proj']. The scores are normalised along axis 0
    by Masked_Softmax and used to take a weighted sum of the masked
    description.

    Args:
        tparams: parameter mapping. Read here: 'ff_att_proj', and
            'ff_att_bi_dq' when options['use_dq_sims'] is set; the 'ff'
            layer reads its own entries.
        options: option mapping. Read here: 'use_dq_sims'; also forwarded
            to the 'ff' layer.
        desc: description representation. The reshapes below treat it as
            3-d with features on the last axis (presumably
            time x batch x dim -- TODO confirm against callers).
        desc_mask: mask for `desc`. When its rank differs from `desc` it
            is given a trailing broadcast axis; otherwise it is used as is.
        dlen: unused in this function.
        q: query representation; its projection is broadcast with
            dimshuffle('x', 0, 1), so it is expected to be 2-d.
        q_mask: optional mask multiplied into `q`; must have q's rank.
        sfx: unused in this function.
        name: unused in this function.

    Returns:
        (ctx, alphas): `ctx` is the attention-weighted sum of the masked
        description over axis 0; `alphas` are the attention weights with
        a trailing broadcast axis appended.
    """
    # Default to the mask as given. The original only bound `desc_mask_`
    # inside the rank-mismatch branch, leaving it undefined whenever the
    # mask already had the same rank as `desc`.
    desc_mask_ = desc_mask
    if desc.ndim != desc_mask.ndim:
        desc_mask_ = desc_mask.dimshuffle(0, 1, 'x')
    assert desc.ndim == desc_mask_.ndim

    if q_mask is not None:
        assert q.ndim == q_mask.ndim
        q *= q_mask

    masked_desc = desc * desc_mask_

    # NOTE(review): the context projection is computed from the unmasked
    # `desc`; masked positions are only excluded later by the softmax mask
    # and by `masked_desc` in the final weighted sum.
    desc_in = desc.reshape((-1, desc.shape[-1]))
    projd = get_layer('ff')[1](tparams=tparams,
                               state_below=desc_in,
                               options=options,
                               prefix='ff_att_ctx',
                               activ='Linear')
    projq = get_layer('ff')[1](tparams, q, options,
                               prefix='ff_att_q',
                               use_bias=False,
                               activ='Linear')

    # Unnormalized dist metric between the rep of desc and q.
    sim_vals = 0
    if options['use_dq_sims']:
        q_proj = dot(q, tparams['ff_att_bi_dq'])
        desc_proj = dot(masked_desc, tparams['ff_att_bi_dq']).reshape(
            (masked_desc.shape[0], masked_desc.shape[1], -1))
        sim_vals = (desc_proj * q_proj.dimshuffle('x', 0, 1)).sum(-1)
        sim_vals = sim_vals.dimshuffle(0, 1, 'x')

    # Undo the flattening done for the 'ff' layer.
    projd = projd.reshape((masked_desc.shape[0], masked_desc.shape[1], -1))

    # Intermediate layer for annotation values.
    proj_att = Tanh(projd + projq.dimshuffle('x', 0, 1) + sim_vals)

    # Score every position: dot product with the projection vector over
    # the last (feature) axis.
    W_proj = tparams['ff_att_proj'].dimshuffle('x', 'x', 0)
    pre_softmax = (W_proj * proj_att).sum(-1)

    alphas = Masked_Softmax(pre_softmax,
                            mask=desc_mask,
                            ax=0).dimshuffle(0, 1, 'x')
    ctx = (masked_desc * alphas).sum(0)
    return ctx, alphas
def fflayer(tparams, state_below, options, prefix='rconv', use_bias=True,
            activ='lambda x: tensor.tanh(x)', **kwargs):
    """Apply an affine map followed by an activation.

    Args:
        tparams: parameter mapping; reads prfx(prefix, 'W') and, when
            `use_bias` is true, prfx(prefix, 'b').
        state_below: input multiplied by the weight matrix via `dot`.
        options: accepted for interface parity with other layers; unused.
        prefix: parameter-name prefix passed to `prfx`.
        use_bias: whether to add the bias term before the activation.
        activ: Python source text for the activation; it is evaluated
            with eval() and the result is called on the pre-activation.
        **kwargs: ignored.

    Returns:
        The activation applied to dot(state_below, W) (+ b).
    """
    # NOTE(security): `activ` is executed through eval(); it must only ever
    # come from trusted configuration, never from external input.
    nonlinearity = eval(activ)

    pre_activation = dot(state_below, tparams[prfx(prefix, 'W')])
    if use_bias:
        pre_activation = pre_activation + tparams[prfx(prefix, 'b')]
    return nonlinearity(pre_activation)
def _step_slice(mask, sbelow, sbelowx, sbefore, U, Ux):
    """Compute one GRU recurrence step.

    Relies on `_slice` and `dim` from the enclosing scope to split the
    gate pre-activation into its reset (block 0) and update (block 1)
    parts.

    Args:
        mask: per-step mask; a new axis is inserted after its first axis
            before it is applied.
        sbelow: input contribution to the reset/update gates.
        sbelowx: input contribution to the candidate state.
        sbefore: previous hidden state.
        U: recurrent weights for the gates.
        Ux: recurrent weights for the candidate state.

    Returns:
        The new hidden state; where the mask is 0 the previous state is
        carried over unchanged.
    """
    gate_input = dot(sbefore, U) + sbelow
    reset = Sigmoid(_slice(gate_input, 0, dim))
    update = Sigmoid(_slice(gate_input, 1, dim))

    # The reset gate scales the previous state before the recurrent product.
    candidate = Tanh(dot(reset * sbefore, Ux) + sbelowx)
    blended = update * sbefore + (1. - update) * candidate

    # Masked positions keep the previous state.
    return mask[:, None] * blended + (1. - mask)[:, None] * sbefore
def build_bidir_model(inp, inp_mask, tparams, options, sfx=None, nsteps=None,
                      use_dropout=False, use_noise=None, truncate=None,
                      name=None):
    """Embed the input and run it through a forward and a reversed RNN.

    Args:
        inp: input multiplied by the embedding matrix
            tparams['Wemb_<sfx>']; its first two shape entries are used
            as (timesteps, samples) when reshaping the embeddings.
        inp_mask: mask for `inp`; reversed along axis 0 for the backward
            pass.
        tparams: parameter mapping.
        options: option mapping; options[name] selects the recurrent layer
            and options['dropout_rate'] is read when dropout is enabled.
        sfx: suffix of the embedding parameter name; required.
        nsteps: forwarded to both recurrent layers.
        use_dropout: apply `dropout_layer` to the embeddings when true.
        use_noise: forwarded to `dropout_layer`; required with dropout.
        truncate: forwarded to both recurrent layers.
        name: option key and parameter prefix of the forward layer; the
            backward layer uses the prefix name + "_r". Required.

    Returns:
        (proj, projr): outputs of the forward and the backward layer. The
        backward output is not re-reversed here.
    """
    if use_dropout:
        assert use_noise is not None
    assert name is not None
    assert sfx is not None

    reversed_mask = inp_mask[::-1]

    embedded = dot(inp, tparams['Wemb_%s' % sfx])
    embedded = embedded.reshape([inp.shape[0], inp.shape[1], -1])
    if use_dropout:
        embedded = dropout_layer(embedded, use_noise,
                                 p=options['dropout_rate'])

    # Arguments shared by both directions.
    common = dict(tparams=tparams,
                  options=options,
                  nsteps=nsteps,
                  truncate=truncate)

    # Forward RNN
    proj = get_layer(options[name])[1](state_below=embedded,
                                       prefix=name,
                                       mask=inp_mask,
                                       **common)

    # Reverse RNN: same embeddings, time axis flipped.
    projr = get_layer(options[name])[1](state_below=embedded[::-1],
                                        prefix=name + "_r",
                                        mask=reversed_mask,
                                        **common)
    return proj, projr
def _step(mask, sbelow, sbefore, cell_before):
    """Compute one step of an LSTM whose input gate is tied to (1 - forget).

    Relies on `param`, `tparams`, `prefix`, `_slice` and `dim` from the
    enclosing scope. The gate pre-activation is sliced into forget
    (block 0), output (block 1) and candidate (block 2).

    Args:
        mask: per-step mask, multiplied directly into the states.
        sbelow: input contribution to the gate pre-activation.
        sbefore: previous hidden state.
        cell_before: previous cell state.

    Returns:
        (h, c): new hidden and cell states; where the mask is 0 both keep
        their previous values.
    """
    gate_input = (dot(sbefore, param('U'))
                  + sbelow
                  + tparams[prfx(prefix, 'b')])

    forget = Sigmoid(_slice(gate_input, 0, dim))
    output = Sigmoid(_slice(gate_input, 1, dim))
    candidate = Tanh(_slice(gate_input, 2, dim))

    # Tied gates: whatever is not forgotten is replaced by the candidate.
    cell = forget * cell_before + (1 - forget) * candidate
    cell = mask * cell + (1. - mask) * cell_before

    hidden = output * tensor.tanh(cell)
    hidden = mask * hidden + (1. - mask) * sbefore
    return hidden, cell
def _step(mask, sbelow, sbefore, cell_before, *args):
    """Compute one step of an LSTM with separate input and forget gates.

    Relies on `param`, `_slice` and `dim` from the enclosing scope. The
    gate pre-activation is sliced into input (block 0), forget (block 1),
    output (block 2) and candidate (block 3).

    Args:
        mask: per-step mask, multiplied directly into the states.
        sbelow: input contribution to the gate pre-activation.
        sbefore: previous hidden state.
        cell_before: previous cell state.
        *args: accepted and ignored.

    Returns:
        (h, c): new hidden and cell states; where the mask is 0 both keep
        their previous values.
    """
    gate_input = dot(sbefore, param('U')) + sbelow + param('b')

    in_gate = Sigmoid(_slice(gate_input, 0, dim))
    forget = Sigmoid(_slice(gate_input, 1, dim))
    output = Sigmoid(_slice(gate_input, 2, dim))
    candidate = Tanh(_slice(gate_input, 3, dim))

    cell = forget * cell_before + in_gate * candidate
    cell = mask * cell + (1. - mask) * cell_before

    hidden = output * tensor.tanh(cell)
    hidden = mask * hidden + (1. - mask) * sbefore
    return hidden, cell
def _step_slice(mask, sbelow, sbelowx, xc_, sbefore, ctx_, alpha_, pctx_, cc_,
                U, Wc, Wd_att, U_att, c_tt, Ux, Wcx):
    """Compute one step of a GRU that attends over a context.

    Relies on `context_mask`, `_slice` and `dim` from the enclosing scope.

    Args:
        mask: per-step mask; a new axis is inserted after its first axis
            before it is applied.
        sbelow: input contribution to the reset/update gates.
        sbelowx: input contribution to the candidate state.
        xc_: input contribution to the attention hidden layer.
        sbefore: previous hidden state.
        ctx_: previous context vector; not read, recomputed here.
        alpha_: previous attention weights; not read.
        pctx_: pre-projected context.
        cc_: raw context that gets averaged with the attention weights.
        U, Wc: gate weights for the state and the context.
        Wd_att, U_att, c_tt: attention weights and bias.
        Ux, Wcx: candidate-state weights for the state and the context.

    Returns:
        (h, ctx, alpha.T): new hidden state, the context vector for this
        step and the transposed attention weights.
    """
    # attention: score each annotation from context, state and input.
    state_proj = dot(sbefore, Wd_att)
    att_hidden = Tanh(pctx_ + state_proj[None, :, :] + xc_)
    scores = dot(att_hidden, U_att) + c_tt
    scores = scores.reshape([scores.shape[0], scores.shape[1]])

    # Hand-rolled softmax over axis 0 so the mask can zero entries before
    # normalisation.
    weights = tensor.exp(scores)
    if context_mask:
        weights = weights * context_mask
    weights = weights / weights.sum(0, keepdims=True)

    # current context
    current_ctx = (cc_ * weights[:, :, None]).sum(0)

    # Both gates share one sigmoid over the concatenated pre-activation.
    gates = Sigmoid(dot(sbefore, U) + sbelow + dot(current_ctx, Wc))
    reset = _slice(gates, 0, dim)
    update = _slice(gates, 1, dim)

    # Here the reset gate scales the recurrent product, not the state.
    candidate = Tanh(dot(sbefore, Ux) * reset
                     + sbelowx
                     + dot(current_ctx, Wcx))

    hidden = update * sbefore + (1. - update) * candidate
    hidden = mask[:, None] * hidden + (1. - mask)[:, None] * sbefore
    return hidden, current_ctx, weights.T
def gru_layer(tparams, state_below, options, prefix='gru', mask=None,
              nsteps=None, truncate=None, init_state=None, **kwargs):
    """Run a GRU over `state_below` with theano.scan.

    Args:
        tparams: parameter mapping; reads prfx(prefix, name) for 'W', 'b',
            'Wx', 'bx', 'U' and 'Ux'. When options['learn_h0'] is set and
            no `init_state` is given, a new shared variable is stored
            under prfx(prefix, 'h0') as a side effect.
        state_below: input sequence; axis 0 is scanned over. When 3-d,
            axis 1 is taken as the number of samples, otherwise 1 sample
            is assumed.
        options: option mapping; reads 'learn_h0', and 'dim' and
            'batch_size' on the learned-h0 path.
        prefix: parameter-name prefix.
        mask: optional mask. Defaults to all ones. A 2-d mask gets a
            trailing broadcast axis; a 3-d mask (with 3-d input) has its
            last two axes merged first.
        nsteps: number of scan steps; defaults to state_below.shape[0].
        truncate: passed to scan as `truncate_gradient`.
        init_state: optional initial hidden state.
        **kwargs: ignored.

    Returns:
        A one-element list holding the scanned hidden states.
    """
    if nsteps is None:
        nsteps = state_below.shape[0]

    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    param = lambda name: tparams[prfx(prefix, name)]
    dim = param('Ux').shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # Normalise the mask so that each scan step sees it with a trailing
    # broadcast axis.
    if mask.ndim == 3 and mask.ndim == state_below.ndim:
        mask = mask.reshape(
            (mask.shape[0],
             mask.shape[1] * mask.shape[2])).dimshuffle(0, 1, 'x')
    elif mask.ndim == 2:
        mask = mask.dimshuffle(0, 1, 'x')

    def _slice(_x, n, dim):
        # n-th block of width `dim` along the last axis.
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    # Input projections are hoisted out of the recurrence.
    state_below_ = dot(state_below, param('W')) + param('b')
    state_belowx = dot(state_below, param('Wx')) + param('bx')

    # initial/previous state
    if init_state is None:
        if not options['learn_h0']:
            init_state = tensor.alloc(0., n_samples, dim)
        else:
            # NOTE(review): the learned state is tiled options['batch_size']
            # times rather than n_samples times -- confirm these always
            # agree for callers of this layer.
            init_state0 = sharedX(numpy.zeros((options['dim'])),
                                  name=prfx(prefix, "h0"))
            init_state = tensor.concatenate(
                [[init_state0] for i in xrange(options['batch_size'])],
                axis=0)
            tparams[prfx(prefix, 'h0')] = init_state0

    U = tparams[prfx(prefix, 'U')]
    Ux = tparams[prfx(prefix, 'Ux')]

    def _step_slice(mask, sbelow, sbelowx, sbefore, U, Ux):
        preact = dot(sbefore, U)
        preact += sbelow

        r = Sigmoid(_slice(preact, 0, dim))
        u = Sigmoid(_slice(preact, 1, dim))

        preactx = dot(r * sbefore, Ux)
        preactx = preactx + sbelowx

        h = Tanh(preactx)
        h = u * sbefore + (1. - u) * h

        # The mask already carries a trailing broadcast axis (see the
        # normalisation above), so apply it directly, as the LSTM steps
        # do. Indexing it with [:, None] again added a third axis, making
        # the step output 3-d while the initial state is 2-d.
        h = mask * h + (1. - mask) * sbefore
        return h

    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice

    # `profile` is a name resolved outside this function.
    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[init_state],
                                non_sequences=[U, Ux],
                                name=prfx(prefix, '_layers'),
                                n_steps=nsteps,
                                truncate_gradient=truncate,
                                profile=profile,
                                strict=True)
    rval = [rval]
    return rval
def gru_cond_layer(tparams, state_below, options, prefix='gru', mask=None,
                   context=None, one_step=False, init_memory=None,
                   init_state=None, context_mask=None, nsteps=None, **kwargs):
    """Run a GRU that attends over `context` at every step.

    Args:
        tparams: parameter mapping; reads prfx(prefix, name) for 'Wcx',
            'Wc_att', 'b_att', 'Wx', 'bx', 'W', 'b', 'Wi_att', 'U', 'Wc',
            'Wd_att', 'U_att', 'c_tt' and 'Ux'.
        state_below: input sequence; when 3-d, axis 1 is taken as the
            number of samples, otherwise 1 sample is assumed.
        options: accepted for interface parity; not read here.
        prefix: parameter-name prefix.
        mask: optional mask; defaults to all ones.
        context: context to attend over; required and must be 3-d
            (#annotation x #sample x dim).
        one_step: when true, call the step function once instead of
            scanning; `init_state` is then required.
        init_memory: unused.
        init_state: optional initial hidden state; zeros by default.
        context_mask: optional mask multiplied into the unnormalised
            attention weights.
        nsteps: number of scan steps; defaults to state_below.shape[0].
        **kwargs: ignored.

    Returns:
        With `one_step`, the tuple (h, ctx, alpha.T) for that step;
        otherwise the outputs list returned by theano.scan for the same
        three quantities.
    """
    assert context, 'Context must be provided'
    if one_step:
        assert init_state, 'previous state must be provided'

    if nsteps is None:
        nsteps = state_below.shape[0]

    n_samples = state_below.shape[1] if state_below.ndim == 3 else 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    dim = tparams[prfx(prefix, 'Wcx')].shape[1]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)

    # projected context
    assert context.ndim == 3, 'Context must be 3-d: #annotation x #sample x dim'
    pctx_ = (dot(context, tparams[prfx(prefix, 'Wc_att')])
             + tparams[prfx(prefix, 'b_att')])

    def _slice(_x, n, dim):
        # n-th block of width `dim` along the last axis.
        lo, hi = n*dim, (n+1)*dim
        if _x.ndim == 3:
            return _x[:, :, lo:hi]
        return _x[:, lo:hi]

    # projected x
    state_belowx = (dot(state_below, tparams[prfx(prefix, 'Wx')])
                    + tparams[prfx(prefix, 'bx')])
    state_below_ = (dot(state_below, tparams[prfx(prefix, 'W')])
                    + tparams[prfx(prefix, 'b')])
    state_belowc = dot(state_below, tparams[prfx(prefix, 'Wi_att')])

    def _step_slice(mask, sbelow, sbelowx, xc_, sbefore, ctx_, alpha_,
                    pctx_, cc_, U, Wc, Wd_att, U_att, c_tt, Ux, Wcx):
        # attention: score each annotation from context, state and input.
        state_proj = dot(sbefore, Wd_att)
        att_hidden = Tanh(pctx_ + state_proj[None, :, :] + xc_)
        scores = dot(att_hidden, U_att) + c_tt
        scores = scores.reshape([scores.shape[0], scores.shape[1]])

        # Softmax over axis 0, with the optional mask applied before
        # normalisation.
        weights = tensor.exp(scores)
        if context_mask:
            weights = weights * context_mask
        weights = weights / weights.sum(0, keepdims=True)

        # current context
        current_ctx = (cc_ * weights[:, :, None]).sum(0)

        gates = Sigmoid(dot(sbefore, U) + sbelow + dot(current_ctx, Wc))
        reset = _slice(gates, 0, dim)
        update = _slice(gates, 1, dim)

        candidate = Tanh(dot(sbefore, Ux) * reset
                         + sbelowx
                         + dot(current_ctx, Wcx))

        hidden = update * sbefore + (1. - update) * candidate
        hidden = mask[:, None] * hidden + (1. - mask)[:, None] * sbefore
        return hidden, current_ctx, weights.T

    seqs = [mask, state_below_, state_belowx, state_belowc]
    _step = _step_slice

    # Order must match the trailing parameters of the step function.
    shared_vars = [tparams[prfx(prefix, suffix)]
                   for suffix in ('U', 'Wc', 'Wd_att', 'U_att', 'c_tt',
                                  'Ux', 'Wcx')]

    if one_step:
        # The previous context and weights are never read by the step,
        # so None placeholders are passed.
        step_args = seqs + [init_state, None, None, pctx_, context] + shared_vars
        rval = _step(*step_args)
    else:
        initial_outputs = [
            init_state,
            tensor.alloc(0., n_samples, context.shape[2]),
            tensor.alloc(0., n_samples, context.shape[0]),
        ]
        # `profile` is a name resolved outside this function.
        rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    outputs_info=initial_outputs,
                                    non_sequences=[pctx_, context]+shared_vars,
                                    name=prfx(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=profile,
                                    strict=True)
    return rval
def lstm_tied_layer(tparams, state_below, options, prefix='lstm_tied',
                    mask=None, one_step=False, init_state=None,
                    init_memory=None, nsteps=None, **kwargs):
    """Run an LSTM whose input gate is tied to (1 - forget gate).

    Args:
        tparams: parameter mapping; reads prfx(prefix, name) for 'U', 'W'
            and 'b'. When options['learn_h0'] is set and no `init_state`
            is given, a new shared variable is stored under
            prfx(prefix, 'h0') as a side effect.
        state_below: input sequence; when 3-d, axis 1 is taken as the
            number of samples, otherwise 1 sample is assumed.
        options: option mapping; reads 'learn_h0', and 'dim' and
            'batch_size' on the learned-h0 path.
        prefix: parameter-name prefix.
        mask: optional mask; defaults to all ones.
        one_step: when true, apply a single step instead of scanning.
        init_state: optional initial hidden state.
        init_memory: optional initial cell state; zeros by default.
        nsteps: number of scan steps; defaults to state_below.shape[0].
        **kwargs: ignored.

    Returns:
        With `one_step`, the list [h, c]; otherwise the outputs returned
        by theano.scan for the hidden and cell states.
    """
    if nsteps is None:
        nsteps = state_below.shape[0]

    n_samples = state_below.shape[1] if state_below.ndim == 3 else 1

    def param(name):
        return tparams[prfx(prefix, name)]

    # Hidden size is read from the first axis of the recurrent matrix.
    dim = param('U').shape[0]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # initial/previous state
    if init_state is None:
        if options['learn_h0']:
            init_state0 = sharedX(numpy.zeros((options['dim'])),
                                  name=prfx(prefix, "h0"))
            init_state = tensor.concatenate(
                [[init_state0] for i in xrange(options['batch_size'])],
                axis=0)
            tparams[prfx(prefix, 'h0')] = init_state0
        else:
            init_state = tensor.alloc(0., n_samples, dim)

    # initial/previous memory
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    def _slice(_x, n, dim):
        # n-th block of width `dim` along the last axis.
        lo, hi = n*dim, (n+1)*dim
        if _x.ndim == 3:
            return _x[:, :, lo:hi]
        return _x[:, lo:hi]

    def _step(mask, sbelow, sbefore, cell_before):
        # NOTE(review): the bias is added here and also in the input
        # projection below, so it enters the pre-activation twice. Kept
        # as is because changing it would alter existing trained models.
        gate_input = dot(sbefore, param('U')) + sbelow + param('b')

        forget = Sigmoid(_slice(gate_input, 0, dim))
        output = Sigmoid(_slice(gate_input, 1, dim))
        candidate = Tanh(_slice(gate_input, 2, dim))

        # Tied gates: whatever is not forgotten is replaced.
        cell = forget * cell_before + (1 - forget) * candidate
        cell = mask * cell + (1. - mask) * cell_before

        hidden = output * tensor.tanh(cell)
        hidden = mask * hidden + (1. - mask) * sbefore
        return hidden, cell

    projected = dot(state_below, param('W')) + param('b')

    if one_step:
        step_mask = mask.dimshuffle(0, 'x')
        h, c = _step(step_mask, projected, init_state, init_memory)
        return [h, c]

    # Give the mask a trailing broadcast axis for the scan; a 3-d mask
    # has its last two axes merged first.
    if mask.ndim == 3 and mask.ndim == projected.ndim:
        mask = mask.reshape(
            (mask.shape[0],
             mask.shape[1]*mask.shape[2])).dimshuffle(0, 1, 'x')
    elif mask.ndim == 2:
        mask = mask.dimshuffle(0, 1, 'x')

    rval, updates = theano.scan(_step,
                                sequences=[mask, projected],
                                outputs_info=[init_state, init_memory],
                                name=prfx(prefix, '_layers'),
                                n_steps=nsteps)
    return rval