def param_init_lstm(options, params, prefix='lstm', nin=None, dim=None):
    """
    Stack the weight matrices for all the gates
    for much cleaner code and slightly faster dot-prods
    """
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    # input weights
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # for the previous hidden activation
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    return params
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None,
                       ortho=True, flag=False):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)

    flag = False  # note: overrides the argument and disables the data-dependent bias init below
    if flag:
        # params[_p(prefix, 'b')] = np.full(nout, -1).astype('float32')
        # initialise the output bias to the log-odds of the MNIST pixel marginals
        import gzip
        import pickle
        with gzip.open('mnist.pkl.gz', 'rb') as f:
            train_set, _, _ = pickle.load(f)
        train_x, train_y = train_set
        marginals = np.clip(train_x.mean(axis=0), 1e-7, 1 - 1e-7)
        initial_biases = np.log(marginals / (1 - marginals))
        params[_p(prefix, 'b')] = initial_biases.astype('float32')
    else:
        params[_p(prefix, 'b')] = np.zeros((nout,)).astype('float32')

    return params
def fflayer(tparams, state_below, options, prefix='rconv',
            activ='lambda x: tensor.tanh(x)', **kwargs):
    return eval(activ)(tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                       tparams[_p(prefix, 'b')])
def param_init_attention(options, params, prefix='attention'):
    dim_word = options['dim_word']
    params[_p(prefix, 'Wm')] = norm_weight(dim_word)
    params[_p(prefix, 'b')] = numpy.zeros((dim_word,), dtype='float32')
    params[_p(prefix, 'W_att')] = norm_weight(dim_word)
    params[_p(prefix, 'U_att')] = norm_weight(dim_word, 1)
    params[_p(prefix, 'c_att')] = numpy.zeros((1,), dtype='float32')
    return params
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None, **kwargs):
    nsteps = state_below.shape[0]
    dim = tparams[_p(prefix, 'U')].shape[0]

    # if we are dealing with a mini-batch
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
        init_state = tensor.alloc(0., n_samples, dim)
        init_memory = tensor.alloc(0., n_samples, dim)
    # during sampling
    else:
        n_samples = 1
        init_state = tensor.alloc(0., dim)
        init_memory = tensor.alloc(0., dim)

    # if we have no mask, we assume all the inputs are valid
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # use the slice to calculate all the different gates
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        elif _x.ndim == 2:
            return _x[:, n * dim:(n + 1) * dim]
        return _x[n * dim:(n + 1) * dim]

    # one time step of the lstm
    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        f = tensor.nnet.sigmoid(_slice(preact, 1, dim))
        o = tensor.nnet.sigmoid(_slice(preact, 2, dim))
        c = tensor.tanh(_slice(preact, 3, dim))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)

        return h, c, i, f, o, preact

    state_below = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[init_state, init_memory,
                                              None, None, None, None],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps,
                                profile=False)
    return rval
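# A minimal usage sketch, not part of the original module: it wires
# param_init_lstm and lstm_layer together, assuming the repo's helpers
# norm_weight, ortho_weight and _p are in scope and that options['dim_proj']
# is set. The symbolic input `x` and the compiled function are hypothetical.
import theano
import theano.tensor as tensor
from collections import OrderedDict

options = {'dim_proj': 128}
params = param_init_lstm(options, OrderedDict(), prefix='lstm')
# wrap the numpy arrays into Theano shared variables
tparams = OrderedDict((k, theano.shared(v, name=k)) for k, v in params.items())

x = tensor.tensor3('x', dtype='float32')   # (n_timesteps, n_samples, dim_proj)
proj = lstm_layer(tparams, x, options, prefix='lstm')
h, c = proj[0], proj[1]                    # hidden states and memory cells over time
f_encode = theano.function([x], h[-1])     # last hidden state per sample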
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01)
    params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32')
    return params
def param_init_fflayer(options, params, prefix='ff', prefix_bnorm='bnorm',
                       nin=None, nout=None, ortho=True, batch_norm=False):
    # check for the layer's weight key, not the bare prefix
    if _p(prefix, 'W') in params:
        print 'this layer is already present'
    else:
        params[_p(prefix, 'W')] = norm_weight(nin, nout)
        params[_p(prefix, 'b')] = np.zeros((nout,)).astype('float32')
    return params
def bnorm_layer_init(input_shape, params, prefix):
    # borrow a Keras BatchNormalization layer only to get its initial weights
    c1_b = BatchNormalization()
    c1_b.build(input_shape)
    qw = c1_b.get_weights()
    params[_p(prefix, 'gamma')] = qw[0]
    params[_p(prefix, 'beta')] = qw[1]
    params[_p(prefix, 'run_mean')] = qw[2]
    params[_p(prefix, 'run_std')] = qw[3]
    return params
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01)
    params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32')
    return params
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None,
                       ortho=True, flag=False):
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)
    params[_p(prefix, 'b')] = np.zeros((nout,)).astype('float32')
    return params
def mlp_attention_layer(tparams, state_below, options, prefix='attention'):
    # state_below: (n_samples, n_timesteps, dim_word)
    mean_emb = state_below.mean(1)
    attention_vec = tensor.dot(state_below, tparams[_p(prefix, 'W_att')]) + \
        tparams[_p(prefix, 'b')]
    attention_vec += tensor.dot(mean_emb, tparams[_p(prefix, 'Wm')])[:, None, :]
    attention_vec = tanh(attention_vec)
    alpha = tensor.dot(attention_vec, tparams[_p(prefix, 'U_att')]) + \
        tparams[_p(prefix, 'c_att')]
    alpha_shp = alpha.shape
    alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))
    output = (state_below * alpha[:, :, None]).sum(1)
    return output
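# A pure-NumPy rendering of the same attention computation (an illustrative
# assumption, not repo code): it makes the shapes explicit for a batch of
# embedded sequences of shape (n_samples, n_steps, dim_word). All names and
# sizes below are hypothetical.
import numpy as np

def np_softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

n_samples, n_steps, dim_word = 2, 7, 16
emb = np.random.randn(n_samples, n_steps, dim_word)
W_att = np.random.randn(dim_word, dim_word)
Wm = np.random.randn(dim_word, dim_word)
b = np.zeros(dim_word)
U_att = np.random.randn(dim_word, 1)
c_att = np.zeros(1)

mean_emb = emb.mean(1)                                  # (n_samples, dim_word)
att = np.tanh(emb.dot(W_att) + b + mean_emb.dot(Wm)[:, None, :])
alpha = np_softmax((att.dot(U_att) + c_att)[:, :, 0])   # (n_samples, n_steps)
output = (emb * alpha[:, :, None]).sum(1)               # (n_samples, dim_word)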
def mlp_layer(tparams, state_below, options, prefix='predictor'):
    layer_num = len(options['dims'])
    for i in range(layer_num - 1):
        if i == 0:
            output = tensor.dot(state_below, tparams[_p(prefix, i)])
            output = tanh(output)
        elif i == layer_num - 2:
            # final layer uses a rectifier instead of tanh
            output = tensor.dot(output, tparams[_p(prefix, i)])
            output = rectifier(output)
        else:
            output = tensor.dot(output, tparams[_p(prefix, i)])
            output = tanh(output)
    return output
def param_init_convlayer(options, params, prefix='ff', nin=None, nout=None,
                         kernel_len=5, ortho=True, batch_norm=False):
    # filters stored as (output channels, input channels, kernel, kernel)
    params[_p(prefix, 'W')] = 0.01 * np_rng.normal(
        size=(nout, nin, kernel_len, kernel_len)).astype('float32')
    params[_p(prefix, 'b')] = np.zeros(shape=(nout,)).astype('float32')
    return params
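# A hedged sketch of applying filters with the (nout, nin, k, k) layout above
# via Theano's conv2d (illustration only; the repo's own ConvLayer wrapper is
# not reproduced here, and all names and sizes below are hypothetical).
import numpy as np
import theano
import theano.tensor as tensor
from theano.tensor.nnet import conv2d

nin, nout, kernel_len = 3, 64, 5
W = theano.shared(0.01 * np.random.randn(nout, nin, kernel_len,
                                         kernel_len).astype('float32'), name='W')
b = theano.shared(np.zeros((nout,), dtype='float32'), name='b')

x = tensor.tensor4('x', dtype='float32')     # (batch, nin, rows, cols)
out = conv2d(x, W, border_mode='half') + b.dimshuffle('x', 0, 'x', 'x')
f_conv = theano.function([x], out)           # same spatial size as the input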
def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
    """ Each variable is one time slice of the LSTM
    m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
    a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist),
    ct_- (context), pctx_ (projected context), dp_/dp_att_ (dropout masks)
    """
    # attention computation
    # [described in equations (4), (5), (6) in
    # section "3.1.2 Decoder: Long Short Term Memory Network"]
    pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')]) + \
        tensor.dot(ct_, tparams[_p(prefix, 'Wct_att')])
    pctx_ = pctx_ + pstate_[:, None, :]
    pctx_list = []
    pctx_list.append(pctx_)
    pctx_ = tanh(pctx_)
    alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(prefix, 'c_tt')]
    alpha_pre = alpha
    alpha_shp = alpha.shape
    alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
    ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
    alpha_sample = alpha  # you can return something else reasonable here to debug

    preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
    preact += x_
    preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

    # Recover the activations to the lstm gates
    # [equation (1)]
    i = _slice(preact, 0, dim)
    f = _slice(preact, 1, dim)
    o = _slice(preact, 2, dim)
    if options['use_dropout_lstm']:
        i = i * _slice(dp_, 0, dim)
        f = f * _slice(dp_, 1, dim)
        o = o * _slice(dp_, 2, dim)
    i = tensor.nnet.sigmoid(i)
    f = tensor.nnet.sigmoid(f)
    o = tensor.nnet.sigmoid(o)
    c = tensor.tanh(_slice(preact, 3, dim))

    # compute the new memory/hidden state
    # if the mask is 0, just copy the previous state
    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_

    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    rval = [h, c, alpha, alpha_sample, ctx_]
    rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
    return rval
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None, **kwargs):
    nsteps = state_below.shape[0]
    dim = tparams[_p(prefix, 'U')].shape[0]

    # if we are dealing with a mini-batch
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
        init_state = tensor.alloc(0., n_samples, dim)
        init_memory = tensor.alloc(0., n_samples, dim)
    # during sampling
    else:
        n_samples = 1
        init_state = tensor.alloc(0., dim)
        init_memory = tensor.alloc(0., dim)

    # if we have no mask, we assume all the inputs are valid
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # use the slice to calculate all the different gates
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        elif _x.ndim == 2:
            return _x[:, n * dim:(n + 1) * dim]
        return _x[n * dim:(n + 1) * dim]

    # one time step of the lstm
    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        f = tensor.nnet.sigmoid(_slice(preact, 1, dim))
        o = tensor.nnet.sigmoid(_slice(preact, 2, dim))
        c = tensor.tanh(_slice(preact, 3, dim))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)

        return h, c, i, f, o, preact

    state_below = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[init_state, init_memory,
                                              None, None, None, None],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps,
                                profile=False)
    return rval
def fflayer(tparams, state_below, options, index, prefix='rconv',
            prefix_bnorm='bnorm', activ='lambda x: tensor.tanh(x)',
            batch_norm=False, **kwargs):
    preactivation = T.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
    if batch_norm:
        # normalise over the batch, then rescale/shift with per-step parameters
        preactivation = (preactivation - preactivation.mean(axis=0)) / \
            (0.0001 + preactivation.std(axis=0))
        preactivation = tparams[_p(prefix_bnorm, 'newmu')][index] + \
            preactivation * tparams[_p(prefix_bnorm, 'newsigma')][index]
    return preactivation
def param_init_mlp(options, params, prefix='predictor'):
    dims = options['dims']
    layer_num = len(dims)
    assert layer_num >= 3
    for i in range(layer_num - 1):
        W = norm_weight(dims[i], dims[i + 1])
        params[_p(prefix, i)] = W
    return params
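# A minimal sketch of how param_init_mlp and mlp_layer might be wired together
# (an assumption, not repo code): it presumes the repo's helpers _p, norm_weight,
# tanh and rectifier are in scope; the dims below and the symbolic input `x`
# are hypothetical.
import theano
import theano.tensor as tensor
from collections import OrderedDict

options = {'dims': [784, 512, 256, 10]}
params = param_init_mlp(options, OrderedDict(), prefix='predictor')
tparams = OrderedDict((k, theano.shared(v, name=k)) for k, v in params.items())

x = tensor.matrix('x', dtype='float32')    # (n_samples, dims[0])
y_hat = mlp_layer(tparams, x, options, prefix='predictor')
f_predict = theano.function([x], y_hat)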
def _step(m_, x_, h_, c_):
    preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
    preact += x_

    i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
    f = tensor.nnet.sigmoid(_slice(preact, 1, dim))
    o = tensor.nnet.sigmoid(_slice(preact, 2, dim))
    c = tensor.tanh(_slice(preact, 3, dim))

    c = f * c_ + i * c
    h = o * tensor.tanh(c)

    return h, c, i, f, o, preact
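# A pure-NumPy rendering of one step with the stacked-gate layout (an
# illustrative assumption, not repo code): W/U hold the i, f, o and candidate
# blocks side by side on axis 1, and slicing recovers each gate's pre-activation.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

dim, nin, n_samples = 4, 3, 2
rng = np.random.RandomState(0)
W = rng.randn(nin, 4 * dim)      # input-to-hidden, 4 gate blocks stacked
U = rng.randn(dim, 4 * dim)      # hidden-to-hidden, 4 gate blocks stacked
b = np.zeros(4 * dim)

x_t = rng.randn(n_samples, nin)
h_prev = np.zeros((n_samples, dim))
c_prev = np.zeros((n_samples, dim))

preact = x_t.dot(W) + b + h_prev.dot(U)
i = sigmoid(preact[:, 0 * dim:1 * dim])
f = sigmoid(preact[:, 1 * dim:2 * dim])
o = sigmoid(preact[:, 2 * dim:3 * dim])
g = np.tanh(preact[:, 3 * dim:4 * dim])

c_t = f * c_prev + i * g
h_t = o * np.tanh(c_t)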
def param_init_lstm(options, params, prefix='lstm', nin=None, dim=None):
    """
    Stack the weight matrices for all the gates
    for much cleaner code and slightly faster dot-prods
    """
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    # input weights
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # for the previous hidden activation
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    return params
def init_params(options):
    params = OrderedDict()

    if use_conv:
        bn = True
        params = ConvLayer(3, 64, 5, 2, params=params, prefix='conv_1', bn=bn)
        params = ConvLayer(64, 128, 5, 2, params=params, prefix='conv_2', bn=bn)
        params = ConvLayer(128, 256, 5, 2, params=params, prefix='conv_3', bn=bn)
        '''
        params = get_layer('ff')[0](options, params, prefix='layer_1', nin=4*4*256, nout=2048, ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_2', nin=2048, nout=2048, ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_3', nin=2048, nout=2048, ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_4', nin=2048, nout=2048, ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_5', nin=2048, nout=4*4*256, ortho=False)
        '''
        '''
        params = param_init_convlayer(options, params, prefix='conv_1', nin=3, nout=64, kernel_len=5, batch_norm=bn)
        params[_p('conv_1', 'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps, 64)).astype('float32')
        params[_p('conv_1', 'newsigma')] = np.ones(shape=(args.num_steps * args.meta_steps, 64)).astype('float32')
        params = param_init_convlayer(options, params, prefix='conv_2', nin=64, nout=128, kernel_len=5, batch_norm=bn)
        params[_p('conv_2', 'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps, 128)).astype('float32')
        params[_p('conv_2', 'newsigma')] = np.ones(shape=(args.num_steps * args.meta_steps, 128)).astype('float32')
        params = param_init_convlayer(options, params, prefix='conv_3', nin=128, nout=256, kernel_len=5, batch_norm=bn)
        params[_p('conv_3', 'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps, 256)).astype('float32')
        params[_p('conv_3', 'newsigma')] = np.ones(shape=(args.num_steps * args.meta_steps, 256)).astype('float32')
        '''
        params = get_layer('ff')[0](options, params, prefix='layer_1',
                                    prefix_bnorm='layer_1_step_0',
                                    nin=4 * 4 * 256, nout=2048,
                                    ortho=False, batch_norm=True)
        params = get_layer('ff')[0](options, params, prefix='layer_2',
                                    prefix_bnorm='layer_2_step_0',
                                    nin=2048, nout=2048,
                                    ortho=False, batch_norm=True)
        params = get_layer('ff')[0](options, params, prefix='layer_3',
                                    prefix_bnorm='layer_3_step_0',
                                    nin=2048, nout=2048,
                                    ortho=False, batch_norm=True)
        params = get_layer('ff')[0](options, params, prefix='layer_4',
                                    prefix_bnorm='layer_4_step_0',
                                    nin=2048, nout=2048,
                                    ortho=False, batch_norm=True)
        params = get_layer('ff')[0](options, params, prefix='layer_5',
                                    prefix_bnorm='layer_5_step_0',
                                    nin=2048, nout=4 * 4 * 256,
                                    ortho=False, batch_norm=True)

        params[_p('layer1_bnorm', 'newmu')] = np.zeros(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')
        params[_p('layer1_bnorm', 'newsigma')] = np.ones(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')
        params[_p('layer2_bnorm', 'newmu')] = np.zeros(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')
        params[_p('layer2_bnorm', 'newsigma')] = np.ones(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')
        params[_p('layer3_bnorm', 'newmu')] = np.zeros(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')
        params[_p('layer3_bnorm', 'newsigma')] = np.ones(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')
        params[_p('layer4_bnorm', 'newmu')] = np.zeros(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')
        params[_p('layer4_bnorm', 'newsigma')] = np.ones(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')
        params[_p('layer5_bnorm', 'newmu')] = np.zeros(
            shape=(args.num_steps * args.meta_steps, 4 * 4 * 256)).astype('float32')
        params[_p('layer5_bnorm', 'newsigma')] = np.ones(
            shape=(args.num_steps * args.meta_steps, 4 * 4 * 256)).astype('float32')

        params = ConvLayer(256, 128, 5, -2, params=params, prefix='conv_4_mu', bn=bn)
        params = ConvLayer(128, 64, 5, -2, params=params, prefix='conv_5_mu', bn=bn)
        params = ConvLayer(64, 3, 5, -2, params=params, prefix='conv_6_mu')
        params = ConvLayer(256, 128, 5, -2, params=params, prefix='conv_4_s', bn=bn)
        params = ConvLayer(128, 64, 5, -2, params=params, prefix='conv_5_s', bn=bn)
        params = ConvLayer(64, 3, 5, -2, params=params, prefix='conv_6_s')
        '''
        params = param_init_convlayer(options, params, prefix='conv_4_mu', nin=256, nout=128, kernel_len=5, batch_norm=bn)
        params[_p('conv_4_mu', 'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps, 128)).astype('float32')
        params[_p('conv_4_mu', 'newsigma')] = np.ones(shape=(args.num_steps * args.meta_steps, 128)).astype('float32')
        params = param_init_convlayer(options, params, prefix='conv_5_mu', nin=128, nout=64, kernel_len=5, batch_norm=bn)
        params[_p('conv_5_mu', 'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps, 64)).astype('float32')
        params[_p('conv_5_mu', 'newsigma')] = np.ones(shape=(args.num_steps * args.meta_steps, 64)).astype('float32')
        params = param_init_convlayer(options, params, prefix='conv_6_mu', nin=64, nout=3, kernel_len=5, batch_norm=False)
        params = param_init_convlayer(options, params, prefix='conv_4_s', nin=256, nout=128, kernel_len=5, batch_norm=bn)
        params[_p('conv_4_s', 'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps, 128)).astype('float32')
        params[_p('conv_4_s', 'newsigma')] = np.ones(shape=(args.num_steps * args.meta_steps, 128)).astype('float32')
        params = param_init_convlayer(options, params, prefix='conv_5_s', nin=128, nout=64, kernel_len=5, batch_norm=bn)
        params[_p('conv_5_s', 'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps, 64)).astype('float32')
        params[_p('conv_5_s', 'newsigma')] = np.ones(shape=(args.num_steps * args.meta_steps, 64)).astype('float32')
        params = param_init_convlayer(options, params, prefix='conv_6_s', nin=64, nout=3, kernel_len=5, batch_norm=False)
        '''

    return params
def lstm_cond_layer(tparams, state_below, options, prefix='lstm',
                    mask=None, context=None, one_step=False,
                    init_memory=None, init_state=None,
                    trng=None, use_noise=None, sampling=True,
                    argmax=False, **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_memory, 'previous memory must be provided'
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # infer lstm dimension
    dim = tparams[_p(prefix, 'U')].shape[0]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)
    # initial/previous memory
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    # projected context
    pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) + tparams[_p(prefix, 'b_att')]
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            pctx_ = tensor.dot(pctx_, tparams[_p(prefix, 'W_att_%d' % lidx)]) + \
                tparams[_p(prefix, 'b_att_%d' % lidx)]
            # note to self: this used to be options['n_layers_att'] - 1,
            # so no extra non-linearity if n_layers_att < 3
            if lidx < options['n_layers_att']:
                pctx_ = tanh(pctx_)

    # projected x
    # state_below is timesteps*num samples by d in training
    # (TODO change to notation of paper); this is n * d during sampling
    state_below = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]

    # additional parameters for stochastic hard attention
    if options['attn_type'] == 'stochastic':
        # temperature for softmax
        temperature = options.get("temperature", 1)
        # [see (Section 4.1): Stochastic "Hard" Attention]
        semi_sampling_p = options.get("semi_sampling_p", 0.5)
        temperature_c = theano.shared(numpy.float32(temperature), name='temperature_c')
        h_sampling_mask = trng.binomial((1,), p=semi_sampling_p, n=1,
                                        dtype=theano.config.floatX).sum()

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist),
        ct_- (context), pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network"]
        pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
        pctx_ = pctx_ + pstate_[:, None, :]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape
        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
            alpha_sample = alpha  # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(
                temperature_c * alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha, dtype=theano.config.floatX) \
                    + (1. - h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(
                        tensor.eq(tensor.arange(alpha_shp[1])[None, :],
                                  tensor.argmax(alpha, axis=1, keepdims=True)),
                        theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:, :, None]).sum(1)  # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) +
                                       tparams[_p(prefix, 'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:, None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
        return rval

    if options['use_dropout_lstm']:
        if options['selector']:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, sel_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        else:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        dp_shape = state_below.shape
        if one_step:
            dp_mask = tensor.switch(use_noise,
                                    trng.binomial((dp_shape[0], 3 * dim),
                                                  p=0.5, n=1, dtype=state_below.dtype),
                                    tensor.alloc(0.5, dp_shape[0], 3 * dim))
        else:
            dp_mask = tensor.switch(use_noise,
                                    trng.binomial((dp_shape[0], dp_shape[1], 3 * dim),
                                                  p=0.5, n=1, dtype=state_below.dtype),
                                    tensor.alloc(0.5, dp_shape[0], dp_shape[1], 3 * dim))
    else:
        if options['selector']:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, sel_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_)
        else:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_)

    if one_step:
        if options['use_dropout_lstm']:
            if options['selector']:
                rval = _step0(mask, state_below, dp_mask, init_state, init_memory,
                              None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, dp_mask, init_state, init_memory,
                              None, None, None, pctx_)
        else:
            if options['selector']:
                rval = _step0(mask, state_below, init_state, init_memory,
                              None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, init_state, init_memory,
                              None, None, None, pctx_)
        return rval
    else:
        seqs = [mask, state_below]
        if options['use_dropout_lstm']:
            seqs += [dp_mask]
        outputs_info = [init_state,
                        init_memory,
                        tensor.alloc(0., n_samples, pctx_.shape[1]),
                        tensor.alloc(0., n_samples, pctx_.shape[1]),
                        tensor.alloc(0., n_samples, context.shape[2])]
        if options['selector']:
            outputs_info += [tensor.alloc(0., n_samples)]
        outputs_info += [None, None, None, None, None, None, None] + \
            [None]  # *options['n_layers_att']
        rval, updates = theano.scan(_step0,
                                    sequences=seqs,
                                    outputs_info=outputs_info,
                                    non_sequences=[pctx_],
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=False)
        return rval, updates
def param_init_lstm_cond(options, params, prefix='lstm_cond',
                         nin=None, dim=None, dimctx=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']

    # input to LSTM, similar to the above, we stack the matrices for
    # compactness, do one dot product, and use the slice function below
    # to get the activations for each "gate"
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # LSTM to LSTM
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # bias to LSTM
    params[_p(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx, dim * 4)
    params[_p(prefix, 'Wc')] = Wc

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx, ortho=False)
    params[_p(prefix, 'Wc_att')] = Wc_att

    # attention: LSTM -> hidden
    Wd_att = norm_weight(dim, dimctx)
    params[_p(prefix, 'Wd_att')] = Wd_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx,)).astype('float32')
    params[_p(prefix, 'b_att')] = b_att

    # optional "deep" attention
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            params[_p(prefix, 'W_att_%d' % lidx)] = ortho_weight(dimctx)
            params[_p(prefix, 'b_att_%d' % lidx)] = numpy.zeros((dimctx,)).astype('float32')

    # attention: hidden -> scalar score
    U_att = norm_weight(dimctx, 1)
    params[_p(prefix, 'U_att')] = U_att
    c_att = numpy.zeros((1,)).astype('float32')
    params[_p(prefix, 'c_tt')] = c_att

    if options['selector']:
        # attention: selector
        W_sel = norm_weight(dim, 1)
        params[_p(prefix, 'W_sel')] = W_sel
        b_sel = numpy.float32(0.)
        params[_p(prefix, 'b_sel')] = b_sel

    return params
def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
    """ Each variable is one time slice of the LSTM
    m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
    a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist),
    ct_- (context), pctx_ (projected context), dp_/dp_att_ (dropout masks)
    """
    # attention computation
    # [described in equations (4), (5), (6) in
    # section "3.1.2 Decoder: Long Short Term Memory Network"]
    pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
    pctx_ = pctx_ + pstate_[:, None, :]
    pctx_list = []
    pctx_list.append(pctx_)
    pctx_ = tanh(pctx_)
    alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(prefix, 'c_tt')]
    alpha_pre = alpha
    alpha_shp = alpha.shape
    if options['attn_type'] == 'deterministic':
        alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
        ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
        alpha_sample = alpha  # you can return something else reasonable here to debug
    else:
        alpha = tensor.nnet.softmax(
            temperature_c * alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
        # TODO return alpha_sample
        if sampling:
            alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha, dtype=theano.config.floatX) \
                + (1. - h_sampling_mask) * alpha
        else:
            if argmax:
                alpha_sample = tensor.cast(
                    tensor.eq(tensor.arange(alpha_shp[1])[None, :],
                              tensor.argmax(alpha, axis=1, keepdims=True)),
                    theano.config.floatX)
            else:
                alpha_sample = alpha
        ctx_ = (context * alpha_sample[:, :, None]).sum(1)  # current context

    if options['selector']:
        sel_ = tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) +
                                   tparams[_p(prefix, 'b_sel')])
        sel_ = sel_.reshape([sel_.shape[0]])
        ctx_ = sel_[:, None] * ctx_

    preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
    preact += x_
    preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

    # Recover the activations to the lstm gates
    # [equation (1)]
    i = _slice(preact, 0, dim)
    f = _slice(preact, 1, dim)
    o = _slice(preact, 2, dim)
    if options['use_dropout_lstm']:
        i = i * _slice(dp_, 0, dim)
        f = f * _slice(dp_, 1, dim)
        o = o * _slice(dp_, 2, dim)
    i = tensor.nnet.sigmoid(i)
    f = tensor.nnet.sigmoid(f)
    o = tensor.nnet.sigmoid(o)
    c = tensor.tanh(_slice(preact, 3, dim))

    # compute the new memory/hidden state
    # if the mask is 0, just copy the previous state
    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_

    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    rval = [h, c, alpha, alpha_sample, ctx_]
    if options['selector']:
        rval += [sel_]
    rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
    return rval
def lstm_cond_layer(tparams, state_below, options, prefix='lstm',
                    mask=None, context=None, one_step=False,
                    init_memory=None, init_state=None,
                    trng=None, use_noise=None, sampling=True,
                    argmax=False, **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_memory, 'previous memory must be provided'
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # infer lstm dimension
    dim = tparams[_p(prefix, 'U')].shape[0]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)
    # initial/previous memory
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    # projected context
    pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) + tparams[_p(prefix, 'b_att')]
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            pctx_ = tensor.dot(pctx_, tparams[_p(prefix, 'W_att_%d' % lidx)]) + \
                tparams[_p(prefix, 'b_att_%d' % lidx)]
            # note to self: this used to be options['n_layers_att'] - 1,
            # so no extra non-linearity if n_layers_att < 3
            if lidx < options['n_layers_att']:
                pctx_ = tanh(pctx_)

    # projected x
    # state_below is timesteps*num samples by d in training
    # (TODO change to notation of paper); this is n * d during sampling
    state_below = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]

    # additional parameters for stochastic hard attention
    if options['attn_type'] == 'stochastic':
        # temperature for softmax
        temperature = options.get("temperature", 1)
        # [see (Section 4.1): Stochastic "Hard" Attention]
        semi_sampling_p = options.get("semi_sampling_p", 0.5)
        temperature_c = theano.shared(numpy.float32(temperature), name='temperature_c')
        h_sampling_mask = trng.binomial((1,), p=semi_sampling_p, n=1,
                                        dtype=theano.config.floatX).sum()

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist),
        ct_- (context), pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network"]
        pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
        pctx_ = pctx_ + pstate_[:, None, :]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape
        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
            alpha_sample = alpha  # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(
                temperature_c * alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha, dtype=theano.config.floatX) \
                    + (1. - h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(
                        tensor.eq(tensor.arange(alpha_shp[1])[None, :],
                                  tensor.argmax(alpha, axis=1, keepdims=True)),
                        theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:, :, None]).sum(1)  # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) +
                                       tparams[_p(prefix, 'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:, None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
        return rval

    if options['use_dropout_lstm']:
        if options['selector']:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, sel_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        else:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        dp_shape = state_below.shape
        if one_step:
            dp_mask = tensor.switch(use_noise,
                                    trng.binomial((dp_shape[0], 3 * dim),
                                                  p=0.5, n=1, dtype=state_below.dtype),
                                    tensor.alloc(0.5, dp_shape[0], 3 * dim))
        else:
            dp_mask = tensor.switch(use_noise,
                                    trng.binomial((dp_shape[0], dp_shape[1], 3 * dim),
                                                  p=0.5, n=1, dtype=state_below.dtype),
                                    tensor.alloc(0.5, dp_shape[0], dp_shape[1], 3 * dim))
    else:
        if options['selector']:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, sel_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_)
        else:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_)

    if one_step:
        if options['use_dropout_lstm']:
            if options['selector']:
                rval = _step0(mask, state_below, dp_mask, init_state, init_memory,
                              None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, dp_mask, init_state, init_memory,
                              None, None, None, pctx_)
        else:
            if options['selector']:
                rval = _step0(mask, state_below, init_state, init_memory,
                              None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, init_state, init_memory,
                              None, None, None, pctx_)
        return rval
    else:
        seqs = [mask, state_below]
        if options['use_dropout_lstm']:
            seqs += [dp_mask]
        outputs_info = [init_state,
                        init_memory,
                        tensor.alloc(0., n_samples, pctx_.shape[1]),
                        tensor.alloc(0., n_samples, pctx_.shape[1]),
                        tensor.alloc(0., n_samples, context.shape[2])]
        if options['selector']:
            outputs_info += [tensor.alloc(0., n_samples)]
        outputs_info += [None, None, None, None, None, None, None] + \
            [None]  # *options['n_layers_att']
        rval, updates = theano.scan(_step0,
                                    sequences=seqs,
                                    outputs_info=outputs_info,
                                    non_sequences=[pctx_],
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=False)
        return rval, updates
def init_params(options):
    params = OrderedDict()

    if not use_conv:
        params = get_layer('ff')[0](options, params, prefix='layer_1',
                                    nin=INPUT_SIZE, nout=args.dims[0], ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_2',
                                    nin=args.dims[0], nout=args.dims[0], ortho=False)

    dilated_conv = False
    if dilated_conv:
        bn = True
        filter_size = 5

        c1 = AtrousConvolution2D(128, filter_size, filter_size,
                                 atrous_rate=(1, 1), border_mode='same')
        c1.build((100, 3, 32, 32))
        qw = c1.get_weights()
        params[_p('c1', 'w')] = qw[0]
        params[_p('c1', 'b')] = qw[1]
        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c1_bn')

        c2 = AtrousConvolution2D(128, filter_size, filter_size,
                                 atrous_rate=(2, 2), border_mode='same')
        c2.build((100, 3, 32, 128))
        qw = c2.get_weights()
        params[_p('c2', 'w')] = qw[0]
        params[_p('c2', 'b')] = qw[1]
        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c2_bn')

        c3 = AtrousConvolution2D(128, filter_size, filter_size,
                                 atrous_rate=(4, 4), border_mode='same')
        c3.build((100, 3, 32, 128))
        qw = c3.get_weights()
        params[_p('c3', 'w')] = qw[0]
        params[_p('c3', 'b')] = qw[1]
        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c3_bn')

        c4_mu = AtrousConvolution2D(128, filter_size, filter_size,
                                    atrous_rate=(4, 4), border_mode='same')
        c4_mu.build((100, 3, 32, 128))
        qw = c4_mu.get_weights()
        params[_p('c4_mu', 'w')] = qw[0]
        params[_p('c4_mu', 'b')] = qw[1]
        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c4_mu_bn')

        c5_mu = AtrousConvolution2D(128, filter_size, filter_size,
                                    atrous_rate=(2, 2), border_mode='same')
        c5_mu.build((100, 3, 32, 128))
        qw = c5_mu.get_weights()
        params[_p('c5_mu', 'w')] = qw[0]
        params[_p('c5_mu', 'b')] = qw[1]
        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c5_mu_bn')

        c6_mu = AtrousConvolution2D(32, filter_size, filter_size,
                                    atrous_rate=(1, 1), border_mode='same')
        c6_mu.build((100, 3, 32, 128))
        qw = c6_mu.get_weights()
        params[_p('c6_mu', 'w')] = qw[0]
        params[_p('c6_mu', 'b')] = qw[1]

        c4_s = AtrousConvolution2D(128, filter_size, filter_size,
                                   atrous_rate=(4, 4), border_mode='same')
        c4_s.build((100, 3, 32, 128))
        qw = c4_s.get_weights()
        params[_p('c4_s', 'w')] = qw[0]
        params[_p('c4_s', 'b')] = qw[1]
        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c4_s_bn')

        c5_s = AtrousConvolution2D(128, filter_size, filter_size,
                                   atrous_rate=(2, 2), border_mode='same')
        c5_s.build((100, 3, 32, 128))
        qw = c5_s.get_weights()
        params[_p('c5_s', 'w')] = qw[0]
        params[_p('c5_s', 'b')] = qw[1]
        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c5_s_bn')

        c6_s = AtrousConvolution2D(32, filter_size, filter_size,
                                   atrous_rate=(1, 1), border_mode='same')
        c6_s.build((100, 3, 32, 128))
        qw = c6_s.get_weights()
        params[_p('c6_s', 'w')] = qw[0]
        params[_p('c6_s', 'b')] = qw[1]

    if use_conv:
        bn = True
        params = ConvLayer(3, 64, 5, 2, params=params, prefix='conv_1', bn=bn)
        params = ConvLayer(64, 128, 5, 2, params=params, prefix='conv_2', bn=bn)
        params = ConvLayer(128, 256, 5, 2, params=params, prefix='conv_3', bn=bn)

        params = get_layer('ff')[0](options, params, prefix='layer_1',
                                    nin=4 * 4 * 256, nout=2048, ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_2',
                                    nin=2048, nout=2048, ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_3',
                                    nin=2048, nout=2048, ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_4',
                                    nin=2048, nout=2048, ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_5',
                                    nin=2048, nout=4 * 4 * 256, ortho=False)

        params = ConvLayer(256, 128, 5, -2, params=params, prefix='conv_4_mu', bn=bn)
        params = ConvLayer(128, 64, 5, -2, params=params, prefix='conv_5_mu', bn=bn)
        params = ConvLayer(64, 3, 5, -2, params=params, prefix='conv_6_mu')
        params = ConvLayer(256, 128, 5, -2, params=params, prefix='conv_4_s', bn=bn)
        params = ConvLayer(128, 64, 5, -2, params=params, prefix='conv_5_s', bn=bn)
        params = ConvLayer(64, 3, 5, -2, params=params, prefix='conv_6_s')
    else:
        # TODO: Ideally, only in the output layer, flag=True should be set.
        if len(args.dims) == 1:
            params = get_layer('ff')[0](options, params, prefix='mu_0',
                                        nin=args.dims[0], nout=INPUT_SIZE,
                                        ortho=False, flag=True)
            if args.noise == 'gaussian':
                params = get_layer('ff')[0](options, params, prefix='sigma_0',
                                            nin=args.dims[0], nout=INPUT_SIZE,
                                            ortho=False)

        for i in range(len(args.dims) - 1):
            params = get_layer('ff')[0](options, params, prefix='mu_' + str(i),
                                        nin=args.dims[i], nout=args.dims[i + 1],
                                        ortho=False)
            if args.noise == 'gaussian':
                params = get_layer('ff')[0](options, params, prefix='sigma_' + str(i),
                                            nin=args.dims[i], nout=args.dims[i + 1],
                                            ortho=False, flag=True)

        if len(args.dims) > 1:
            params = get_layer('ff')[0](options, params, prefix='mu_' + str(i + 1),
                                        nin=args.dims[i + 1], nout=INPUT_SIZE,
                                        ortho=False, flag=True)
            if args.noise == 'gaussian':
                params = get_layer('ff')[0](options, params,
                                            prefix='sigma_' + str(i + 1),
                                            nin=args.dims[i + 1], nout=INPUT_SIZE,
                                            ortho=False)

    return params
def fflayer(tparams, state_below, options, prefix='rconv',
            activ='lambda x: tensor.tanh(x)', **kwargs):
    return eval(activ)(tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                       tparams[_p(prefix, 'b')])
def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
    """ Each variable is one time slice of the LSTM
    m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
    a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist),
    ct_- (context), pctx_ (projected context), dp_/dp_att_ (dropout masks)
    """
    # attention computation
    # [described in equations (4), (5), (6) in
    # section "3.1.2 Decoder: Long Short Term Memory Network"]
    pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
    pctx_ = pctx_ + pstate_[:, None, :]
    pctx_list = []
    pctx_list.append(pctx_)
    pctx_ = tanh(pctx_)
    alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(prefix, 'c_tt')]
    alpha_pre = alpha
    alpha_shp = alpha.shape
    if options['attn_type'] == 'deterministic':
        alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
        ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
        alpha_sample = alpha  # you can return something else reasonable here to debug
    else:
        alpha = tensor.nnet.softmax(
            temperature_c * alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
        # TODO return alpha_sample
        if sampling:
            alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha, dtype=theano.config.floatX) \
                + (1. - h_sampling_mask) * alpha
        else:
            if argmax:
                alpha_sample = tensor.cast(
                    tensor.eq(tensor.arange(alpha_shp[1])[None, :],
                              tensor.argmax(alpha, axis=1, keepdims=True)),
                    theano.config.floatX)
            else:
                alpha_sample = alpha
        ctx_ = (context * alpha_sample[:, :, None]).sum(1)  # current context

    if options['selector']:
        sel_ = tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) +
                                   tparams[_p(prefix, 'b_sel')])
        sel_ = sel_.reshape([sel_.shape[0]])
        ctx_ = sel_[:, None] * ctx_

    preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
    preact += x_
    preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

    # Recover the activations to the lstm gates
    # [equation (1)]
    i = _slice(preact, 0, dim)
    f = _slice(preact, 1, dim)
    o = _slice(preact, 2, dim)
    if options['use_dropout_lstm']:
        i = i * _slice(dp_, 0, dim)
        f = f * _slice(dp_, 1, dim)
        o = o * _slice(dp_, 2, dim)
    i = tensor.nnet.sigmoid(i)
    f = tensor.nnet.sigmoid(f)
    o = tensor.nnet.sigmoid(o)
    c = tensor.tanh(_slice(preact, 3, dim))

    # compute the new memory/hidden state
    # if the mask is 0, just copy the previous state
    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_

    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    rval = [h, c, alpha, alpha_sample, ctx_]
    if options['selector']:
        rval += [sel_]
    rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
    return rval
def param_init_lstm_cond(options, params, prefix='lstm_cond',
                         nin=None, dim=None, dimctx=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']

    # input to LSTM, similar to the above, we stack the matrices for
    # compactness, do one dot product, and use the slice function below
    # to get the activations for each "gate"
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # LSTM to LSTM
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # bias to LSTM
    params[_p(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx, dim * 4)
    params[_p(prefix, 'Wc')] = Wc

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx, ortho=False)
    params[_p(prefix, 'Wc_att')] = Wc_att

    # attention: LSTM -> hidden
    Wd_att = norm_weight(dim, dimctx)
    params[_p(prefix, 'Wd_att')] = Wd_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx,)).astype('float32')
    params[_p(prefix, 'b_att')] = b_att

    # optional "deep" attention
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            params[_p(prefix, 'W_att_%d' % lidx)] = ortho_weight(dimctx)
            params[_p(prefix, 'b_att_%d' % lidx)] = numpy.zeros((dimctx,)).astype('float32')

    # attention: hidden -> scalar score
    U_att = norm_weight(dimctx, 1)
    params[_p(prefix, 'U_att')] = U_att
    c_att = numpy.zeros((1,)).astype('float32')
    params[_p(prefix, 'c_tt')] = c_att

    if options['selector']:
        # attention: selector
        W_sel = norm_weight(dim, 1)
        params[_p(prefix, 'W_sel')] = W_sel
        b_sel = numpy.float32(0.)
        params[_p(prefix, 'b_sel')] = b_sel

    return params
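# A hedged usage sketch for the conditional, attentive LSTM (an assumption,
# not repo code): it presumes param_init_lstm_cond and lstm_cond_layer above,
# plus the repo's _p, norm_weight, ortho_weight and tanh helpers in scope;
# every shape and option value below is hypothetical.
import theano
import theano.tensor as tensor
from collections import OrderedDict

options = {'dim': 512, 'n_layers_att': 1, 'attn_type': 'deterministic',
           'selector': False, 'use_dropout_lstm': False}
params = param_init_lstm_cond(options, OrderedDict(), prefix='decoder',
                              nin=128, dim=512, dimctx=512)
tparams = OrderedDict((k, theano.shared(v, name=k)) for k, v in params.items())

emb = tensor.tensor3('emb', dtype='float32')   # (n_timesteps, n_samples, 128)
ctx = tensor.tensor3('ctx', dtype='float32')   # (n_samples, n_annotations, 512)
proj, updates = lstm_cond_layer(tparams, emb, options, prefix='decoder',
                                context=ctx, one_step=False)
h, c, alpha = proj[0], proj[1], proj[2]        # states, memories, attention maps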