def param_init_encoder(options, params, prefix='lstm_encoder'):
    n_x = options['n_x']
    n_h = options['n_h']

    W = np.concatenate([uniform_weight(n_x, n_h),
                        uniform_weight(n_x, n_h),
                        uniform_weight(n_x, n_h),
                        uniform_weight(n_x, n_h)], axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(n_h),
                        ortho_weight(n_h),
                        ortho_weight(n_h),
                        ortho_weight(n_h)], axis=1)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = zero_bias(4 * n_h)
    # It is observed that setting a high initial forget gate bias for LSTMs can
    # give slightly better results (Le et al., 2015). Hence, the initial forget
    # gate bias is set to 3.
    params[_p(prefix, 'b')][n_h:2*n_h] = 3 * np.ones((n_h,)).astype(theano.config.floatX)

    return params
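# The snippets in this section rely on a handful of small helpers (_p,
# ortho_weight, uniform_weight, zero_bias, numpy_floatX) that are never shown.
# Below is a minimal sketch of plausible implementations in the style of the
# standard Theano LSTM/NMT tutorials; the exact scales and signatures are
# assumptions (some snippets use richer variants, e.g.
# ortho_weight(rng=..., shape=..., name=...)), not the original definitions.
import numpy as np
import theano
from theano import config


def _p(prefix, name):
    # join a layer prefix and a parameter name, e.g. _p('lstm', 'W') -> 'lstm_W'
    return '%s_%s' % (prefix, name)


def numpy_floatX(data):
    return np.asarray(data, dtype=config.floatX)


def ortho_weight(ndim):
    # random orthogonal square matrix via SVD of a Gaussian matrix
    W = np.random.randn(ndim, ndim)
    u, s, v = np.linalg.svd(W)
    return u.astype(config.floatX)


def uniform_weight(nin, nout, scale=0.1):
    # uniform initialization in [-scale, scale]
    return np.random.uniform(low=-scale, high=scale,
                             size=(nin, nout)).astype(config.floatX)


def zero_bias(ndim):
    return np.zeros((ndim,)).astype(config.floatX)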
def gru_layer(tparams, state_below, init_state, options, prefix='gru',
              mask=None, **kwargs):
    """
    Feedforward pass through GRU
    """
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    dim = tparams[_p(prefix, 'Ux')].shape[1]

    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
        tparams[_p(prefix, 'bx')]

    U = tparams[_p(prefix, 'U')]
    Ux = tparams[_p(prefix, 'Ux')]

    def _step_slice(m_, x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        h = tensor.tanh(preactx)

        h = u * h_ + (1. - u) * h
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h

    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[init_state],
                                non_sequences=[tparams[_p(prefix, 'U')],
                                               tparams[_p(prefix, 'Ux')]],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps,
                                profile=False,
                                strict=True)
    rval = [rval]
    return rval
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None,
                       ortho=True):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)
    params[_p(prefix, 'b')] = np.zeros((nout,)).astype('float32')
    return params
def fflayer(tparams, state_below, options, prefix='rconv',
            activ='lambda x: tensor.tanh(x)', **kwargs):
    """
    Feedforward pass
    """
    return eval(activ)(tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                       tparams[_p(prefix, 'b')])
def batch_norm(tparams, input, options, prefix='cnn'):
    """
    Batch normalization over the feature axis.

    input: n_sample * n_feature (e.g. 64*20)
    """
    input_hat = (input - input.mean(0)) / (input.std(0) + 1.0 / options['L'])
    input_ = input_hat * tparams[_p(prefix, 'gamma')] + tparams[_p(prefix, 'beta')]

    return input_
def encoder(tparams, state_below, mask, seq_output=False, prefix='lstm_encoder'):
    """
    state_below: size of n_steps * n_samples * n_x
    """
    n_steps = state_below.shape[0]
    n_samples = state_below.shape[1]

    n_h = tparams[_p(prefix, 'U')].shape[0]

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]

    def _step(m_, x_, h_, c_, U):
        preact = tensor.dot(h_, U)
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        f = tensor.nnet.sigmoid(_slice(preact, 1, n_h))
        o = tensor.nnet.sigmoid(_slice(preact, 2, n_h))
        c = tensor.tanh(_slice(preact, 3, n_h))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    seqs = [mask, state_below_]
    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples, n_h),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples, n_h)],
                                non_sequences=[tparams[_p(prefix, 'U')]],
                                name=_p(prefix, '_layers'),
                                n_steps=n_steps,
                                strict=True)

    h_rval = rval[0]
    if seq_output:
        return h_rval
    else:
        # size of n_samples * n_h
        return h_rval[-1]
def _init_params(self):
    shape_io = (self.n_in, self.n_out)
    if self.orth:
        if self.n_in != self.n_out:
            raise ValueError('n_in != n_out when require orth in FeedForward')
        self.W = ortho_weight(rng=self.rng,
                              shape=shape_io,
                              name=_p(self.pname, 'W'))
    else:
        self.W = norm_weight(rng=self.rng,
                             shape=shape_io,
                             name=_p(self.pname, 'W'))
    self.b = constant_weight(shape=(self.n_out, ),
                             name=_p(self.pname, 'b'))
    self.params = [self.W, self.b]
def mlp_layer_linear(tparams, layer1_input, prefix='mlp_layer'):
    """
    layer1_input: n_sample * n_feature                          64*20
    input_shape: (num of hiddens, number of input features)     200*20
    pred_shape: (num of labels, number of hiddens)              2*200
    y_recon: n_label * n_sample                                 2*64
    """
    hidden_2_out = tensor.nnet.sigmoid(
        tensor.dot(layer1_input, tparams[_p(prefix, 'W1')].T) +
        tparams[_p(prefix, 'b1')])  # 64*200
    y_recons = tensor.dot(hidden_2_out, tparams[_p(prefix, 'V1')].T) + \
        tparams[_p(prefix, 'c1')]
    # y_recons = tensor.tanh(y_recons) * 10  # avoid numerical issues/label smoothing
    # y_recons = tensor.nnet.softmax(y_recons)  # 64*2

    return y_recons
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None,
                       ortho=True):
    """
    Affine transformation + point-wise nonlinearity
    """
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, ortho=ortho)
    params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32')

    return params
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None,
                       ortho=True):
    """
    Affine transformation + point-wise nonlinearity
    """
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = xavier_weight(nin, nout)
    params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32')

    return params
def decoder_layer(tparams, state_below, prefix='decoder_lstm'):
    """
    state_below: size of n_steps * n_samples * n_x
    """
    nsteps = state_below.shape[0]
    n_h = tparams[_p(prefix, 'U')].shape[0]

    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(x_, h_, c_, U):
        preact = tensor.dot(h_, U)
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        f = tensor.nnet.sigmoid(_slice(preact, 1, n_h))
        o = tensor.nnet.sigmoid(_slice(preact, 2, n_h))
        c = tensor.tanh(_slice(preact, 3, n_h))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)

        return h, c

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]

    seqs = [state_below_]
    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples, n_h),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples, n_h)],
                                non_sequences=[tparams[_p(prefix, 'U')]],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps,
                                strict=True)

    h_rval = rval[0]
    return h_rval
def param_init_batch_norm(input_shape, params, prefix='cnn'):
    """
    input_shape: (num of hiddens, number of input features)
    pred_shape: (num of labels, number of hiddens)
    """
    beta = np.ones((input_shape[1], ), dtype=theano.config.floatX) * 0.01
    gamma = np.ones((input_shape[1], ), dtype=theano.config.floatX) * 0.1
    params[_p(prefix, 'beta')] = beta
    params[_p(prefix, 'gamma')] = gamma

    return params
def _init_params(self):
    shape_xh = (self.n_in * 3, self.n_hids)
    shape_hh = (self.n_hids * 3, self.n_hids)

    self.W_x = norm_weight(shape=shape_xh, name=_p(self.pname, 'W_x'))
    self.b = constant_weight(shape=(self.n_hids * 3, ), name=_p(self.pname, 'b'))
    self.W_h = ortho_weight(shape=shape_hh, name=_p(self.pname, 'W_h'))

    self.params = [self.W_x, self.W_h, self.b]

    self.GRU_op = mkl_gru.GRU(hid=self.n_hids, return_sequences=True,
                              max_len=self.max_len)
    self.h_init_state = numpy.zeros((80, 1000), numpy.float64)
def param_init_fflayer(self, options, params, prefix='ff', nin=None, nout=None):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01)
    params[_p(prefix, 'b')] = numpy.zeros((nout, )).astype('float32')
    return params
def _init_params(self):
    shape_io = (self.n_in_0, self.n_out)
    if self.orth:
        if self.n_in_0 != self.n_out:
            raise ValueError('n_in != n_out when require orth in FeedForward')
        self.W = ortho_weight(rng=self.rng,
                              shape=shape_io,
                              name=_p(self.pname, 'W'))
    else:
        self.W = norm_weight(rng=self.rng,
                             shape=shape_io,
                             name=_p(self.pname, 'W'))
    self.params = [self.W]

    self.ff = FeedForward(self.n_in_1,
                          self.n_out,
                          orth=self.orth,
                          rng=self.rng,
                          name=_p(self.pname, 'FF_W'))
    self.params.extend(self.ff.params)
def param_init_encoder(filter_shape, params, prefix='cnn_encoder'):
    """
    filter_shape: (number of filters, num input feature maps, filter height,
                   filter width)
    image_shape: (batch_size, num input feature maps, image height, image width)
    """
    W = np.asarray(rng.uniform(low=-0.01, high=0.01, size=filter_shape),
                   dtype=theano.config.floatX)
    b = np.zeros((filter_shape[0],), dtype=theano.config.floatX)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = b

    return params
def apply_pyramid(self, state_below, mask_below=None, init_state=None, context=None):
    '''
    state_below: shape=[seq_len, batch, n_in]
    init_state: shape=[batch, seq_len, hid]
    '''
    n_steps = state_below.shape[0]
    if state_below.ndim == 3:
        batch_size = state_below.shape[1]
    else:
        batch_size = 1
        state_below = state_below.reshape((n_steps, batch_size, state_below.shape[1]))

    if mask_below is None:
        mask_below = T.alloc(numpy.float32(1.), n_steps, 1)

    if self.with_context:
        assert context

        if init_state is None:
            init_state = T.tanh(T.dot(context, self.W_c_init))

        c_z = T.dot(context, self.W_cz)
        c_r = T.dot(context, self.W_cr)
        c_h = T.dot(context, self.W_ch)
        non_sequences = [c_z, c_r, c_h]
        rval, updates = theano.scan(self._step_context,
                                    sequences=[state_below, mask_below],
                                    non_sequences=non_sequences,
                                    outputs_info=[init_state],
                                    name=_p(self.pname, 'layers'),
                                    n_steps=n_steps)
    else:
        if init_state is None:
            init_state = T.alloc(numpy.float32(0.), batch_size, n_steps, self.n_hids)

        state_below_xh = T.dot(state_below, self.W_xh) + self.b_h
        state_below_xzr = T.dot(state_below, self.W_xzr) + self.b_zr
        step_idx = T.arange(n_steps)
        sequences = [state_below_xh, state_below_xzr, mask_below, step_idx]
        outputs_info = [init_state]
        non_sequences = []

        rval, updates = theano.scan(self._pyramid_step,
                                    sequences=sequences,
                                    outputs_info=outputs_info,
                                    non_sequences=non_sequences,
                                    name=_p(self.pname, 'layers'),
                                    n_steps=n_steps)

    self.output = rval
    return self.output
def param_init_decoder(options, params, prefix='decoder_vanilla'):
    n_x = options['n_x']
    n_h = options['n_h']

    W = uniform_weight(n_x, n_h)
    params[_p(prefix, 'W')] = W

    U = ortho_weight(n_h)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = zero_bias(n_h)

    return params
def param_init_action_response_layer(options, params, constraints, prefix='ar',
                                     nin=0, rng=None, unif_range=0.2, level=0,
                                     **kwargs):
    '''
    Action response layers.
    '''
    rng = init_rng(rng)
    n_features = options['hidden_units'][-1]

    if options['shared_ld']:
        params, constraints = init_level_dist(params, unif_range, nin, rng,
                                              constraints)
    else:
        if level > 0:
            params[_p(prefix, 'ld')] = floatx(rng.uniform(size=(level),
                                                          low=0.1, high=0.9))
            params[_p(prefix, 'ld')] /= params[_p(prefix, 'ld')].sum()
            constraints['simplex'] = constraints.get('simplex', []) + [_p(prefix, 'ld')]

    initial_Wf = numpy.zeros(n_features)
    initial_Wf += floatx(rng.uniform(size=(n_features), low=0., high=unif_range))
    initial_Wf /= initial_Wf.sum()

    if level == 0:
        params[_p(prefix, 'Wf')] = floatx(initial_Wf)
        constraints['simplex'] = constraints.get('simplex', []) + [_p(prefix, 'Wf')]

    if level > 0:
        params[_p(prefix, 'W_h')] = floatx(rng.uniform(size=(1 + options['hidden_units'][-1]),
                                                       low=-0.01, high=0.01))

    if level > 0:
        params[_p(prefix, 'lam')] = floatx(1.0)

    return params, constraints
def deconv_depool2(layer0_input, tparams, options, prefix):
    if prefix == 'd':
        depool_out = depool_repeat(layer0_input, options['e_pool_size'])
    elif prefix == 'd2':
        depool_out = depool_repeat(layer0_input, options['e2_pool_size'])

    s = int(np.floor(options[_p(prefix, 'Nt')] / 2.))
    h = int((2 * options[_p(prefix, 'K')] - 2) / 2.)
    deconv_out = conv.conv2d(input=depool_out.dimshuffle(0, 'x', 1, 2),
                             filters=tparams[_p(prefix, 'W')],
                             filter_shape=options[_p(prefix, 'filter_shape')],
                             border_mode='full')[:, :, h, s - 1:-s]
    doutput = (deconv_out + tparams[_p(prefix, 'bias')].dimshuffle('x', 0, 'x'))
    # .reshape((deconv_out.shape[0], options['C'], options['T']))

    return doutput
def param_init_lstm(self, params, nin, dim, prefix='lstm'):
    assert prefix is not None
    # Stack the weight matrices for faster dot products
    W = np.concatenate([norm_weight(nin, dim),
                        norm_weight(nin, dim),
                        norm_weight(nin, dim),
                        norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W  # to_lstm_W: (512, 2048)

    U = np.concatenate([ortho_weight(dim),
                        ortho_weight(dim),
                        ortho_weight(dim),
                        ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U  # to_lstm_U: (512, 2048)

    params[_p(prefix, 'b')] = np.zeros((4*dim,)).astype('float32')  # to_lstm_b: (2048,)

    return params
def encoder(tparams, layer0_input, filter_shape, pool_size, prefix='cnn_encoder'):
    """
    filter_shape: (number of filters, num input feature maps, filter height,
                   filter width)
    image_shape: (batch_size, num input feature maps, image height, image width)
    """
    conv_out = conv.conv2d(input=layer0_input,
                           filters=tparams[_p(prefix, 'W')],
                           filter_shape=filter_shape)
    conv_out_tanh = tensor.tanh(conv_out +
                                tparams[_p(prefix, 'b')].dimshuffle('x', 0, 'x', 'x'))
    output = pool.pool_2d(input=conv_out_tanh, ds=pool_size, ignore_border=True)

    return output.flatten(2)
def param_init(self):
    if not self.initialized:
        # call object specific param_init
        self.__param_init__()
        # set object params with theano shared
        for (k, v) in self.params.iteritems():
            setattr(self, k, theano.shared(v,
                                           name=_p(self.get_prefix(), k),
                                           borrow=True))
        # fill params with the theano shared
        self._params = OrderedDict([(_p(self.get_prefix(), k), getattr(self, k))
                                    for (k, v) in self.params.iteritems()
                                    if self.params])
        self.initialized = True
    return self.params
def _init_params(self):
    shape_i0o = (self.n_in_0, self.n_out)
    shape_i1o = (self.n_in_1, self.n_out)
    if self.orth:
        if self.n_in_0 != self.n_out or self.n_in_1 != self.n_out:
            raise ValueError('n_in != n_out when require orth in FeedForward')
        self.W0 = ortho_weight(rng=self.rng,
                               shape=shape_i0o,
                               name=_p(self.pname, 'W0'))
        self.W1 = ortho_weight(rng=self.rng,
                               shape=shape_i1o,
                               name=_p(self.pname, 'W1'))
    else:
        self.W0 = norm_weight(rng=self.rng,
                              shape=shape_i0o,
                              name=_p(self.pname, 'W0'))
        self.W1 = norm_weight(rng=self.rng,
                              shape=shape_i1o,
                              name=_p(self.pname, 'W1'))
    self.b = constant_weight(shape=(self.n_out, ), name=_p(self.pname, 'b'))
    self.params = [self.W0, self.W1, self.b]
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj']
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples, dim_proj),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples, dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    # outputs_info include h_ and c_
    # return only hidden states, so return rval[0]
    return rval[0]
def action_response_layer(tparams, features, options, payoff=None, prefix='ar',
                          opposition=None, level=0, **kwargs):
    """
    action_response_layer: tensor3, (tensor3) -> matrix
                           features, (opposition) -> ar_layer

    Tensor dims:
        features: iter, action_payoff, feature
        opposition: iter, level, prob of action
        output: iter, prob of action

    Probability of an action given features and beliefs about opposition.
    """
    n, f, i = features.shape

    # Weights on opposition players
    if level == 0:
        w_feat = tparams[_p(prefix, 'Wf')]
        weighted_features = tensor.sum(features * w_feat.dimshuffle('x', 0, 'x'),
                                       axis=1)
        ar = weighted_features
        return ar, weighted_features, None
    else:
        weighted_features = None
        lam = tparams[_p(prefix, 'lam')]
        if options['shared_ld']:
            level_dist = tparams['ld']
            ld = level_dist
            ld += floatx(1e-32)  # avoid divide by zero
            ld = ld[0:level]
            ld /= ld.sum()
        else:
            ld = tparams[_p(prefix, 'ld')]
            ld += floatx(1e-32)
            ld /= ld.sum()

        # U * AR * ld (where * is matrix product)
        weighting = opposition * ld.dimshuffle('x', 0, 'x')
        prob_a = tensor.sum(weighting, axis=1)

        payoff = payoff * tparams[_p(prefix, 'W_h')].dimshuffle('x', 0, 'x', 'x')
        payoff = tensor.sum(payoff, axis=1)

        br = tensor.sum(payoff * prob_a.dimshuffle(0, 'x', 1), axis=2)
        out = br

        # remove weighted_features, br when done with visualisation
        return tensor.nnet.softmax(out * lam), weighted_features, br
def get_all_params(self, prev_prefix=None):
    '''Return an OrderedDict with all the parameters.

    Return an OrderedDict with the parameters of self and of every child,
    renamed so that they contain the full inclusion path in their name.
    Corresponds to:

        for k, v in self.params.iteritems():
            part = [(k, v)]
        for child in unroll(self.children):
            if child:
                for k, v in child.get_all_params().iteritems():
                    part += [(_p(self.get_prefix(), k), v)]
        return OrderedDict(part)
    '''
    if prev_prefix:
        self.baseprefix = _p(prev_prefix, self.baseprefix)
    if self.children == []:
        return self.param_init()
    else:
        return OrderedDict(
            [(k, v) for k, v in self.param_init().iteritems()] +
            [(k, v) for child in unroll(self.children) if child
             for k, v in child.get_all_params(self.get_prefix()).iteritems()])
def __init__(self, mkl, n_in, n_hids, n_cdim, maxout_part=2, name='rnn_decoder',
             with_attention=True, with_coverage=False, coverage_dim=1,
             coverage_type='linguistic', max_fertility=2, with_context_gate=False):
    self.n_in = n_in
    self.n_hids = n_hids
    self.n_cdim = n_cdim
    self.maxout_part = maxout_part
    self.pname = name
    self.with_attention = with_attention
    self.with_coverage = with_coverage
    self.coverage_dim = coverage_dim
    assert coverage_type in ['linguistic', 'neural'], \
        'Coverage type must be either linguistic or neural'
    self.coverage_type = coverage_type
    self.max_fertility = max_fertility
    self.with_context_gate = with_context_gate
    self.mkl = mkl
    self._init_params()

    # mkl decoder
    self.attention_ = Attention_(self.n_hids, name=_p(name, '_attention'))
    self.GRU_op = mkl_gru.GRU(hid=self.n_hids, return_sequences=True)
def param_init(params, nin, dim, prefix='lstm'):
    assert prefix is not None
    # Stack the weight matrices for faster dot products
    W = np.concatenate([norm_weight(nin, dim),
                        norm_weight(nin, dim),
                        norm_weight(nin, dim),
                        norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(dim),
                        ortho_weight(dim),
                        ortho_weight(dim),
                        ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = np.zeros((4 * dim,)).astype('float32')

    return params
def apply(self, state_below, mask_below=None, init_state=None, context=None):
    if K.ndim(state_below) == 2:
        state_below = K.expand_dims(state_below, 1)

    if mask_below is None:
        mask_below = K.ones_like(K.sum(state_below, axis=2, keepdims=True))

    if init_state is None:
        # nb_samples, n_hids
        init_state = K.repeat_elements(
            K.expand_dims(K.zeros_like(K.sum(state_below, axis=[0, 2]))),
            self.n_hids, axis=1)
        print('init state ', K.ndim(init_state))

    state_below_xh = K.dot(state_below, self.W_xh)
    state_below_xz = K.dot(state_below, self.W_xz)
    state_below_xr = K.dot(state_below, self.W_xr)
    sequences = [state_below_xh, state_below_xz, state_below_xr, mask_below]

    if K._BACKEND == 'theano':
        fn = lambda x_h, x_z, x_r, x_m, h_tm1: self._step(x_h, x_z, x_r, x_m, h_tm1)
    else:
        fn = lambda h_tm1, (x_h, x_z, x_r, x_m): self._step(x_h, x_z, x_r, x_m, h_tm1)

    rval = K.scan(fn,
                  sequences=sequences,
                  outputs_initials=init_state,
                  name=_p(self.pname, 'layers'))

    self.output = rval
    return self.output
def _lstm(m_, x_, h_, c_, prefix='lstm_en'):
    preact = tensor.dot(x_, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
    preact += tensor.dot(h_, tparams[_p(prefix, 'U')])

    i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
    f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
    o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
    c = tensor.tanh(_slice(preact, 3, options['dim_proj']))

    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_

    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    return h, c
def deconv_depool(layer0_input, tparams, options, prefix):
    if prefix == 'd':
        depool_out = depool_repeat(layer0_input, options['e_pool_size'])
    elif prefix == 'd2':
        depool_out = depool_repeat(layer0_input, options['e2_pool_size'])

    s = int(np.floor(options[_p(prefix, 'Nt')] / 2.))
    _W = get_filter(tparams, options, prefix).astype(theano.config.floatX)
    deconv_out = conv.conv2d(input=depool_out.dimshuffle(0, 1, 'x', 2),
                             filters=_W,
                             filter_shape=options[_p(prefix, 'filter_shape')],
                             border_mode='full')[:, :, :, s - 1:-s]
    doutput = (deconv_out +
               tparams[_p(prefix, 'bias')].dimshuffle('x', 0, 'x', 'x')).reshape(
                   (deconv_out.shape[0], options['C'], options['T']))

    return doutput
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n+1) * dim]
        return _x[:, n * dim:(n+1) * dim]

    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj']
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples, dim_proj),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples, dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0][-1]
def decoder(tparams, state_below, z, mask=None, prefix='decoder'):
    """
    state_below: size of n_steps * n_samples * n_x
    z: size of n_samples * n_z
    """
    n_steps = state_below.shape[0]
    n_samples = state_below.shape[1]

    n_h = tparams[_p(prefix, 'U')].shape[0]

    # n_samples * n_h
    state_belowx0 = tensor.dot(z, tparams[_p(prefix, 'C0')]) + \
        tparams[_p(prefix, 'b0')]
    h0 = tensor.tanh(state_belowx0)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    # n_steps * n_samples * n_h
    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tensor.dot(z, tparams[_p(prefix, 'C')]) + tparams[_p(prefix, 'b')]

    def _step(m_, x_, h_, c_, U):
        preact = tensor.dot(h_, U)
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        f = tensor.nnet.sigmoid(_slice(preact, 1, n_h))
        o = tensor.nnet.sigmoid(_slice(preact, 2, n_h))
        c = tensor.tanh(_slice(preact, 3, n_h))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    seqs = [mask[:n_steps-1], state_below_[:n_steps-1]]
    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[h0, tensor.alloc(numpy_floatX(0.),
                                                               n_samples, n_h)],
                                non_sequences=[tparams[_p(prefix, 'U')]],
                                name=_p(prefix, '_layers'),
                                n_steps=n_steps-1,
                                strict=True)

    h0x = tensor.shape_padleft(h0)
    h_rval = rval[0]

    return tensor.concatenate((h0x, h_rval))
def apply(self, state_below, mask_below=None, init_state=None, context=None):
    n_steps = state_below.shape[0]
    if state_below.ndim == 3:
        batch_size = state_below.shape[1]
    else:
        batch_size = 1
        state_below = state_below.reshape((n_steps, batch_size, state_below.shape[1]))

    if mask_below is None:
        mask_below = T.alloc(numpy.float32(1.), n_steps, 1)

    if self.with_context:
        assert context

        if init_state is None:
            init_state = T.tanh(T.dot(context, self.W_c_init))

        c_z = T.dot(context, self.W_cz)
        c_r = T.dot(context, self.W_cr)
        c_h = T.dot(context, self.W_ch)
        non_sequences = [c_z, c_r, c_h]
        rval, updates = theano.scan(self._step_context,
                                    sequences=[state_below, mask_below],
                                    non_sequences=non_sequences,
                                    outputs_info=[init_state],
                                    name=_p(self.pname, 'layers'),
                                    n_steps=n_steps)
    else:
        if init_state is None:
            init_state = T.alloc(numpy.float32(0.), batch_size, self.n_hids)

        state_below_xh = T.dot(state_below, self.W_xh)
        state_below_xz = T.dot(state_below, self.W_xz)
        state_below_xr = T.dot(state_below, self.W_xr)
        sequences = [state_below_xh, state_below_xz, state_below_xr, mask_below]

        rval, updates = theano.scan(self._step,
                                    sequences=sequences,
                                    outputs_info=[init_state],
                                    name=_p(self.pname, 'layers'),
                                    n_steps=n_steps)

    self.output = rval
    return self.output
def build_encoder(tparams, options):
    x = tensor.matrix('x', dtype='int32')
    y = tensor.matrix('y', dtype='int32')

    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(), dtype='int32')].reshape(
        (x.shape[0], 1, x.shape[1], tparams['Wemb'].shape[1]))

    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams, layer0_input, filter_shape=filter_shape,
                             pool_size=pool_size, prefix=_p('cnn_encoder', i))
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input_x = tensor.concatenate(layer1_inputs, 1)

    layer0_input = tparams['Wemb'][tensor.cast(y.flatten(), dtype='int32')].reshape(
        (y.shape[0], 1, y.shape[1], tparams['Wemb'].shape[1]))

    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams, layer0_input, filter_shape=filter_shape,
                             pool_size=pool_size, prefix=_p('cnn_encoder', i))
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input_y = tensor.concatenate(layer1_inputs, 1)

    feat_x = l2norm(layer1_input_x)
    feat_y = l2norm(layer1_input_y)

    return [x, y], feat_x, feat_y
def param_init_decoder(options, params, prefix='decoder_gru'):
    n_x = options['n_x']
    n_h = options['n_h']

    W = np.concatenate([uniform_weight(n_x, n_h),
                        uniform_weight(n_x, n_h)], axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(n_h),
                        ortho_weight(n_h)], axis=1)
    params[_p(prefix, 'U')] = U
    params[_p(prefix, 'b')] = zero_bias(2 * n_h)

    Wx = uniform_weight(n_x, n_h)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(n_h)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = zero_bias(n_h)

    params[_p(prefix, 'b0')] = zero_bias(n_h)

    return params
def decoder_layer(tparams, state_below, prefix='decoder_gru'):
    """
    state_below: size of n_steps * n_x
    """
    n_steps = state_below.shape[0]
    n_h = tparams[_p(prefix, 'Ux')].shape[1]

    state_belowx0 = tparams[_p(prefix, 'b0')]
    h0vec = tensor.tanh(state_belowx0)
    h0 = h0vec.dimshuffle('x', 0)

    def _slice(_x, n, dim):
        return _x[n * dim:(n + 1) * dim]

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
        tparams[_p(prefix, 'bx')]

    def _step_slice(x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        u = tensor.nnet.sigmoid(_slice(preact, 1, n_h))

        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        h = tensor.tanh(preactx)
        h = u * h_ + (1. - u) * h

        return h

    seqs = [state_below_[:n_steps - 1], state_belowx[:n_steps - 1]]
    _step = _step_slice

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[h0vec],
                                non_sequences=[tparams[_p(prefix, 'U')],
                                               tparams[_p(prefix, 'Ux')]],
                                name=_p(prefix, '_layers'),
                                n_steps=n_steps - 1)
    # h0x = h0.dimshuffle('x', 0, 1)

    return tensor.concatenate((h0, rval))
def __init__(self, n_in, n_hids, table, mkl, name='rnn_encoder', max_len=None):
    # lookup table
    self.table = table
    # embedding dimension
    self.n_in = n_in
    # hidden state dimension
    self.n_hids = n_hids
    self.mkl = mkl
    self.params = []
    self.layers = []
    self.max_len = max_len

    if self.mkl:
        print('with mkl')
        self.forward = MKL_GRU(self.n_in, self.n_hids,
                               name=_p(name, 'forward'), max_len=max_len)
    else:
        print('with no mkl')
        self.forward = GRU(self.n_in, self.n_hids, name=_p(name, 'forward'))
    self.layers.append(self.forward)

    if self.mkl:
        self.backward = MKL_GRU(self.n_in, self.n_hids,
                                name=_p(name, 'backward'), max_len=max_len)
    else:
        self.backward = GRU(self.n_in, self.n_hids, name=_p(name, 'backward'))
    self.layers.append(self.backward)

    for layer in self.layers:
        self.params.extend(layer.params)
def encoder(tparams, state_below, mask, seq_output=False, prefix='gru_encoder'):
    """
    state_below: size of n_steps * n_samples * n_x
    """
    n_steps = state_below.shape[0]
    n_samples = state_below.shape[1]

    n_h = tparams[_p(prefix, 'Ux')].shape[1]

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
        tparams[_p(prefix, 'bx')]

    def _step(m_, x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        u = tensor.nnet.sigmoid(_slice(preact, 1, n_h))

        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        h = tensor.tanh(preactx)

        h = u * h_ + (1. - u) * h
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h

    seqs = [mask, state_below_, state_belowx]
    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples, n_h)],
                                non_sequences=[tparams[_p(prefix, 'U')],
                                               tparams[_p(prefix, 'Ux')]],
                                name=_p(prefix, '_layers'),
                                n_steps=n_steps,
                                strict=True)

    if seq_output:
        return rval
    else:
        # size of n_samples * n_h
        return rval[-1]
def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    # embedding to gates transformation weights, biases
    W = np.concatenate([norm_weight(nin, dim),
                        norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = np.zeros((2 * dim,)).astype('float32')

    # recurrent transformation weights for gates
    U = np.concatenate([ortho_weight(dim),
                        ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # embedding to hidden state proposal weights, biases
    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    params[_p(prefix, 'bx')] = np.zeros((dim,)).astype('float32')

    # recurrent transformation weights for hidden state proposal
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux

    return params
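# The layer functions in this section consume `tparams`, an OrderedDict of
# Theano shared variables, while the param_init_* functions build plain numpy
# arrays. A minimal sketch of the usual bridge between the two follows; the
# name init_tparams and the exact loop are assumptions in the style of these
# codebases, not a definition taken from the original source.
from collections import OrderedDict

import theano


def init_tparams(params):
    # wrap every numpy parameter in a Theano shared variable, keeping the keys
    tparams = OrderedDict()
    for kk, pp in params.items():
        tparams[kk] = theano.shared(params[kk], name=kk)
    return tparams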
def _init_params(self):
    shape_xh = (self.n_in, self.n_hids)
    shape_xh4 = (self.n_in, 4*self.n_hids)
    shape_hh = (self.n_hids, self.n_hids)
    shape_hh4 = (self.n_hids, 4*self.n_hids)

    self.W_pre_x = norm_weight(rng=self.rng, shape=shape_xh4,
                               name=_p(self.pname, 'W_pre_x'))
    self.W_h = multi_orth(rng=self.rng, size=shape_hh4,
                          name=_p(self.pname, 'W_h'))

    b_i = constant_weight(share=False, shape=(self.n_hids, ),
                          name=_p(self.pname, 'b_i'))
    b_f = constant_weight(share=False, value=1., shape=(self.n_hids, ),
                          name=_p(self.pname, 'b_f'))
    b_o = constant_weight(share=False, shape=(self.n_hids, ),
                          name=_p(self.pname, 'b_o'))
    b_c = constant_weight(share=False, shape=(self.n_hids, ),
                          name=_p(self.pname, 'b_c'))
    b_ifoc = numpy.concatenate([b_i, b_f, b_o, b_c], axis=0)
    self.b_pre_x = theano.shared(value=b_ifoc, borrow=True,
                                 name=_p(self.pname, 'b_pre_x'))

    self.params += [self.W_pre_x, self.W_h, self.b_pre_x]

    if self.with_context:
        raise NotImplementedError

    if self.with_begin_tag:
        self.struct_begin_tag = constant_weight(shape=(self.n_hids,), value=0.,
                                                name=_p(self.pname, 'struct_begin_tag'))
        self.params += [self.struct_begin_tag]

    if self.with_end_tag:
        self.struct_end_tag = constant_weight(shape=(self.n_in,), value=0.,
                                              name=_p(self.pname, 'struct_end_tag'))
        self.params += [self.struct_end_tag]

    if self.n_att_ctx:
        self.lstm_combine_ctx_h = LSTM(self.n_att_ctx, self.n_hids,
                                       rng=self.rng,
                                       name=_p(self.pname, 'lstm_combine_ctx_h'))
        self.params.extend(self.lstm_combine_ctx_h.params)
        self.attention = ATTENTION(self.n_hids, self.rng,
                                   name=_p(self.pname, 'att_ctx'))
        self.params.extend(self.attention.params)
        if self.seq_pyramid:
            self.pyramid_on_seq = LSTM(self.n_att_ctx, self.n_att_ctx,
                                       rng=self.rng,
                                       name=_p(self.pname, 'pyramid_on_seq'))
            self.params.extend(self.pyramid_on_seq.params)
            self.ff_pyramid2ctx = FeedForward(self.n_att_ctx, self.n_hids,
                                              name=_p(self.pname, 'ff_pyramid2ctx'))
            self.params.extend(self.ff_pyramid2ctx.params)
def apply_seq_pyramid(self, state_below, state_below_p, mask_below=None,
                      init_state_h=None, init_state_c=None,
                      init_state_hp=None, init_state_cp=None, context=None):
    '''
    state_below: shape=[seq_len, batch, n_in]
    state_below_p: shape=[seq_len, batch, n_in_p]
    init_state: shape=[batch, seq_len, hid]
    '''
    n_steps = state_below.shape[0]
    if state_below.ndim == 3:
        batch_size = state_below.shape[1]
    else:
        batch_size = 1
        state_below = state_below.reshape((n_steps, batch_size, state_below.shape[1]))
        state_below_p = state_below_p.reshape((n_steps, batch_size, state_below_p.shape[1]))

    if mask_below is None:
        mask_below = T.alloc(numpy.float32(1.), n_steps, 1)

    if self.with_context:
        raise NotImplementedError
    else:
        if init_state_h is None:
            init_state_h = T.alloc(numpy.float32(0.), batch_size, self.n_hids)
        if init_state_c is None:
            init_state_c = T.alloc(numpy.float32(0.), batch_size, self.n_hids)
        if init_state_hp is None:
            init_state_hp = T.alloc(numpy.float32(0.), batch_size, n_steps, self.n_hids)
        if init_state_cp is None:
            init_state_cp = T.alloc(numpy.float32(0.), batch_size, n_steps, self.n_hids)

        state_below_pre = T.dot(state_below, self.W_pre_x) + self.b_pre_x
        state_below_p_pre = T.dot(state_below_p, self.pyramid_on_seq.W_pre_x) + \
            self.pyramid_on_seq.b_pre_x
        step_idx = T.arange(n_steps)
        sequences = [state_below_pre, state_below_p_pre, mask_below, step_idx]
        outputs_info = [init_state_h, init_state_c, init_state_hp, init_state_cp]
        non_sequences = []

        rval, updates = theano.scan(self._seq_pyramid_step,
                                    sequences=sequences,
                                    outputs_info=outputs_info,
                                    non_sequences=non_sequences,
                                    name=_p(self.pname, 'layers'),
                                    n_steps=n_steps)

    self.output = rval
    return self.output
def param_init_lstm(options, params, prefix='lstm'):
    """Init the LSTM parameters

    :options: TODO
    :params: TODO
    :prefix: TODO
    :returns: TODO
    """
    W = numpy.concatenate([ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj'])], axis=1)
    params[_p(prefix, 'W')] = W
    U = numpy.concatenate([ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj'])], axis=1)
    params[_p(prefix, 'U')] = U
    b = numpy.zeros((4 * options['dim_proj'],))
    params[_p(prefix, 'b')] = b.astype(config.floatX)

    return params
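# A minimal, hypothetical usage sketch showing how the pieces above are
# typically combined: initialize numpy parameters, wrap them as shared
# variables with the init_tparams sketch given earlier, and run the scan-based
# lstm_layer over a padded minibatch. The option values and variable names
# here are illustrative assumptions only, not taken from the original source.
from collections import OrderedDict

import theano.tensor as tensor
from theano import config

options = {'dim_proj': 128}
params = param_init_lstm(options, OrderedDict(), prefix='lstm')
tparams = init_tparams(params)

# symbolic inputs: n_steps * n_samples * dim_proj features and a 0/1 mask
x = tensor.tensor3('x', dtype=config.floatX)
mask = tensor.matrix('mask', dtype=config.floatX)

# hidden states (full sequence or last step, depending on which lstm_layer
# variant above is used)
proj = lstm_layer(tparams, x, options, prefix='lstm', mask=mask)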
def _step(m_, x_, h_, c_):
    preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
    preact += x_

    i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
    f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
    o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
    c = tensor.tanh(_slice(preact, 3, options['dim_proj']))

    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_

    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    return h, c
def init_params(options, W):
    params = OrderedDict()
    # W is initialized by the pretrained word embedding
    params['Wemb'] = W.astype(config.floatX)
    # otherwise, W will be initialized randomly
    # n_words = options['n_words']
    # n_x = options['n_x']
    # params['Wemb'] = uniform_weight(n_words, n_x)

    length = len(options['filter_shapes'])
    for idx in range(length):
        params = param_init_encoder(options['filter_shapes'][idx], params,
                                    prefix=_p('cnn_encoder', idx))

    n_h = options['feature_maps'] * length
    params['Wy'] = uniform_weight(n_h, options['n_y'])
    params['by'] = zero_bias(options['n_y'])

    return params
def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
    """
    Gated Recurrent Unit (GRU)
    """
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U
    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    return params
def gru_cond_layer(tparams, state_below, options, prefix='gru', mask=None,
                   context=None, one_step=False, init_memory=None,
                   init_state=None, context_mask=None, **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    dim = tparams[_p(prefix, 'Wcx')].shape[1]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)

    # projected context
    assert context.ndim == 3, \
        'Context must be 3-d: #annotation x #sample x dim'
    pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) + \
        tparams[_p(prefix, 'b_att')]

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    # projected x
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
        tparams[_p(prefix, 'bx')]
    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]

    def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_,
                    U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
                    U_nl, Ux_nl, b_nl, bx_nl):
        preact1 = tensor.dot(h_, U)
        preact1 += x_
        preact1 = tensor.nnet.sigmoid(preact1)

        r1 = _slice(preact1, 0, dim)
        u1 = _slice(preact1, 1, dim)

        preactx1 = tensor.dot(h_, Ux)
        preactx1 *= r1
        preactx1 += xx_

        h1 = tensor.tanh(preactx1)

        h1 = u1 * h_ + (1. - u1) * h1
        h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_

        # attention
        pstate_ = tensor.dot(h1, W_comb_att)
        pctx__ = pctx_ + pstate_[None, :, :]
        # pctx__ += xc_
        pctx__ = tensor.tanh(pctx__)
        alpha = tensor.dot(pctx__, U_att) + c_tt
        alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
        alpha = tensor.exp(alpha)
        if context_mask:
            alpha = alpha * context_mask
        alpha = alpha / alpha.sum(0, keepdims=True)
        ctx_ = (cc_ * alpha[:, :, None]).sum(0)  # current context

        preact2 = tensor.dot(h1, U_nl) + b_nl
        preact2 += tensor.dot(ctx_, Wc)
        preact2 = tensor.nnet.sigmoid(preact2)

        r2 = _slice(preact2, 0, dim)
        u2 = _slice(preact2, 1, dim)

        preactx2 = tensor.dot(h1, Ux_nl) + bx_nl
        preactx2 *= r2
        preactx2 += tensor.dot(ctx_, Wcx)

        h2 = tensor.tanh(preactx2)

        h2 = u2 * h1 + (1. - u2) * h2
        h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1

        return h2, ctx_, alpha.T  # pstate_, preact, preactx, r, u

    seqs = [mask, state_below_, state_belowx]
    # seqs = [mask, state_below_, state_belowx, state_belowc]
    _step = _step_slice

    shared_vars = [tparams[_p(prefix, 'U')],
                   tparams[_p(prefix, 'Wc')],
                   tparams[_p(prefix, 'W_comb_att')],
                   tparams[_p(prefix, 'U_att')],
                   tparams[_p(prefix, 'c_tt')],
                   tparams[_p(prefix, 'Ux')],
                   tparams[_p(prefix, 'Wcx')],
                   tparams[_p(prefix, 'U_nl')],
                   tparams[_p(prefix, 'Ux_nl')],
                   tparams[_p(prefix, 'b_nl')],
                   tparams[_p(prefix, 'bx_nl')]]

    if one_step:
        rval = _step(*(seqs + [init_state, None, None, pctx_, context] +
                       shared_vars))
    else:
        rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    outputs_info=[init_state,
                                                  tensor.alloc(0., n_samples,
                                                               context.shape[2]),
                                                  tensor.alloc(0., n_samples,
                                                               context.shape[0])],
                                    non_sequences=[pctx_, context] + shared_vars,
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=profile,
                                    strict=True)
    return rval
def get_prefix(self):
    if self.prefix == '':
        return self.baseprefix
    else:
        return _p(self.baseprefix, self.prefix)
def _pname(self, name):
    return _p(self.get_prefix(), name)
def _init_params(self):
    shape_xh = (self.n_in, self.n_hids)
    shape_xh2 = (self.n_in, 2*self.n_hids)
    shape_hh = (self.n_hids, self.n_hids)
    shape_hh2 = (self.n_hids, 2*self.n_hids)

    self.W_xzr = norm_weight(rng=self.rng, shape=shape_xh2,
                             name=_p(self.pname, 'W_xzr'))
    self.W_xh = norm_weight(rng=self.rng, shape=shape_xh,
                            name=_p(self.pname, 'W_xh'))
    self.b_zr = constant_weight(shape=(2*self.n_hids, ),
                                name=_p(self.pname, 'b_zr'))
    self.b_h = constant_weight(shape=(self.n_hids, ),
                               name=_p(self.pname, 'b_h'))
    self.W_hzr = multi_orth(rng=self.rng, size=shape_hh2,
                            name=_p(self.pname, 'W_hzr'))
    self.W_hh = ortho_weight(rng=self.rng, shape=shape_hh,
                             name=_p(self.pname, 'W_hh'))

    self.params += [self.W_xzr, self.W_xh, self.W_hzr, self.W_hh,
                    self.b_zr, self.b_h]

    if self.with_context:
        shape_ch = (self.c_hids, self.n_hids)
        self.W_cz = norm_weight(rng=self.rng, shape=shape_ch,
                                name=_p(self.pname, 'W_cz'))
        self.W_cr = norm_weight(rng=self.rng, shape=shape_ch,
                                name=_p(self.pname, 'W_cr'))
        self.W_ch = norm_weight(rng=self.rng, shape=shape_ch,
                                name=_p(self.pname, 'W_ch'))
        self.W_c_init = norm_weight(rng=self.rng, shape=shape_ch,
                                    name=_p(self.pname, 'W_c_init'))
        self.params += [self.W_cz, self.W_cr, self.W_ch, self.W_c_init]

    if self.with_begin_tag:
        self.struct_begin_tag = constant_weight(shape=(self.n_hids,), value=0.,
                                                name=_p(self.pname, 'struct_begin_tag'))
        self.params += [self.struct_begin_tag]

    if self.with_end_tag:
        self.struct_end_tag = constant_weight(shape=(self.n_in,), value=0.,
                                              name=_p(self.pname, 'struct_end_tag'))
        self.params += [self.struct_end_tag]

    if self.n_att_ctx:
        self.gru_combine_ctx_h = GRU(self.n_att_ctx, self.n_hids,
                                     rng=self.rng,
                                     name=_p(self.pname, 'gru_combine_ctx_h'))
        self.params.extend(self.gru_combine_ctx_h.params)
        self.attention = ATTENTION(self.n_hids, self.rng,
                                   name=_p(self.pname, 'att_ctx'))
        self.params.extend(self.attention.params)
def _init_params(self):
    shape_hh = (self.n_hids, self.n_hids)

    self.W_comb_att = norm_weight(rng=self.rng, shape=shape_hh,
                                  name=_p(self.pname, 'W_comb_att'))
    self.U_att = norm_weight(rng=self.rng, shape=(self.n_hids, 1),
                             name=_p(self.pname, 'U_att'))
    self.c_att = constant_weight(shape=(1,), name=_p(self.pname, 'c_att'))

    self.params = [self.W_comb_att, self.U_att, self.c_att]
def param_init_gru_cond(options, params, prefix='gru_cond', nin=None, dim=None,
                        dimctx=None, nin_nonlin=None, dim_nonlin=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']
    if nin_nonlin is None:
        nin_nonlin = nin
    if dim_nonlin is None:
        dim_nonlin = dim

    W = np.concatenate([norm_weight(nin, dim),
                        norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = np.zeros((2 * dim,)).astype('float32')
    U = np.concatenate([ortho_weight(dim_nonlin),
                        ortho_weight(dim_nonlin)], axis=1)
    params[_p(prefix, 'U')] = U

    Wx = norm_weight(nin_nonlin, dim_nonlin)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim_nonlin)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = np.zeros((dim_nonlin,)).astype('float32')

    U_nl = np.concatenate([ortho_weight(dim_nonlin),
                           ortho_weight(dim_nonlin)], axis=1)
    params[_p(prefix, 'U_nl')] = U_nl
    params[_p(prefix, 'b_nl')] = np.zeros((2 * dim_nonlin,)).astype('float32')

    Ux_nl = ortho_weight(dim_nonlin)
    params[_p(prefix, 'Ux_nl')] = Ux_nl
    params[_p(prefix, 'bx_nl')] = np.zeros((dim_nonlin,)).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx, dim * 2)
    params[_p(prefix, 'Wc')] = Wc
    Wcx = norm_weight(dimctx, dim)
    params[_p(prefix, 'Wcx')] = Wcx

    # attention: combined -> hidden
    W_comb_att = norm_weight(dim, dimctx)
    params[_p(prefix, 'W_comb_att')] = W_comb_att

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx)
    params[_p(prefix, 'Wc_att')] = Wc_att

    # attention: hidden bias
    b_att = np.zeros((dimctx,)).astype('float32')
    params[_p(prefix, 'b_att')] = b_att

    # attention:
    U_att = norm_weight(dimctx, 1)
    params[_p(prefix, 'U_att')] = U_att
    c_att = np.zeros((1,)).astype('float32')
    params[_p(prefix, 'c_tt')] = c_att

    return params