def convolve1d_4D_scan(input, W, mode='full'): batch_size, nchannels, nwords, ndim = input.shape nkernels_out, nkernels_in, filter_width, ndim = W.shape # Unroll filter along columns W_unrolled = W.dimshuffle(0, 2, 1, 3).flatten(ndim=3) # Replicate input filters 'batch_size' times and squash out_filters along column axis. # W_tiled = T.tile(W_unrolled, (1, 1, batch_size)).dimshuffle(1, 0, 2).flatten(ndim=2) # doesn't give a gradient W_tiled = T.alloc(W_unrolled, batch_size, W_unrolled.shape[0], W_unrolled.shape[1], W_unrolled.shape[2]).dimshuffle(1, 2, 0, 3).flatten(ndim=3).dimshuffle(1, 0, 2).flatten(ndim=2) W_tiled = W_tiled[::-1] # reverse_slicing = [slice(None, None, None)] * W_tiled.ndim # reverse_slicing[0] = slice(None, None, -1) # reverse_slicing = tuple(reverse_slicing) # W_tiled = W_tiled[reverse_slicing] # flip the kernel # Unroll input and pad to fit the output filters. input_reshaped = input.dimshuffle(0, 2, 1, 3).flatten(ndim=3).dimshuffle(1,0,2).flatten(ndim=2) # input_tiled = T.tile(input_reshaped, (1, nkernels_out)) input_tiled = T.alloc(input_reshaped, nkernels_out, input_reshaped.shape[0], input_reshaped.shape[1]).dimshuffle(1, 0, 2).flatten(ndim=2) if mode == 'full': pad = T.zeros((filter_width-1, nkernels_out*batch_size*nchannels*ndim)) input_padded = T.concatenate([pad, input_tiled, pad]) conv_out, _ = theano.scan(fn=lambda i: (W_tiled * input_padded[i:i+filter_width]).sum(axis=0), outputs_info=None, sequences=[T.arange(0, nwords+filter_width-1)]) new_shape = (nwords+filter_width-1, nkernels_out, batch_size, nkernels_in, ndim) elif mode == 'valid': conv_out, _ = theano.scan(fn=lambda i: (W_tiled * input_tiled[i:i+filter_width]).sum(axis=0), outputs_info=None, sequences=[T.arange(0, nwords-filter_width+1)]) new_shape = (nwords-filter_width+1, nkernels_out, batch_size, nkernels_in, ndim) conv_reshaped = conv_out.reshape(new_shape).dimshuffle(2, 1, 0, 3, 4).sum(axis=3) return conv_reshaped
def apply(self, src, mask_length, tgt):
    """ viterbi algorithm """
    result, updates = theano.scan(
        fn=self.train_step,
        sequences=src,
        outputs_info=[self.A_start, None],
        non_sequences=self.A,
        n_steps=mask_length
    )
    # the score of the best path
    best_path_score = result[0][-1].max()
    idx = T.argmax(result[0][-1])
    # backtracking
    res2, _ = theano.scan(
        fn=lambda dps, idx, idx2: [dps[idx], idx],
        sequences=result[1][::-1],
        outputs_info=[idx, idx],
        n_steps=mask_length
    )
    # the path of best score
    best_path = res2[1]
    # if len(best_path) < seq_len:
    #     best_path.extend((seq_len - len(best_path)) * [2])
    # the score of the tgt path
    tgt_score = self.decode(src, mask_length, tgt)
    # max margin
    max_margin = T.sum(T.neq(tgt[:mask_length], best_path))
    cost = best_path_score + max_margin - tgt_score
    return T.switch(T.lt(cost, T.alloc(numpy.float32(0.))),
                    T.alloc(numpy.float32(0.)),
                    cost), best_path
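# A minimal NumPy sketch (added for illustration, not part of the original code) of the
# backtracking pattern above: result[1] holds, per timestep, the index of the best
# predecessor state, and the second scan walks those backpointers in reverse starting
# from the argmax of the final scores.
import numpy as np

def viterbi_backtrack(backpointers, last_best):
    # backpointers[t, s] = best predecessor of state s at step t
    path = [last_best]
    for bp in backpointers[::-1][:-1]:
        path.append(bp[path[-1]])
    return path[::-1]

print(viterbi_backtrack(np.array([[0, 0], [1, 0], [0, 1]]), 1))  # e.g. [0, 1, 1]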
def define_complete_network(self):
    """Sets connections for predicting all values given all inputs"""
    def step(htm1_f, htm1_b):
        y_t = self.activation[-1](T.dot(htm1_f, self.W_out_f) +
                                  T.dot(htm1_b, self.W_out_b) + self.b)
        return y_t

    padding_f = T.alloc(0, 1, self.forward_rnn.h.shape[1],
                        self.forward_rnn.h.shape[2])
    padding_b = T.alloc(0, 1, self.backward_rnn.h.shape[1],
                        self.backward_rnn.h.shape[2])
    self.y_t, _ = theano.scan(
        step,
        sequences=[T.concatenate([padding_f, self.forward_rnn.h[:-1]], axis=0),
                   T.concatenate([self.backward_rnn.h[-2::-1], padding_b], axis=0)],
        outputs_info=None)

    self.L1 = abs(self.W_out_f.sum()) + abs(self.W_out_b.sum()) + \
        self.forward_rnn.L1 + self.backward_rnn.L1

    # square of L2 norm; one regularization option is to enforce the
    # square of the L2 norm to be small
    self.L2_sqr = (self.W_out_f ** 2).sum() + (self.W_out_b ** 2).sum() + \
        self.forward_rnn.L2_sqr + self.backward_rnn.L2_sqr

    self.predict = theano.function(inputs=[self.x], outputs=self.y_t)
    self.complete_defined = True
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    '''
    Core of the LSTM computation. Note that state_below is a 3D tensor of shape
    [n_steps, batch_size, emb_dim], i.e. [sequence length, batch size, word-vector dim].
    '''
    nsteps = state_below.shape[0]  # leading dimension = number of timesteps
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]  # batch size
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        # x_ is a formal argument; the actual argument passed below is state_below.
        # The four arguments of _step: x_ is a 2D slice of state_below, m_ is a 1D
        # row slice of mask; h_ and c_ are the previous hidden and cell states.
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        # at every step the previous h_ is multiplied by lstm_U, then the
        # precomputed input projection is added
        preact += x_

        # the four side-by-side blocks of each preact matrix are sliced out
        # to form the gates
        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))  # input gate
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))  # forget gate
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))  # output gate
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))          # cell candidate

        c = f * c_ + i * c
        # [:, None] turns the 1D mask row into a column vector for broadcasting
        c = m_[:, None] * c + (1. - m_)[:, None] * c_  # c_ is the previous (or initial) cell state

        # within each step, h is a 2D matrix of shape [batch_size, emb_dim]
        h = o * tensor.tanh(c)  # elementwise product (like .* in Octave)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_  # h_ is the previous (or initial) hidden state

        return h, c  # the returned values correspond to the entries of outputs_info

    # When scan returns, the sequence of 2D matrices becomes a 3D tensor and the
    # sequence of vectors becomes a 2D matrix, i.e. scan adds one leading dimension.
    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])
    # A 3D tensor times a 2D matrix is still a 3D tensor, so Wx + b for all steps
    # is precomputed once instead of once per step.

    dim_proj = options['dim_proj']
    # As soon as scan's `sequences` is non-empty it iterates over that sequence; each
    # tensor in `sequences` loses its leading dimension inside the step function, and
    # the size of that dimension gives the number of iterations.
    rval, updates = theano.scan(
        _step,
        sequences=[mask, state_below],  # mask maps to m_, state_below to x_
        outputs_info=[tensor.alloc(numpy_floatX(0.),
                                   n_samples,   # number of sequences stepped in parallel
                                   dim_proj),   # initial h_; overwritten after the first loop iteration
                      tensor.alloc(numpy_floatX(0.),
                                   n_samples,
                                   dim_proj)],  # initial c_; overwritten after the first loop iteration
        name=_p(prefix, '_layers'),
        n_steps=nsteps)
    return rval[0]  # rval[0] is h, rval[1] is c; both are shared-like symbolic outputs
def apply(self, x):
    W, U, b = self.params
    ndim = self.ndim

    def _slice(x, n, dim):
        return x[:, n * dim:(n + 1) * dim]

    def _step(x_t, h_t, c_t):
        preact = T.dot(h_t, U) + x_t

        i = T.nnet.sigmoid(_slice(preact, 0, self.ndim))
        f = T.nnet.sigmoid(_slice(preact, 1, self.ndim))
        o = T.nnet.sigmoid(_slice(preact, 2, self.ndim))
        c = T.tanh(_slice(preact, 3, self.ndim))

        c = f * c_t + i * c
        h = o * T.tanh(c)
        return h, c

    state_below = T.dot(x, W) + b
    rval, _ = theano.scan(
        _step,
        [state_below],
        outputs_info=[T.alloc(numpy.float32(0.), x.shape[1], ndim),
                      T.alloc(numpy.float32(0.), x.shape[1], ndim)],
        profile=_doProfile)
    return rval[0]
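# Hedged illustration (an assumption about the layout, matching the 4*ndim packing used
# above): a tiny NumPy check of what _slice extracts from the concatenated gate
# pre-activations -- the i, f, o and candidate-cell blocks sit side by side.
import numpy as np

ndim = 3
preact = np.arange(4 * ndim)[None, :]              # one sample, four gate blocks
blocks = [preact[:, n * ndim:(n + 1) * ndim]       # same rule as _slice(preact, n, ndim)
          for n in range(4)]
print([b.ravel().tolist() for b in blocks])        # [[0,1,2], [3,4,5], [6,7,8], [9,10,11]]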
def init(sequence_length):
    initial_V = T.alloc(np.float32(0), sequence_length, size)
    initial_s = T.alloc(np.float32(0), sequence_length)

    def step(t, v, d, u, prev_V, prev_s):
        prev_V_to_t = prev_V[:t]
        prev_s_to_t = prev_s[:t]

        V = T.concatenate([
            prev_V_to_t,
            v.dimshuffle('x', 0),
            initial_V[t + 1:]
        ])

        to_flip = rectify(u - rev_cumsum(prev_s[1:t + 1]))
        new_s = rectify(prev_s_to_t - to_flip)
        s = T.concatenate([
            new_s,
            d.dimshuffle('x'),
            initial_s[t + 1:]
        ])

        flip_score = rectify(1 - rev_cumsum(s[1:t + 1]))
        score = T.min([new_s, flip_score], axis=0)
        r = T.dot(score, prev_V_to_t) + d * v

        return V, s, r

    return initial_V, initial_s, step
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None): nsteps = state_below.shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 assert mask is not None def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] def _step(m_, x_, h_, c_): preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) preact += x_ i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj'])) f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj'])) o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj'])) c = tensor.tanh(_slice(preact, 3, options['dim_proj'])) if has_input_gate: if has_forget_gate: c = f * c_ + i * c else: c = c_ + i*c else: if has_forget_gate: c = f*c_ + c else: c = c_ + c c = m_[:, None] * c + (1. - m_)[:, None] * c_ if has_output_gate: h = o * tensor.tanh(c) else: h = tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]) dim_proj = options['dim_proj'] rval, updates = theano.scan(_step, sequences=[mask, state_below], outputs_info=[tensor.alloc(numpy_floatX(0.), n_samples, dim_proj), tensor.alloc(numpy_floatX(0.), n_samples, dim_proj)], name=_p(prefix, '_layers'), n_steps=nsteps) return rval[0]
def calc_lstm(self, input, mask):
    def _slice(_x, n, dim):
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        preact = T.dot(h_, self.U)
        preact += x_

        i = T.nnet.sigmoid(_slice(preact, 0, self.n_hidden))
        f = T.nnet.sigmoid(_slice(preact, 1, self.n_hidden))
        o = T.nnet.sigmoid(_slice(preact, 2, self.n_hidden))
        c = T.tanh(_slice(preact, 3, self.n_hidden))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * T.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    n_samples = input.shape[1]
    wx = T.dot(input, self.W) + self.b
    rval, updates = theano.scan(
        _step,
        sequences=[mask, wx],
        outputs_info=[T.alloc(numpy.asarray(0., dtype=numpy.float64),
                              n_samples, self.n_hidden),
                      T.alloc(numpy.asarray(0., dtype=numpy.float64),
                              n_samples, self.n_hidden)])
    return rval[0]
def output_probabilistic(self, m_w_previous, v_w_previous):
    if (self.non_linear):
        m_in = self.m_w - m_w_previous
        v_in = self.v_w

        # We compute the mean and variance after the ReLU activation
        lam = self.lam
        v_1 = 1 + 2 * lam * v_in
        v_1_inv = v_1 ** -1

        s_1 = T.prod(v_1, axis=1) ** -0.5
        v_2 = 1 + 4 * lam * v_in
        v_2_inv = v_2 ** -1
        s_2 = T.prod(v_2, axis=1) ** -0.5
        v_inv = v_in ** -1
        exponent1 = m_in ** 2 * (1 - v_1_inv) * v_inv
        exponent1 = T.sum(exponent1, axis=1)
        exponent2 = m_in ** 2 * (1 - v_2_inv) * v_inv
        exponent2 = T.sum(exponent2, axis=1)
        m_a = s_1 * T.exp(-0.5 * exponent1)
        v_a = s_2 * T.exp(-0.5 * exponent2) - m_a ** 2

        return (m_a, v_a)
    else:
        m_w_previous_with_bias = \
            T.concatenate([m_w_previous, T.alloc(1, 1)], 0)
        v_w_previous_with_bias = \
            T.concatenate([v_w_previous, T.alloc(0, 1)], 0)

        m_linear = T.dot(self.m_w, m_w_previous_with_bias) / T.sqrt(self.n_inputs)
        v_linear = (T.dot(self.v_w, v_w_previous_with_bias) +
                    T.dot(self.m_w ** 2, v_w_previous_with_bias) +
                    T.dot(self.v_w, m_w_previous_with_bias ** 2)) / self.n_inputs
        return (m_linear, v_linear)
def mf(self, V, Y=None, return_history=False, niter=None, block_grad=None):
    drop_mask = T.zeros_like(V)

    if Y is not None:
        drop_mask_Y = T.zeros_like(Y)
    else:
        batch_size = V.shape[0]
        num_classes = self.dbm.hidden_layers[-1].n_classes
        assert isinstance(num_classes, int)
        Y = T.alloc(1., V.shape[0], num_classes)
        drop_mask_Y = T.alloc(1., V.shape[0])

    history = self.do_inpainting(X=V,
                                 Y=Y,
                                 return_history=True,
                                 drop_mask=drop_mask,
                                 drop_mask_Y=drop_mask_Y,
                                 noise=False,
                                 niter=niter,
                                 block_grad=block_grad)

    if return_history:
        return [elem['H_hat'] for elem in history]

    return history[-1]['H_hat']
def crop_images(data, image_shape, border_width=8, mode=0):
    """
    Function used to crop the images by a certain border width.

    data : input data, theano 4D tensor
    image_shape : 4-tuple, (batch_size, num_channels, image_rows, image_cols)
    border_width : border width to be cropped, default value 8
    mode : binary, 0 for random crop, 1 for centered crop.
    """
    if (mode == 0):
        row_step = image_shape[2] - border_width
        col_step = image_shape[3] - border_width
        output = T.alloc(0., image_shape[0], image_shape[1], row_step, col_step)
        for i in range(image_shape[0]):
            begin_idx = numpy.random.randint(border_width)
            output = T.set_subtensor(
                output[i, :, :, :],
                data[i, :, begin_idx:(begin_idx + row_step),
                     begin_idx:(begin_idx + col_step)])
        return output
    else:
        row_step = image_shape[2] - border_width
        col_step = image_shape[3] - border_width
        output = T.alloc(0., image_shape[0], image_shape[1], row_step, col_step)
        for i in range(image_shape[0]):
            begin_idx = border_width / 2
            output = T.set_subtensor(
                output[i, :, :, :],
                data[i, :, begin_idx:(begin_idx + row_step),
                     begin_idx:(begin_idx + col_step)])
        return output
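# A short usage sketch (assumed, not from the original module): compile the symbolic
# crop for a fixed image_shape; mode=1 gives the deterministic centered crop.
import numpy
import theano
import theano.tensor as T

x_in = T.tensor4('x_in')
cropped = crop_images(x_in, image_shape=(32, 3, 64, 64), border_width=8, mode=1)
crop_fn = theano.function([x_in], cropped)
batch = numpy.random.rand(32, 3, 64, 64).astype(theano.config.floatX)
print(crop_fn(batch).shape)  # expected (32, 3, 56, 56)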
def build(self, antialias_samples=4): # returns top-level render function and associated variables image = T.alloc(0., self.camera.x_dims, self.camera.y_dims, 3) #Anti-Aliasing sampleDist_x = np.asarray(np.random.random((self.camera.x_dims, self.camera.y_dims,antialias_samples)),dtype=theano.config.floatX) sampleDist_y = np.asarray(np.random.random((self.camera.x_dims, self.camera.y_dims,antialias_samples)),dtype=theano.config.floatX) for sample in xrange(antialias_samples): #TODO USE SCAN #Make Rays self.camera.rays = self.camera.make_rays(self.camera.x_dims, self.camera.y_dims,\ sampleDist_x=(sampleDist_x[:,:,sample] + sample)/antialias_samples, sampleDist_y=(sampleDist_y[:,:,sample] + sample)/antialias_samples) #self.camera.variables.add_child(self.camera.rays.variables) image_per_sample = T.alloc(0.0, self.camera.x_dims, self.camera.y_dims, 3) min_dists = T.alloc(float('inf'), self.camera.x_dims, self.camera.y_dims) # for each shape find its shadings and draw closer shapes on top for shape in self.shapes: dists = shape.distance(self.camera.rays) shadings = self.shader.shade(shape, self.lights, self.camera) #for each shape != obj, draw shadow of shape on obj #for obj2 in self.shapes: # if obj == obj2: continue # shadings = broadcasted_switch(obj2.shadow( # obj.surface_pts(self.camera.rays), self.lights) < 0, shadings, [0., 0., 0.]) image_per_sample = broadcasted_switch(dists < min_dists, shadings, image_per_sample) min_dists = T.switch(dists < min_dists, dists, min_dists) image = image + image_per_sample image = image / antialias_samples return image
def RNN_layer(tparams, inputs, mask=None, init_h=None, prefix=None, name='rnn', std=True):
    """
    inputs: n_steps*n_samples*x_size
    return h
    """
    prefix = GetPrefix(prefix, name)
    # if length != None: inputs = inputs[index:index+length, :, :]
    n_steps = inputs.shape[0]
    n_samples = inputs.shape[1]
    x_size = inputs.shape[2]

    hdim = tparams[_p(prefix, 'wh')].shape[0]
    if mask == None:
        mask = T.alloc(1., n_steps, n_samples)
    if init_h == None:
        init_h = T.alloc(0., n_samples, hdim)

    def _step(m, x, h):
        inputs_h = (T.dot(x, tparams[_p(prefix, 'wx')]) +
                    T.dot(h, tparams[_p(prefix, 'wh')])) / 2 + tparams[_p(prefix, 'b')]
        _h = tanh(inputs_h)
        return _h

    if std:
        inputs = standardize(inputs)
    out, updates = theano.scan(lambda m, x, h: _step(m, x, h),
                               sequences=[mask, inputs],
                               outputs_info=[init_h],
                               name=_p(prefix, 'scan'),
                               n_steps=n_steps,
                               # truncate_gradient=10,
                               profile=False)
    return out
def __init__(self, cell, rng, layer_id, shape, X, mask, is_train = 1, batch_size = 1, p = 0.5): prefix = "SentDecoderLayer_" layer_id = "_" + layer_id self.in_size, self.out_size = shape self.X = X self.summs = batch_size self.W_hy = init_weights((self.in_size, self.out_size), prefix + "W_hy" + layer_id) self.b_y = init_bias(self.out_size, prefix + "b_y" + layer_id) if cell == "gru": self.decoder = GRULayer(rng, prefix + layer_id, shape, self.X, mask, is_train, 1, p) def _active(pre_h, x): h = self.decoder._active(x, pre_h) y = T.tanh(T.dot(h, self.W_hy) + self.b_y) return h, y [h, y], updates = theano.scan(_active, n_steps = self.summs, sequences = [], outputs_info = [{'initial':self.X, 'taps':[-1]}, T.alloc(floatX(0.), 1, self.out_size)]) elif cell == "lstm": self.decoder = LSTMLayer(rng, prefix + layer_id, shape, self.X, mask, is_train, 1, p) def _active(pre_h, pre_c, x): h, c = self.decoder._active(x, pre_h, pre_c) y = T.tanh(T.dot(h, self.W_hy) + self.b_y) return h, c, y [h, c, y], updates = theano.scan(_active, n_steps = self.summs, sequences = [], outputs_info = [{'initial':self.X, 'taps':[-1]}, {'initial':self.X, 'taps':[-1]}, T.alloc(floatX(0.), 1, self.out_size)]) y = T.reshape(y, (self.summs, self.out_size)) self.activation = y self.params = self.decoder.params + [self.W_hy, self.b_y]
def gru_layer(tparams, state_below, init_state, options, prefix='gru', mask=None, **kwargs): """ Feedforward pass through GRU """ nsteps = state_below.shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 dim = tparams[_p(prefix,'Ux')].shape[1] if init_state == None: init_state = tensor.alloc(0., n_samples, dim) if mask == None: mask = tensor.alloc(1., state_below.shape[0], 1) def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n*dim:(n+1)*dim] return _x[:, n*dim:(n+1)*dim] state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')] U = tparams[_p(prefix, 'U')] Ux = tparams[_p(prefix, 'Ux')] def _step_slice(m_, x_, xx_, h_, U, Ux): preact = tensor.dot(h_, U) preact += x_ r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) preactx = tensor.dot(h_, Ux) preactx = preactx * r preactx = preactx + xx_ h = tensor.tanh(preactx) h = u * h_ + (1. - u) * h h = m_[:,None] * h + (1. - m_)[:,None] * h_ return h seqs = [mask, state_below_, state_belowx] _step = _step_slice rval, updates = theano.scan(_step, sequences=seqs, outputs_info = [init_state], non_sequences = [tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]], name=_p(prefix, '_layers'), n_steps=nsteps, profile=False, strict=True) rval = [rval] return rval
def symbolic_lstm(input, W, b, n_hidden, input_layer, init_hidden=None, prefix="lstm"):
    def _slice(_x, n, dim):
        return _x[n * dim:(n + 1) * dim]

    def _step(x_, h_, c_):
        preact = tensor.dot(tensor.concatenate((h_, input_layer(x_, h_))), W)
        preact += b

        i = nnet.sigmoid(_slice(preact, 0, n_hidden))
        f = nnet.sigmoid(_slice(preact, 1, n_hidden))
        o = nnet.sigmoid(_slice(preact, 2, n_hidden))
        c = nnet.sigmoid(_slice(preact, 3, n_hidden))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)

        return h, c

    if init_hidden is None:
        init_hidden = tensor.alloc(numpy_floatX(0.), n_hidden)
    rval, updates = theano.scan(_step,
                                sequences=[input],
                                outputs_info=[init_hidden,
                                              tensor.alloc(numpy_floatX(0.), n_hidden)],
                                name=_p(prefix, '_layers'))
    return rval[0]
def encode(self, state_below): """ :development: (1) may need to prepend encoding_length * padding array to the state_below to produce the same length sequence as state_below (2) can return an offset encoding by only returing certain indices of the encoding (though this is pretty wasteful) :type state_below: 2d tensor :param state_below: the enitre sequence of states from the layer below the current one :type rval: 2d tensor :param rval: an encoding of the state_below (the entire sequence of state) to be passed to the above layer """ total_sequence_length = T.cast(state_below.shape[0], theano.config.floatX) self.n_encodings = T.cast(T.ceil(total_sequence_length / self.encoding_length), 'int32') self.n_padding_timesteps = T.cast(self.n_encodings * self.encoding_length - total_sequence_length, 'int32') zeros = T.alloc(np.cast[theano.config.floatX](0), self.n_padding_timesteps, self.n_vis) state_below = T.concatenate((zeros, state_below)) Wxh = self.Wxh bxh = self.bxh Whhe = self.Whhe state_below = state_below.reshape((self.encoding_length, self.n_encodings, self.n_vis)) state_below = T.dot(state_below, Wxh) + bxh # a single output will be n_encoding rows with n_hid features each encoding_0 = T.alloc(np.cast[theano.config.floatX](0), self.n_encodings, self.n_hid) encodings, updates = scan(fn=self.encode_step, sequences=[state_below], outputs_info=[encoding_0], non_sequences=[Whhe]) # encodings is a 3d vector (encoding_length, n_encodings, n_hid) # returns encodings[-1] in 2d vector shape = (n_encodings, n_hid) return encodings[-1]
def layer_output(self, state_blow, tag_blow, mask=None):
    """
    :type tag_blow: object
    """
    nsteps = state_blow.shape[0]
    if state_blow.ndim == 3:
        nsamples = state_blow.shape[1]
    else:
        nsamples = 1

    assert mask is not None

    state_blow = tensor.dot(state_blow, self.w) + tensor.dot(tag_blow, self.v) + self.b

    results, updates = theano.scan(
        fn=self._step,
        sequences=[mask, state_blow],
        outputs_info=[tensor.alloc(numpy_floatX(0.), nsamples, self.mem_dim),
                      tensor.alloc(numpy_floatX(0.), nsamples, self.mem_dim)],
        n_steps=nsteps,
        name=self.name + '_layer'
    )
    return results[0]
def lstm_function(state_below, n_hidden, W, U, b, prefix="lstm", truncate_gradient=-1):
    def _slice(_x, n, dim):
        return _x[n * dim:(n + 1) * dim]

    def _step(x_, h_, c_):
        preact = tensor.dot(h_, U)
        preact += x_

        i = nnet.sigmoid(_slice(preact, 0, n_hidden))
        f = nnet.sigmoid(_slice(preact, 1, n_hidden))
        o = nnet.sigmoid(_slice(preact, 2, n_hidden))
        c = tensor.tanh(_slice(preact, 3, n_hidden))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)

        return h, c

    init_hidden = tensor.alloc(numpy_floatX(0.), n_hidden)
    state_below = tensor.dot(state_below, W) + b
    rval, updates = theano.scan(_step,
                                sequences=[state_below],
                                outputs_info=[init_hidden,
                                              tensor.alloc(numpy_floatX(0.), n_hidden)],
                                name=_p(prefix, '_layers'),
                                truncate_gradient=truncate_gradient)
    return rval[0]
def fprop(self, data): if self.use_ground_truth: self.input_space.validate(data) features, phones = data init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid) init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1) init_out = T.unbroadcast(init_out, 0) fn = lambda f, p, h, o: self.fprop_step(f, p, h, o) ((h, out), updates) = theano.scan(fn=fn, sequences=[features, phones], outputs_info=[dict(initial=init_h, taps=[-1]), init_out]) return out else: self.input_space.validate(data) features, phones = data init_in = features[0] init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid) init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1) init_out = T.unbroadcast(init_out, 0) fn = lambda t, p, f, h, o: self.fprop_step_prime(t, p, f, h, o) ((f, h, out), updates) = theano.scan(fn=fn, sequences=[features, phones], outputs_info=[init_in, dict(initial=init_h, taps=[-1]), init_out]) return out
def outputs_info(self, n_samples):
    # initialize hidden states: c, h
    shape = (n_samples,) + self.output_shape
    return [
        T.unbroadcast(T.alloc(numpy.asarray(0., dtype=theano.config.floatX),
                              *shape), *range(len(shape))),  # c
        T.unbroadcast(T.alloc(numpy.asarray(0., dtype=theano.config.floatX),
                              *shape), *range(len(shape)))   # h
    ]
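# Self-contained sketch (an assumption: plain Theano, outside the class above) of the
# pattern used in outputs_info: the T.alloc initial state is wrapped in T.unbroadcast
# so its broadcast pattern matches what the step function returns, which matters in
# particular when an allocated dimension is the constant 1.
import numpy
import theano
import theano.tensor as T

dim = 4
x = T.tensor3('x')                                  # (time, batch=1, feat)
h0 = T.alloc(numpy.asarray(0., dtype=theano.config.floatX), 1, dim)
h0 = T.unbroadcast(h0, 0, 1)                        # pin both dimensions, as above
h, _ = theano.scan(lambda x_t, h_tm1: h_tm1 + x_t, sequences=x, outputs_info=h0)
f = theano.function([x], h[-1])
print(f(numpy.ones((3, 1, dim), dtype=theano.config.floatX)))  # cumulative sum over time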
def encode_lstm(self, x, mask):
    def _step(m_tm_1, x_t, h_tm_1, c_tm_1):
        lstm_preactive = T.dot(h_tm_1, self.encode_U) + \
                         T.dot(x_t, self.encode_W) + \
                         self.encode_b

        i = T.nnet.sigmoid(lstm_preactive[:, 0:self.hidden_dim])
        f = T.nnet.sigmoid(lstm_preactive[:, self.hidden_dim:self.hidden_dim * 2])
        o = T.nnet.sigmoid(lstm_preactive[:, self.hidden_dim * 2:self.hidden_dim * 3])
        c = T.tanh(lstm_preactive[:, self.hidden_dim * 3:self.hidden_dim * 4])

        c = f * c_tm_1 + i * c
        c = m_tm_1[:, None] * c + (1. - m_tm_1)[:, None] * c_tm_1
        h = o * T.tanh(c)
        h = m_tm_1[:, None] * h + (1. - m_tm_1)[:, None] * h_tm_1

        return [h, c]

    h0 = T.alloc(0., x.shape[1], self.hidden_dim)
    c0 = T.alloc(0., x.shape[1], self.hidden_dim)

    rval, updates = theano.scan(
        fn=_step,
        sequences=[mask, x],
        outputs_info=[h0, c0]
    )
    h_list, c_list = rval
    return h_list
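# Hedged NumPy sketch (not from the source) of the masking trick used in _step above:
# for padded timesteps (mask == 0) the freshly computed h is discarded and the previous
# value is carried forward, so shorter sequences in a batch keep their last real state.
import numpy as np

mask = np.array([1., 0.])                       # sample 0 is real, sample 1 is padding
h_prev = np.array([[0.1, 0.2], [0.3, 0.4]])
h_new = np.array([[0.9, 0.9], [0.9, 0.9]])
h = mask[:, None] * h_new + (1. - mask[:, None]) * h_prev
print(h)                                        # row 0 updated, row 1 unchanged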
def encode(self, state_below): """ :development: (1) may need to prepend encoding_length * padding array to the state_below to produce the same length sequence as state_below (2) can return an offset encoding by only returing certain indices of the encoding (though this is pretty wasteful) :type state_below: 2d tensor :param state_below: the enitre sequence of states from the layer below the current one :type rval: 2d tensor :param rval: an encoding of the state_below (the entire sequence of state) to be passed to the above layer """ # to make the encodings start with the first state in state_below, prepend encoding_length vectors of value zero zeros = T.alloc(np.cast[theano.config.floatX](0), self.encoding_length - 1, self.n_hid) state_below = T.concatenate((zeros, state_below)) encoding_0 = T.alloc(np.cast[theano.config.floatX](0), self.n_hid) # negative, reverse indicies for the taps # e.g., [-4, -3, -2, -1, -0] would pass those indicies from state_below to the encode_step taps = [-1 * tap for tap in range(self.encoding_length)[::-1]] encodings, updates = scan( fn=self.encode_subsequence, sequences=dict(input=state_below, taps=taps), outputs_info=[encoding_0] ) return encodings
def arc_distance_theano_alloc_prepare(dtype='float64'):
    """
    Calculates the pairwise arc distance between all points in vector a and b.
    """
    a = tensor.matrix(dtype=str(dtype))
    b = tensor.matrix(dtype=str(dtype))
    # Theano doesn't implement all cases of tile, so we do the equivalent with alloc.
    # theta_1 = tensor.tile(a[:, 0], (b.shape[0], 1)).T
    theta_1 = tensor.alloc(a[:, 0], b.shape[0], b.shape[0]).T
    phi_1 = tensor.alloc(a[:, 1], b.shape[0], b.shape[0]).T

    theta_2 = tensor.alloc(b[:, 0], a.shape[0], a.shape[0])
    phi_2 = tensor.alloc(b[:, 1], a.shape[0], a.shape[0])

    temp = (tensor.sin((theta_2 - theta_1) / 2)**2
            + tensor.cos(theta_1) * tensor.cos(theta_2)
            * tensor.sin((phi_2 - phi_1) / 2)**2)
    distance_matrix = 2 * (tensor.arctan2(tensor.sqrt(temp),
                                          tensor.sqrt(1 - temp)))
    name = "arc_distance_theano_alloc"
    rval = theano.function([a, b],
                           distance_matrix,
                           name=name)
    rval.__name__ = name

    return rval
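# A short usage sketch (assumed): compile the alloc-based arc-distance kernel and
# evaluate it on random (theta, phi) pairs of equal length; the result is an n x n
# pairwise distance matrix.
import numpy as np

arc_dist = arc_distance_theano_alloc_prepare('float64')
pts_a = np.random.rand(5, 2)
pts_b = np.random.rand(5, 2)
print(arc_dist(pts_a, pts_b).shape)  # (5, 5)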
def sample(self, x0=None, h0=None, c0=None, n_samples=10, n_steps=10, condition_on=None, debug=False): if x0 is None: x0, _ = self.output_net.sample( p=T.constant(0.5).astype(floatX), size=(n_samples, self.output_net.dim_out)).astype(floatX) if h0 is None: h0 = T.alloc(0., x0.shape[0], self.dim_h).astype(floatX) if c0 is None: c0 = T.alloc(0., x0.shape[0], self.dim_h).astype(floatX) z0 = self.output_net.preact(h0) seqs = [] outputs_info = [h0, c0, x0, None] non_seqs = [] step = self.step_sample p0 = self.output_net.distribution(z0) non_seqs += self.get_sample_params() if debug: return self.step_sample(h0, x0, *self.get_sample_params()) outs = scan(step, seqs, outputs_info, non_seqs, n_steps, name=self.name+'_sampling', strict=False) (h, c, x, p), updates = outs x = concatenate([x0[None, :, :], x]) h = concatenate([h0[None, :, :], h]) p = concatenate([p0[None, :, :], p]) return OrderedDict(x=x, p=p, h=h, x0=x0, p0=p0, h0=h0), updates
def __call__(self, input):
    nh = self.hidden_size

    # _in: input of t
    # _m : output of t - 1
    # _c : memory of t - 1
    def _step(_in, _m, _c, nh):
        _x = tensor.concatenate([numpy.asarray([1.], dtype=numpy.float32), _in, _m])
        ifog = tensor.dot(_x, self.W)

        i = tensor.nnet.sigmoid(ifog[:nh])
        f = tensor.nnet.sigmoid(ifog[nh:2 * nh])
        o = tensor.nnet.sigmoid(ifog[2 * nh:3 * nh])
        g = tensor.tanh(ifog[3 * nh:])

        _c = f * _c + i * g
        _m = o * _c
        return _m, _c

    self._step = _step

    results, update = theano.scan(
        _step,
        sequences=[input],
        outputs_info=[tensor.alloc(0.0, nh), tensor.alloc(0.0, nh)],
        non_sequences=[self.hidden_size]
    )
    return results[0]  # (_m_list, _c_list)[0]
def decode(self, hidden):
    hidden_ = T.alloc(0., *self.hidden_shape)
    deconv_out = T.alloc(0., *self.output_shape)

    # Zero-pad the hidden representation along the last axis
    hidden_ = T.set_subtensor(hidden_[:, :, :, self.filter_shape[3] - 1:], hidden)

    # Calculate output
    conv_odd = conv.conv2d(
        input=hidden_,
        filters=self.W_odd,
        filter_shape=self.filter_shape,
        image_shape=self.hidden_shape,)
    conv_even = conv.conv2d(
        input=hidden_,
        filters=self.W_even,
        filter_shape=self.filter_shape,
        image_shape=self.hidden_shape,)

    deconv_out = T.set_subtensor(deconv_out[:, :, :, ::2], conv_odd)
    deconv_out = T.set_subtensor(deconv_out[:, :, :, 1::2], conv_even)

    linout = deconv_out + self.b.dimshuffle('x', 0, 'x', 'x')

    if self.dec_hid == 'tanh':
        convout = T.tanh(linout)
    elif self.dec_hid == 'lin':
        convout = linout
    elif self.dec_hid == 'relu':
        convout = linout * (linout > 0.) + 0. * (linout < 0.)
    else:
        raise ValueError('Invalid dec_hid')

    #### Recurrent connection ####
    return convout
def generate_lstm(self, context):
    x0 = T.alloc(0., context.shape[0], self.embedding_dim)
    h0 = T.alloc(0., context.shape[0], self.hidden_dim)
    c0 = T.alloc(0., context.shape[0], self.hidden_dim)

    def _step(x_tm_1, h_tm_1, c_tm_1):
        lstm_preactive = T.dot(h_tm_1, self.decode_U) + \
                         T.dot(context, self.decode_V) + \
                         T.dot(x_tm_1, self.decode_W) + \
                         self.decode_b

        i = T.nnet.sigmoid(lstm_preactive[:, 0:self.hidden_dim])
        f = T.nnet.sigmoid(lstm_preactive[:, self.hidden_dim:self.hidden_dim * 2])
        o = T.nnet.sigmoid(lstm_preactive[:, self.hidden_dim * 2:self.hidden_dim * 3])
        c = T.tanh(lstm_preactive[:, self.hidden_dim * 3:self.hidden_dim * 4])

        c = f * c_tm_1 + i * c
        h = o * T.tanh(c)

        x_emb = T.dot(h, self.output_W) + self.output_b   # (n_samples, embedding_dim)
        x_word = T.dot(x_emb, self.word_W) + self.word_b  # (n_samples, n_words)
        x_index = T.argmax(x_word, axis=1)
        x = self.emb[x_index]

        return [x, h, c]

    rval, updates = theano.scan(
        fn=_step,
        outputs_info=[x0, h0, c0],
        n_steps=20)

    generated_sequence = rval[0]
    return generated_sequence
def ENCODER_R(X, tparams, options): # (tensor.alloc(numpy_floatX(1.), options['hidden_size'], 1)-tensor.nnet.sigmoid(tensor.dot(tparams['Wr_Z'], xr) + tensor.dot(tparams['Ur_Z'], hr_tm1))) * hr_tm1\ # + tensor.nnet.sigmoid(tensor.dot(tparams['Wr_Z'], xr) + tensor.dot(tparams['Ur_Z'], hr_tm1)) * tensor.tanh(\ # tensor.dot(tparams['Wr'], xr) + tensor.dot(tparams['Ur'], \ # (tensor.nnet.sigmoid(tensor.dot(tparams['Wr_R'], xr) + \ # tensor.dot(tparams['Ur_R'], hr_tm1)) * hr_tm1)\ # )\ # ) # (tensor.alloc(numpy_floatX(1.), options['hidden_size'])-tensor.nnet.sigmoid(tensor.dot\ # (tparams["Emb"][xr], tparams['Wr_Z']) + tensor.dot(hr_tm1, tparams['Ur_Z']))) * hr_tm1\ # + tensor.nnet.sigmoid(tensor.dot(tparams["Emb"][xr], tparams['Wr_Z']) + tensor.dot(hr_tm1, \ # tparams['Ur_Z'])) * tensor.tanh(tensor.dot(tparams["Emb"][xr], tparams['Wr']) + \ # tensor.dot((tensor.nnet.sigmoid(tensor.dot(tparams["Emb"][xr], tparams['Wr_R']) + tensor\ # .dot(hr_tm1, tparams['Ur_R'])) * hr_tm1) , tparams['Ur']))\ # tparams["Emb"][xr] # X_Vec = word2VecLayer(X, tparams) results_r, updates = theano.scan(lambda xr, hr_tm1: (tensor.alloc(numpy_floatX(1.), options['hidden_size'])\ -tensor.nnet.sigmoid(tensor.dot(tparams["Emb"][xr], tparams['Wr_Z']) + tensor.dot(hr_tm1, tparams['Ur_Z']))) * hr_tm1\ + tensor.nnet.sigmoid(tensor.dot(tparams["Emb"][xr], tparams['Wr_Z']) + tensor.dot(hr_tm1, \ tparams['Ur_Z'])) * tensor.tanh(tensor.dot(tparams["Emb"][xr], tparams['Wr']) + \ tensor.dot((tensor.nnet.sigmoid(tensor.dot(tparams["Emb"][xr], tparams['Wr_R']) + tensor.\ dot(hr_tm1, tparams['Ur_R'])) * hr_tm1) , tparams['Ur']))\ , sequences=[X], outputs_info=tensor.alloc(numpy_floatX(0.), options['hidden_size'])) #initial value of the scan can only be vec return results_r # [hi_right] # return[ (n,) *l ] that is [(1*n) * l]
def __init__(self, input_var, layerid, sequence, n_input_channels=1,
             height=3, width=3, n_filters=8):
    X = input_var
    imH, imW = X.shape[-2], X.shape[-1]
    H, W, F, C = height, width, n_filters, n_input_channels
    Tt, N = input_var.shape[0], input_var.shape[1]
    self.n_filters = n_filters

    self.Wx = shared(self.glorot_init(H * W * F, 4 * F, C, H, W), name='Wx' + layerid)
    self.Wh = shared(self.glorot_init(4 * H * W * F, 4 * F, F, H, W), name='Wh' + layerid)
    self.b = shared(np.zeros(4 * F, dtype=np.float32), name='b' + layerid)

    self.params = {
        self.Wx.name: self.Wx,
        self.Wh.name: self.Wh,
        self.b.name: self.b
    }

    [h, c], _ = scan(self.step,
                     sequences=[X],
                     outputs_info=[
                         T.alloc(np.cast['float32'](0), N, F, imH, imW),
                         T.alloc(np.cast['float32'](0), N, F, imH, imW)
                     ])
    self.output = h
def compute_Gv(*args): cgv = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name='cgv%d' % idx) for idx, shp in enumerate(model.params_shape) ] print_mem('allocated mem for cgv') idx0 = const([0]) ep = [ TT.alloc(const(0), 1, *shp) for shp in model.params_shape ] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(cgv, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const( options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [ TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:] ] grad_inps = zip(loc_inputs, shared_data) loc_fn = theano.function([], final_Gvs, updates=updates, givens=dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile=options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) return fake_op(*args), {}
def gru_cond_layer(tparams, state_below, options, prefix='gru',
                   mask=None, context=None, one_step=False,
                   init_memory=None, init_state=None,
                   context_mask=None, emb_dropout=None,
                   rec_dropout=None, ctx_dropout=None,
                   pctx_=None, truncate_gradient=-1,
                   profile=False, **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    dim = tparams[pp(prefix, 'Wcx')].shape[1]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)

    # projected context
    assert context.ndim == 3, 'Context must be 3-d: #annotation x #sample x dim'
    if pctx_ is None:
        pctx_ = tensor.dot(context*ctx_dropout[0], tparams[pp(prefix, 'Wc_att')]) +\
            tparams[pp(prefix, 'b_att')]

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    # state_below is the previous output word embedding
    state_belowx = tensor.dot(state_below*emb_dropout[0], tparams[pp(prefix, 'Wx')]) +\
        tparams[pp(prefix, 'bx')]
    state_below_ = tensor.dot(state_below*emb_dropout[1], tparams[pp(prefix, 'W')]) +\
        tparams[pp(prefix, 'b')]

    def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_, rec_dropout, ctx_dropout,
                    U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
                    U_nl, Ux_nl, b_nl, bx_nl):
        preact1 = tensor.dot(h_ * rec_dropout[0], U)
        preact1 += x_
        preact1 = tensor.nnet.sigmoid(preact1)

        r1 = _slice(preact1, 0, dim)
        u1 = _slice(preact1, 1, dim)

        preactx1 = tensor.dot(h_ * rec_dropout[1], Ux)
        preactx1 *= r1
        preactx1 += xx_

        h1 = tensor.tanh(preactx1)

        h1 = u1 * h_ + (1. - u1) * h1
        h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_

        # attention
        pstate_ = tensor.dot(h1 * rec_dropout[2], W_comb_att)
        pctx__ = pctx_ + pstate_[None, :, :]
        #pctx__ += xc_
        pctx__ = tensor.tanh(pctx__)
        alpha = tensor.dot(pctx__ * ctx_dropout[1], U_att) + c_tt
        alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
        alpha = tensor.exp(alpha - alpha.max(0, keepdims=True))
        if context_mask:
            alpha = alpha * context_mask
        alpha = alpha / alpha.sum(0, keepdims=True)
        ctx_ = (cc_ * alpha[:, :, None]).sum(0)  # current context

        preact2 = tensor.dot(h1 * rec_dropout[3], U_nl) + b_nl
        preact2 += tensor.dot(ctx_ * ctx_dropout[2], Wc)
        preact2 = tensor.nnet.sigmoid(preact2)

        r2 = _slice(preact2, 0, dim)
        u2 = _slice(preact2, 1, dim)

        preactx2 = tensor.dot(h1 * rec_dropout[4], Ux_nl) + bx_nl
        preactx2 *= r2
        preactx2 += tensor.dot(ctx_ * ctx_dropout[3], Wcx)

        h2 = tensor.tanh(preactx2)

        h2 = u2 * h1 + (1. - u2) * h2
        h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1

        return h2, ctx_, alpha.T  # pstate_, preact, preactx, r, u

    seqs = [mask, state_below_, state_belowx]
    #seqs = [mask, state_below_, state_belowx, state_belowc]
    _step = _step_slice

    shared_vars = [tparams[pp(prefix, 'U')],
                   tparams[pp(prefix, 'Wc')],
                   tparams[pp(prefix, 'W_comb_att')],
                   tparams[pp(prefix, 'U_att')],
                   tparams[pp(prefix, 'c_tt')],
                   tparams[pp(prefix, 'Ux')],
                   tparams[pp(prefix, 'Wcx')],
                   tparams[pp(prefix, 'U_nl')],
                   tparams[pp(prefix, 'Ux_nl')],
                   tparams[pp(prefix, 'b_nl')],
                   tparams[pp(prefix, 'bx_nl')]]

    if one_step:
        rval = _step(*(seqs + [init_state, None, None, pctx_, context,
                               rec_dropout, ctx_dropout] + shared_vars))
    else:
        rval, updates = theano.scan(
            _step,
            sequences=seqs,
            outputs_info=[init_state,
                          tensor.alloc(0., n_samples, context.shape[2]),
                          tensor.alloc(0., n_samples, context.shape[0])],
            non_sequences=[pctx_, context, rec_dropout, ctx_dropout] + shared_vars,
            name=pp(prefix, '_layers'),
            n_steps=nsteps,
            truncate_gradient=truncate_gradient,
            profile=profile,
            strict=True)
    return rval
def gru_layer(tparams, state_below, options, prefix='gru', mask=None, emb_dropout=None, rec_dropout=None, truncate_gradient=-1, profile=False, **kwargs): nsteps = state_below.shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 dim = tparams[pp(prefix, 'Ux')].shape[1] if mask is None: mask = tensor.alloc(1., state_below.shape[0], 1) # utility function to slice a tensor def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] # state_below is the input word embeddings # input to the gates, concatenated state_below_ = tensor.dot(state_below*emb_dropout[0], tparams[pp(prefix, 'W')]) + \ tparams[pp(prefix, 'b')] # input to compute the hidden state proposal state_belowx = tensor.dot(state_below*emb_dropout[1], tparams[pp(prefix, 'Wx')]) + \ tparams[pp(prefix, 'bx')] # step function to be used by scan # arguments | sequences |outputs-info| non-seqs def _step_slice(m_, x_, xx_, h_, U, Ux, rec_dropout): preact = tensor.dot(h_ * rec_dropout[0], U) preact += x_ # reset and update gates r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) # compute the hidden state proposal preactx = tensor.dot(h_ * rec_dropout[1], Ux) preactx = preactx * r preactx = preactx + xx_ # hidden state proposal h = tensor.tanh(preactx) # leaky integrate and obtain next hidden state h = u * h_ + (1. - u) * h h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h # prepare scan arguments seqs = [mask, state_below_, state_belowx] init_states = [tensor.alloc(0., n_samples, dim)] _step = _step_slice shared_vars = [ tparams[pp(prefix, 'U')], tparams[pp(prefix, 'Ux')], rec_dropout ] rval, updates = theano.scan(_step, sequences=seqs, outputs_info=init_states, non_sequences=shared_vars, name=pp(prefix, '_layers'), n_steps=nsteps, truncate_gradient=truncate_gradient, profile=profile, strict=True) rval = [rval] return rval
def gru_layer(tparams, state_below, options, prefix='gru', mask=None, **kwargs):
    """
    Forward pass through GRU layer
    """
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    dim = tparams[_p(prefix, 'Ux')].shape[1]

    if mask == None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]
    U = tparams[_p(prefix, 'U')]
    Ux = tparams[_p(prefix, 'Ux')]

    def _step_slice(m_, x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        h = tensor.tanh(preactx)

        h = u * h_ + (1. - u) * h
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h

    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice

    rval, updates = theano.scan(
        _step,
        sequences=seqs,
        outputs_info=[tensor.alloc(0., n_samples, dim)],
        non_sequences=[tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]],
        name=_p(prefix, '_layers'),
        n_steps=nsteps,
        profile=profile,
        strict=True)
    rval = [rval]
    return rval
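# Hedged NumPy sketch of a single GRU step mirroring _step_slice above (assumption:
# x_ = xW + b and xx_ = xWx + bx were precomputed outside the loop, as in the code).
import numpy as np

def sigmoid(z):
    return 1. / (1. + np.exp(-z))

def gru_step(x_, xx_, h_prev, U, Ux, dim):
    preact = h_prev.dot(U) + x_
    r = sigmoid(preact[:, :dim])              # reset gate
    u = sigmoid(preact[:, dim:2 * dim])       # update gate
    h_tilde = np.tanh(r * h_prev.dot(Ux) + xx_)
    return u * h_prev + (1. - u) * h_tilde    # leaky integration, as in the layer

dim, batch = 3, 2
h = gru_step(np.zeros((batch, 2 * dim)), np.zeros((batch, dim)), np.zeros((batch, dim)),
             np.zeros((dim, 2 * dim)), np.zeros((dim, dim)), dim)
print(h.shape)  # (2, 3)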
def BlockGLSTMScanArrayToArray(rng, inlayer, szgate, szhidden, blocksize=10, warmup=10, outf=T.tanh, noot=False, backwards=False, shareLayer=None, warmupHidden=None, warmupOut=None): if backwards: inout = inlayer.output[::-1] else: inout = inlayer.output if warmupHidden != None: if backwards: whid = warmupHidden.output[::-1] else: whid = warmupHidden.output if warmupOut != None: if backwards: wout = warmupOut.output[::-1] else: wout = warmupOut.output #PrepareData totblks = (inlayer.output.shape[0] + blocksize - 1) / blocksize def oneStep(inp, laststate, lastout): inl = SymbolLayer(inp, (totblks, inlayer.output_shape[1])) lstmout = LCollect( GLSTM(rng, inl, laststate, lastout, szgate, szhidden, outf=outf, noot=noot, shareLayer=shareLayer)) return lstmout.hidden, lstmout.output stackinp = T.alloc(dtypeX(0), totblks, blocksize + warmup, inlayer.output_shape[1]) #Fill block data stackinp = T.set_subtensor( stackinp[:-1, warmup:], inout[:(totblks - 1) * blocksize].reshape( (totblks - 1, blocksize, inlayer.output.shape[1]))) stackinp = T.set_subtensor( stackinp[-1, warmup:warmup + inlayer.output.shape[0] - (totblks - 1) * blocksize], inout[(totblks - 1) * blocksize:].reshape( (inlayer.output.shape[0] - (totblks - 1) * blocksize, inlayer.output.shape[1]))) #Fill block warmup data stackinp = T.set_subtensor(stackinp[1:, :warmup], stackinp[:-1, -warmup:]) stackinp = stackinp.dimshuffle(1, 0, 2) LPush() #A large number firsthidden = T.alloc( dtypeX(0), totblks, szhidden ) #T.as_tensor_variable(np.zeros((1000,szhidden),'f'))[:totblks] if warmupHidden: firsthidden = T.set_subtensor( firsthidden[warmup / blocksize + 1:], whid[-warmup + blocksize * (warmup / blocksize + 1):-warmup + blocksize * totblks:blocksize]) firstout = T.alloc( dtypeX(0), totblks, szhidden ) #T.as_tensor_variable(np.zeros((1000,szhidden),'f'))[:totblks] if warmupOut: firstout = T.set_subtensor( firstout[warmup / blocksize + 1:], wout[-warmup + blocksize * (warmup / blocksize + 1):-warmup + blocksize * totblks:blocksize]) (hiddens, outs), updates = theano.scan(fn=oneStep, outputs_info=[firsthidden, firstout], sequences=stackinp) lstml = LPop()[0] #ExpandData hiddens = hiddens.dimshuffle(1, 0, 2) hiddens = hiddens[:, warmup:].reshape( (totblks * blocksize, szhidden))[:inlayer.output.shape[0]] outs = outs.dimshuffle(1, 0, 2) outs = outs[:, warmup:].reshape( (totblks * blocksize, szhidden))[:inlayer.output.shape[0]] if backwards: hiddens = hiddens[::-1] outs = outs[::-1] global extraHid extraHid = SymbolLayer(hiddens, (inlayer.output_shape[0], szhidden)) return SymbolLayer(outs, (inlayer.output_shape[0], szhidden)), lstml
def BlockLSTMUnrollArrayToArray(rng, inlayer, szhidden, blocksize=10, warmup=10, outf=T.tanh, noot=False, backwards=False, shareLayer=None, warmupHidden=None, warmupOut=None): if backwards: inout = inlayer.output[::-1] else: inout = inlayer.output if warmupHidden != None: if backwards: whid = warmupHidden.output[::-1] else: whid = warmupHidden.output if warmupOut != None: if backwards: wout = warmupOut.output[::-1] else: wout = warmupOut.output #PrepareData totblks = (inlayer.output.shape[0] + blocksize - 1) / blocksize def oneStep(inp, laststate, lastout): inl = SymbolLayer(inp, (totblks, inlayer.output_shape[1])) lstmout = LSTM(rng, inl, laststate, lastout, szhidden, outf=outf, noot=noot, shareLayer=shareLayer) return lstmout.hidden, lstmout.output, lstmout stackinp = T.alloc(dtypeX(0), totblks, blocksize + warmup, inlayer.output_shape[1]) #Fill block data stackinp = T.set_subtensor( stackinp[:-1, warmup:], inout[:(totblks - 1) * blocksize].reshape( (totblks - 1, blocksize, inlayer.output.shape[1]))) stackinp = T.set_subtensor( stackinp[-1, warmup:warmup + inlayer.output.shape[0] - (totblks - 1) * blocksize], inout[(totblks - 1) * blocksize:].reshape( (inlayer.output.shape[0] - (totblks - 1) * blocksize, inlayer.output.shape[1]))) #Fill block warmup data stackinp = T.set_subtensor(stackinp[1:, :warmup], stackinp[:-1, -warmup:]) stackinp = stackinp.dimshuffle(1, 0, 2) #A large number firsthidden = T.alloc( dtypeX(0), totblks, szhidden ) #T.as_tensor_variable(np.zeros((1000,szhidden),'f'))[:totblks] if warmupHidden: firsthidden = T.set_subtensor( firsthidden[warmup / blocksize + 1:], whid[-warmup + blocksize * (warmup / blocksize + 1):-warmup + blocksize * totblks:blocksize]) firstout = T.alloc( dtypeX(0), totblks, szhidden ) #T.as_tensor_variable(np.zeros((1000,szhidden),'f'))[:totblks] if warmupOut: firstout = T.set_subtensor( firstout[warmup / blocksize + 1:], wout[-warmup + blocksize * (warmup / blocksize + 1):-warmup + blocksize * totblks:blocksize]) hiddens = [] outs = [] firstshare = None for i in range(warmup): firsthidden, firstout, shareLayer = oneStep(stackinp[i], firsthidden, firstout) if firstshare == None: firstshare = shareLayer for i in range(blocksize): firsthidden, firstout, shareLayer = oneStep(stackinp[i + warmup], firsthidden, firstout) if firstshare == None: firstshare = shareLayer hiddens.append(firsthidden) outs.append(firstout) hiddens = T.stack(*hiddens) outs = T.stack(*outs) #ExpandData (warmup is automatically eatten) hiddens = hiddens.dimshuffle(1, 0, 2) hiddens = hiddens.reshape( (totblks * blocksize, szhidden))[:inlayer.output.shape[0]] outs = outs.dimshuffle(1, 0, 2) outs = outs.reshape( (totblks * blocksize, szhidden))[:inlayer.output.shape[0]] if backwards: hiddens = hiddens[::-1] outs = outs[::-1] global extraHid extraHid = SymbolLayer(hiddens, (inlayer.output_shape[0], szhidden)) return SymbolLayer(outs, (inlayer.output_shape[0], szhidden)), firstshare
def __init__(self, options, channel, data, model): """ Parameters: options: Dictionary `options` is expected to contain the following keys: `cbs` -> int Number of samples to consider at a time when computing some property of the model `gbs` -> int Number of samples over which to compute the gradients `mbs` -> int Number of samples over which to compute the metric `ebs` -> int Number of samples over which to evaluate the training error `mreg` -> float Regularization added to the metric `mrtol` -> float Relative tolerance for inverting the metric `miters` -> int Number of iterations `seed` -> int Random number generator seed `profile` -> bool Flag, if profiling should be on or not `verbose` -> int Verbosity level `lr` -> float Learning rate channel: jobman channel or None data: dictionary-like object return by numpy.load containing the data model : model """ n_params = len(model.params) self.data = data if options['device'] != 'gpu': xdata = theano.shared(data['train_x'][:options['gbs']], name='xdata') ydata = TT._shared(data['train_y'][:options['gbs']], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] else: self.cpu_shared_data = [] xdata = theano.shared(data['train_x'], name='xdata') ydata = TT._shared(data['train_y'], name='ydata') self.xdata = xdata self.ydata = ydata shared_data = [xdata, ydata] self.rng = numpy.random.RandomState(options['seed']) n_samples = data['train_x'].shape[0] self.grad_batches = n_samples // options['gbs'] self.metric_batches = n_samples // options['mbs'] self.eval_batches = n_samples // options['ebs'] self.verbose = options['verbose'] if options['device'] != 'gpu': # Store eucledian gradients self.gs = [ TT._shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store riemannian gradients self.rs = [ TT._shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] else: # Store eucledian gradients self.gs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] # Store riemannian gradients self.rs = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX)) for shp in model.params_shape ] self.permg = self.rng.permutation(self.grad_batches) self.permr = self.rng.permutation(self.metric_batches) self.perme = self.rng.permutation(self.eval_batches) self.k = 0 self.posg = 0 self.posr = 0 self.pose = 0 # Step 1. 
Compile function for computing eucledian gradients # inputs gbdx = TT.iscalar('grad_batch_idx') print 'Constructing grad function' srng = RandomStreams(numpy.random.randint(1e5)) loc_inputs = [x.type() for x in model.inputs] def grad_step(*args): idx = TT.cast(args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = safe_clone(model.train_cost, replace=replace) gs = TT.grad(nw_cost, model.params) nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)] return [args[0] + const(1)] + \ nw_gs ig = [ TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0) for shp in model.params_shape ] idx0 = TT.unbroadcast(const([0]), 0) n_steps = options['gbs'] // options['cbs'] rvals, updates = scan(grad_step, states=[idx0] + ig, n_steps=n_steps, name='grad_loop', profile=options['profile']) nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]] # updates updates.update(dict(zip(self.gs, nw_gs))) # givens if options['device'] == 'gpu': grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']]) for x, y in zip(loc_inputs, shared_data)] else: grad_inps = zip(loc_inputs, shared_data) print 'Compiling grad function' self.compute_eucledian_gradients = theano.function( [gbdx], [], updates=updates, givens=dict(grad_inps), name='compute_eucledian_gradients', mode=gpu_mode, on_unused_input='warn', profile=options['profile']) # Step 2. Compile function for Computing Riemannian gradients rbdx = TT.iscalar('riemmanian_batch_idx') rbpos = rbdx * options['mbs'] if options['device'] == 'gpu': mode = gpu_mode def compute_Gv(*args): idx0 = const([0]) ep = [ TT.alloc(const(0), 1, *shp) for shp in model.params_shape ] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const( options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] return final_Gvs, updates else: mode = cpu_mode def compute_Gv(*args): cgv = [ theano.shared(numpy.zeros(shp, dtype=theano.config.floatX), name='cgv%d' % idx) for idx, shp in 
enumerate(model.params_shape) ] print_mem('allocated mem for cgv') idx0 = const([0]) ep = [ TT.alloc(const(0), 1, *shp) for shp in model.params_shape ] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(cgv, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const( options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=gpu_mode, name='Gv_step', profile=options['profile']) final_Gvs = [ TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:] ] grad_inps = zip(loc_inputs, shared_data) loc_fn = theano.function([], final_Gvs, updates=updates, givens=dict(grad_inps), on_unused_input='warn', mode=gpu_mode, name='loc_fn', profile=options['profile']) fake_op = FakeGPUShell(cgv, loc_fn, len(cgv)) return fake_op(*args), {} print 'Constructing riemannian gradient function' norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs)) rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs], rtol=options['mrtol'], shift=-options['mreg'], maxit=options['miters'], mode=mode, profile=options['profile']) nw_rs = [x * norm_grads for x in rvals[0]] flag = rvals[1] niters = rvals[2] rel_residual = rvals[3] rel_Aresidual = rvals[4] Anorm = rvals[5] Acond = rvals[6] xnorm = rvals[7] Axnorm = rvals[8] updates = rvals[9] norm_ord0 = TT.max(abs(nw_rs[0])) for r in nw_rs[1:]: norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r))) updates.update(dict(zip(self.rs, nw_rs))) grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']]) for x, y in zip(loc_inputs[:1], shared_data[:1])] print 'Compiling riemannian gradient function' self.compute_riemannian_gradients = theano.function( [rbdx], [ flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm, Axnorm, norm_grads, norm_ord0 ], updates=updates, givens=dict(grad_inps), name='compute_riemannian_gradients', on_unused_input='warn', mode=mode, profile=options['profile']) # Step 3. 
Compile function for evaluating cost and updating # parameters print 'constructing evaluation function' lr = TT.scalar('lr') self.lr = numpy.float32(options['lr']) ebdx = TT.iscalar('eval_batch_idx') nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)] def cost_step(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps)) nw_cost = safe_clone(model.train_cost, replace=replace) return [_idx + const(1), acc + nw_cost] acc0 = const([0]) idx0 = const([0]) n_steps = options['ebs'] // options['cbs'] rvals, updates = scan(cost_step, states=[idx0, acc0], n_steps=n_steps, name='cost_loop', mode=gpu_mode, profile=options['profile']) final_cost = rvals[1] / const(n_steps) if options['device'] == 'gpu': grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']]) for x, y in zip(loc_inputs, shared_data)] else: grad_inps = zip(loc_inputs, shared_data) print 'compling evaluation function' self.eval_fn = theano.function([ebdx, lr], final_cost, givens=dict(grad_inps), on_unused_input='warn', updates=updates, name='eval_fn', mode=gpu_mode, profile=options['profile']) update_dict = dict(zip(model.params, nw_ps)) if options['device'] != 'gpu': update_dict.update(dict(zip(model.cparams, nw_ps))) self.update_params = theano.function([lr], [], updates=update_dict, name='update_params', on_unused_input='warn', mode=mode, profile=options['profile']) self.options = options self.old_cost = 1e6 self.device = options['device'] n_steps = options['ebs'] // options['cbs'] def ls_error(_idx, acc): idx = TT.cast(_idx, 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_cost = TT.cast(safe_clone(model.err, replace=replace), 'float32') return [_idx + const(1), acc + nw_cost] states = [ TT.constant(numpy.float32([0])), TT.constant(numpy.float32([0])) ] rvals, _ = scan(ls_error, states=states, n_steps=n_steps, name='ls_err_step', mode=cpu_mode, profile=options['profile']) ferr = rvals[1][0] / const(n_steps) self.compute_error = theano.function([ebdx], ferr, givens=dict(grad_inps), name='compute_err', mode=gpu_mode, on_unused_input='warn', profile=options['profile'])
def __init__(self, enc_h, mask, emb_mat, vocab_size, emb_dim, hidden_dim, eos_token, batch_size, max_len, init='uniform', inner_init='orthonormal', activation=T.tanh, params=None, max_response=100): self.enc_h = enc_h self.mask = mask self.eos_token = eos_token self.batch_size = batch_size self.activation = activation self.max_response = max_response if params is None: self.emb = theano.shared(value=np.asarray( emb_mat, dtype=theano.config.floatX), name='emb', borrow=True) self.W = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W', borrow=True) self.U = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U', borrow=True) self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, vocab_size)), name='V', borrow=True) self.bh = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )), name='bh', borrow=True) self.by = theano.shared(value=get(identifier='zero', shape=(vocab_size, )), name='by', borrow=True) # to weight 'context' from encoder self.c_h = theano.shared(value=get(identifier=init, shape=(hidden_dim, hidden_dim)), name='c_h', borrow=True) self.c_y = theano.shared(value=get(identifier=init, shape=(hidden_dim, vocab_size)), name='c_y', borrow=True) # to weight 'y_t-1' for decoder's 'y' self.y_t1 = theano.shared(value=get(identifier=init, shape=(emb_dim, vocab_size)), name='y_t1', borrow=True) else: self.emb, self.W, self.U, self.V, self.bh, self.by, self.c_h, self.c_y, self.y_t1 = params self.params = [ self.emb, self.W, self.U, self.V, self.bh, self.by, self.c_h, self.c_y, self.y_t1 ] self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )), name='h0', borrow=True) # y(t-1) from encoder will always be 'eos' token self.y0 = theano.shared(value=np.asarray(np.full((batch_size, ), self.eos_token), dtype='int32'), name='y0', borrow=True) # remember for decoder both h_t and y_t are conditioned on 'enc_h' & 'y_t-1'. def recurrence(msk, h_tm_prev, y_tm_prev): h_t = self.activation( T.dot(self.emb[y_tm_prev], self.W) + T.dot(h_tm_prev, self.U) + T.dot(self.enc_h, self.c_h) + self.bh) # needed to back-propagate errors y_d_t = T.dot(h_t, self.V) + T.dot(self.enc_h, self.c_y) + T.dot( self.emb[y_tm_prev], self.y_t1) + self.by # ignore padded tokens y_d_t = T.batched_dot(y_d_t, msk) y_d = T.clip(T.nnet.softmax(y_d_t), 0.0001, 0.9999) y_t = T.argmax(y_d, axis=1) return h_t, y_d, T.cast(y_t.flatten(), 'int32') [_, y_dist, y], _ = theano.scan( fn=recurrence, sequences=mask.dimshuffle( 1, 0), # ugly, but we have to go till the end outputs_info=[ T.alloc(self.h0, self.enc_h.shape[0], hidden_dim), None, T.alloc(self.y0, self.enc_h.shape[0]) ], n_steps=max_len) self.y = y.dimshuffle(1, 0) self.y_dist = y_dist.dimshuffle(1, 0, 2)
def tree_lstm_layer(tparams, inputs, options, prefix='tree_lstm', **kwargs): state_below, mask, left_mask, right_mask = inputs # state_below: #step x #sample x dim_emb # mask: #step x #sample # left_mask: #step x #sample x #step # right_mask: #step x #sample x #step nsteps = state_below.shape[0] dim = tparams[_p(prefix, 'U_l')].shape[0] n_samples = state_below.shape[1] init_state = tensor.alloc(0., n_samples, nsteps, dim) init_memory = tensor.alloc(0., n_samples, nsteps, dim) # use the slice to calculate all the different gates def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] elif _x.ndim == 2: return _x[:, n * dim:(n + 1) * dim] return _x[n * dim:(n + 1) * dim] # one time step of the lstm def _step(m_, x_, left_mask_, right_mask_, counter_, h_, c_): # zero out the input unless this is a leaf node # flag = tensor.switch(tensor.eq(tensor.sum(left_mask_, axis=1) + tensor.sum(right_mask_, axis=1), 0), 1., 0.) # x_ = x_ * flag[:, None] preact_l = tensor.dot(tensor.sum(left_mask_[:, :, None] * h_, axis=1), tparams[_p(prefix, 'U_l')]) preact_r = tensor.dot(tensor.sum(right_mask_[:, :, None] * h_, axis=1), tparams[_p(prefix, 'U_r')]) x_ = concatenate([ _slice(x_, 0, dim), _slice(x_, 1, dim), _slice(x_, 1, dim), _slice(x_, 2, dim), _slice(x_, 3, dim) ], axis=1) preact = preact_l + preact_r + x_ i = tensor.nnet.sigmoid(_slice(preact, 0, dim)) fl = tensor.nnet.sigmoid(_slice(preact, 1, dim)) fr = tensor.nnet.sigmoid(_slice(preact, 2, dim)) o = tensor.nnet.sigmoid(_slice(preact, 3, dim)) u = tensor.tanh(_slice(preact, 4, dim)) c_temp = fl * tensor.sum(left_mask_[:, :, None] * c_, axis=1) \ + fr * tensor.sum(right_mask_[:, :, None] * c_, axis=1) \ + i * u h_temp = o * tensor.tanh(c_temp) h = tensor.set_subtensor(h_[:, counter_, :], h_temp) c = tensor.set_subtensor(c_[:, counter_, :], c_temp) c = m_[:, None, None] * c + (1. - m_)[:, None, None] * c_ h = m_[:, None, None] * h + (1. - m_)[:, None, None] * h_ return h, c, i, fl, fr, o state_below = tensor.dot(state_below, tparams[_p( prefix, 'W')]) + tparams[_p(prefix, 'b')] rval, updates = theano.scan( fn=_step, sequences=[ mask, state_below, left_mask, right_mask, tensor.arange(0, nsteps) ], outputs_info=[init_state, init_memory, None, None, None, None], name=_p(prefix, '_layers'), profile=False) return rval
def compute_Gv(*args): idx0 = const([0]) ep = [ TT.alloc(const(0), 1, *shp) for shp in model.params_shape ] def Gv_step(*gv_args): idx = TT.cast(gv_args[0], 'int32') nw_inps = [x[idx * options['cbs']: \ (idx + 1) * options['cbs']] for x in loc_inputs] replace = dict(zip(model.inputs, nw_inps)) nw_outs = safe_clone(model.outs, replace) final_results = dict( zip(model.params, [None] * len(model.params))) for nw_out, out_operator in zip(nw_outs, model.outs_operator): loc_params = [ x for x in model.params if x in theano.gof.graph.inputs([nw_out]) ] loc_args = [ x for x, y in zip(args, model.params) if y in theano.gof.graph.inputs([nw_out]) ] if out_operator == 'softmax': factor = const(options['cbs']) * nw_out elif out_operator == 'sigmoid': factor = const( options['cbs']) * nw_out * (1 - nw_out) else: factor = const(options['cbs']) loc_Gvs = TT.Lop(nw_out, loc_params, TT.Rop(nw_out, loc_params, loc_args) /\ factor) for lp, lgv in zip(loc_params, loc_Gvs): if final_results[lp] is None: final_results[lp] = lgv else: final_results[lp] += lgv Gvs = [ ogv + final_results[param] for (ogv, param) in zip(gv_args[1:], model.params) ] return [gv_args[0] + const(1)] + Gvs nw_cost, nw_preactiv_out = safe_clone( [model.train_cost, model.preactiv_out], replace) nw_gvs = TT.Lop( nw_preactiv_out, model.params, TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params, args)) Gvs = [ ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs) ] return [gv_args[0] + const(1)] + Gvs states = [idx0] + ep n_steps = options['mbs'] // options['cbs'] rvals, updates = scan(Gv_step, states=states, n_steps=n_steps, mode=theano.Mode(linker='cvm'), name='Gv_step', profile=options['profile']) final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]] return final_Gvs, updates
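# Illustrative aside (not part of the compute_Gv code above): the Gv_step
# routine relies on the R-op/L-op pattern Gv = Lop(out, params, Rop(out, params, v) / factor).
# Below is a minimal, self-contained sketch of the same Gauss-Newton vector
# product for a plain least-squares output, where the curvature factor reduces
# to the identity so Gv = J^T (J v). All names (x, W, b, v_W, v_b, ...) are
# invented for this illustration.
import numpy
import theano
import theano.tensor as TT

x = TT.matrix('x')
W = theano.shared(numpy.random.randn(5, 3).astype('float32'), name='W')
b = theano.shared(numpy.zeros(3, dtype='float32'), name='b')
y = TT.dot(x, W) + b                    # network output (pre-loss)

params = [W, b]
# the vector we multiply with, one piece per parameter
vs = [TT.matrix('v_W'), TT.vector('v_b')]

Jv = TT.Rop(y, params, vs)              # J v  (directional derivative of the output)
Gvs = TT.Lop(y, params, Jv)             # J^T (J v), the Gauss-Newton product for an L2 loss

f_Gv = theano.function([x] + vs, Gvs, allow_input_downcast=True)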
def build_sampler(self, **kwargs): x = tensor.matrix('x', dtype=INT) xr = x[::-1] n_timesteps = x.shape[0] n_samples = x.shape[1] # word embedding (source), forward and backward emb = self.tparams['Wemb_enc'][x.flatten()] emb = emb.reshape([n_timesteps, n_samples, self.embedding_dim]) embr = self.tparams['Wemb_enc'][xr.flatten()] embr = embr.reshape([n_timesteps, n_samples, self.embedding_dim]) # encoder proj = get_new_layer(self.enc_type)[1](self.tparams, emb, prefix='encoder', layernorm=self.lnorm) projr = get_new_layer(self.enc_type)[1](self.tparams, embr, prefix='encoder_r', layernorm=self.lnorm) # concatenate forward and backward rnn hidden states ctx = [ tensor.concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1) ] for i in range(1, self.n_enc_layers): ctx = get_new_layer(self.enc_type)[1](self.tparams, ctx[0], prefix='deepencoder_%d' % i, layernorm=self.lnorm) ctx = ctx[0] if self.init_cgru == 'text' and 'ff_state_W' in self.tparams: # get the input for decoder rnn initializer mlp ctx_mean = ctx.mean(0) init_state = get_new_layer('ff')[1](self.tparams, ctx_mean, prefix='ff_state', activ='tanh') else: # assume zero-initialized decoder init_state = tensor.alloc(0., n_samples, self.rnn_dim) outs = [init_state, ctx] self.f_init = theano.function([x], outs, name='f_init') # x: 1 x 1 y1 = tensor.vector('y1_sampler', dtype=INT) y2 = tensor.vector('y2_sampler', dtype=INT) init_state = tensor.matrix('init_state', dtype=FLOAT) # if it's the first word, emb should be all zero and it is indicated by -1 emb_lem = tensor.switch( y1[:, None] < 0, tensor.alloc(0., 1, self.tparams['Wemb_dec_lem'].shape[1]), self.tparams['Wemb_dec_lem'][y1]) emb_fact = tensor.switch( y2[:, None] < 0, tensor.alloc(0., 1, self.tparams['Wemb_dec_fact'].shape[1]), self.tparams['Wemb_dec_fact'][y2]) # Concat the 2 embeddings emb_prev = tensor.concatenate([emb_lem, emb_fact], axis=1) # apply one step of conditional gru with attention # get the next hidden states # get the weighted averages of contexts for this target word y r = get_new_layer('gru_cond')[1](self.tparams, emb_prev, prefix='decoder', mask=None, context=ctx, one_step=True, init_state=init_state, layernorm=False) next_state = r[0] ctxs = r[1] alphas = r[2] logit_lem = get_new_layer('ff')[1](self.tparams, emb_lem, prefix='ff_logit_lem', activ='linear') logit_fact = get_new_layer('ff')[1](self.tparams, emb_fact, prefix='ff_logit_fact', activ='linear') logit_ctx = get_new_layer('ff')[1](self.tparams, ctxs, prefix='ff_logit_ctx', activ='linear') logit_gru = get_new_layer('ff')[1](self.tparams, next_state, prefix='ff_logit_gru', activ='linear') logit1 = tanh(logit_gru + logit_lem + logit_ctx) logit2 = tanh(logit_gru + logit_fact + logit_ctx) if self.tied_trg_emb is False: logit = get_new_layer('ff')[1](self.tparams, logit1, prefix='ff_logit', activ='linear') logit_trgmult = get_new_layer('ff')[1](self.tparams, logit2, prefix='ff_logit_trgmult', activ='linear') else: logit_trg = tensor.dot(logit1, self.tparams['Wemb_dec_lem'].T) logit_trgmult = tensor.dot(logit2, self.tparams['Wemb_dec_fact'].T) # compute the logsoftmax next_log_probs_trg = tensor.nnet.logsoftmax(logit_trg) next_log_probs_trgmult = tensor.nnet.logsoftmax(logit_trgmult) # Sample from the softmax distribution next_probs_trg = tensor.exp(next_log_probs_trg) next_probs_trgmult = tensor.exp(next_log_probs_trgmult) next_word_trg = self.trng.multinomial(pvals=next_probs_trg).argmax(1) next_word_trgmult = self.trng.multinomial( pvals=next_probs_trgmult).argmax(1) # NOTE: We never use sampling and it 
incurs performance penalty # let's disable it for now #next_word = self.trng.multinomial(pvals=next_probs).argmax(1) # compile a function to do the whole thing above # next hidden state to be used inputs = [y1, y2, init_state, ctx] outs = [next_log_probs_trg, next_log_probs_trgmult, next_state, alphas] self.f_next = theano.function(inputs, outs, name='f_next')
def gru_decoder_multi(tparams, state_below, ctx1, ctx2, prefix='gru_decoder_multi', input_mask=None, one_step=False, init_state=None, ctx1_mask=None): if one_step: assert init_state, 'previous state must be provided' # Context # n_timesteps x n_samples x ctxdim assert ctx1 and ctx2, 'Contexts must be provided' assert ctx1.ndim == 3 and ctx2.ndim == 3, 'Contexts must be 3-d: #annotation x #sample x dim' # Number of padded source timesteps nsteps = state_below.shape[0] # Batch or single sample? n_samples = state_below.shape[1] if state_below.ndim == 3 else 1 # if we have no mask, we assume all the inputs are valid # tensor.alloc(value, *shape) # input_mask: (n_steps, 1) filled with 1 if input_mask is None: input_mask = tensor.alloc(1., nsteps, 1) # Infer RNN dimensionality dim = tparams[pp(prefix, 'Wcx')].shape[1] # initial/previous state # if not given, assume it's all zeros if init_state is None: init_state = tensor.alloc(0., n_samples, dim) # These two dot products are same with gru_layer, refer to the equations. # [W_r * X + b_r, W_z * X + b_z] state_below_ = tensor.dot(state_below, tparams[pp(prefix, 'W')]) + tparams[pp(prefix, 'b')] # input to compute the hidden state proposal # This is the [W*x]_j in the eq. 8 of the paper state_belowx = tensor.dot(state_below, tparams[pp(prefix, 'Wx')]) + tparams[pp(prefix, 'bx')] # Wc_att: dimctx -> dimctx # Linearly transform the contexts to another space with same dimensionality pctx1_ = tensor.dot(ctx1, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')] pctx2_ = tensor.dot(ctx2, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')] # Step function for the recurrence/scan # Sequences # --------- # m_ : mask # x_ : state_below_ # xx_ : state_belowx # outputs_info # ------------ # h_ : init_state, # ctx_ : need to be defined as it's returned by _step # alpha1_: need to be defined as it's returned by _step # alpha2_: need to be defined as it's returned by _step # non sequences # ------------- # pctx1_ : pctx1_ # pctx2_ : pctx2_ # cc1_ : ctx1 # cc2_ : ctx2 # and all the shared weights and biases.. def _step(m_, x_, xx_, h_, ctx_, alpha1_, alpha2_, # These ctx and alpha's are not used in the computations pctx1_, pctx2_, cc1_, cc2_, U, Wc, W_comb_att, U_att, c_att, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl): # Do a step of classical GRU h1 = gru_step(m_, x_, xx_, h_, U, Ux) ########### # Attention ########### # h1 X W_comb_att # W_comb_att: dim -> dimctx # pstate_ should be 2D as we're working with unrolled timesteps pstate_ = tensor.dot(h1, W_comb_att) # Accumulate in pctx*__ and apply tanh() # This becomes the projected context(s) + the current hidden state # of the decoder, e.g. this is the information accumulating # into the returned original contexts with the knowledge of target # sentence decoding. pctx1__ = tanh(pctx1_ + pstate_[None, :, :]) pctx2__ = tanh(pctx2_ + pstate_[None, :, :]) # Affine transformation for alpha* = (pctx*__ X U_att) + c_att # We're now down to scalar alpha's for each accumulated # context (0th dim) in the pctx*__ # alpha1 should be n_timesteps, 1, 1 alpha1 = tensor.dot(pctx1__, U_att) + c_att alpha2 = tensor.dot(pctx2__, U_att) + c_att # Drop the last dimension, e.g. 
(n_timesteps, 1) alpha1 = alpha1.reshape([alpha1.shape[0], alpha1.shape[1]]) alpha2 = alpha2.reshape([alpha2.shape[0], alpha2.shape[1]]) # Exponentiate alpha1 alpha1 = tensor.exp(alpha1 - alpha1.max(0, keepdims=True)) alpha2 = tensor.exp(alpha2 - alpha2.max(0, keepdims=True)) # If there is a context mask, multiply with it to cancel unnecessary steps # We won't have a ctx_mask for image vectors if ctx1_mask: alpha1 = alpha1 * ctx1_mask # Normalize so that the sum makes 1 alpha1 = alpha1 / alpha1.sum(0, keepdims=True) alpha2 = alpha2 / alpha2.sum(0, keepdims=True) # Compute the current context ctx*_ as the alpha-weighted sum of # the initial contexts ctx*'s ctx1_ = (cc1_ * alpha1[:, :, None]).sum(0) ctx2_ = (cc2_ * alpha2[:, :, None]).sum(0) # n_samples x ctxdim (2000) # Sum of contexts ctx_ = tanh(ctx1_ + ctx2_) ############################################ # ctx*_ and alpha computations are completed ############################################ #################################### # The below code is another GRU cell #################################### # Affine transformation: h1 X U_nl + b_nl # U_nl, b_nl: Stacked dim*2 preact = tensor.dot(h1, U_nl) + b_nl # Transform the weighted context sum with Wc # and add it to preact # Wc: dimctx -> Stacked dim*2 preact += tensor.dot(ctx_, Wc) # Apply sigmoid nonlinearity preact = sigmoid(preact) # Slice activations: New gates r2 and u2 r2 = tensor_slice(preact, 0, dim) u2 = tensor_slice(preact, 1, dim) preactx = (tensor.dot(h1, Ux_nl) + bx_nl) * r2 preactx += tensor.dot(ctx_, Wcx) # Candidate hidden h2_tilda = tanh(preactx) # Leaky integration between the new h2 and the # old h1 computed in line 285 h2 = u2 * h2_tilda + (1. - u2) * h1 h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1 return h2, ctx_, alpha1.T, alpha2.T # Sequences are the input mask and the transformed target embeddings seqs = [input_mask, state_below_, state_belowx] # Create a list of shared parameters for easy parameter passing shared_vars = [tparams[pp(prefix, 'U')], tparams[pp(prefix, 'Wc')], tparams[pp(prefix, 'W_comb_att')], tparams[pp(prefix, 'U_att')], tparams[pp(prefix, 'c_att')], tparams[pp(prefix, 'Ux')], tparams[pp(prefix, 'Wcx')], tparams[pp(prefix, 'U_nl')], tparams[pp(prefix, 'Ux_nl')], tparams[pp(prefix, 'b_nl')], tparams[pp(prefix, 'bx_nl')]] if one_step: rval = _step(*(seqs + [init_state, None, None, None, pctx1_, pctx2_, ctx1, ctx2] + shared_vars)) else: outputs_info=[init_state, tensor.alloc(0., n_samples, ctx1.shape[2]), # ctxdim (ctx_) tensor.alloc(0., n_samples, ctx1.shape[0]), # n_timesteps (alpha1) tensor.alloc(0., n_samples, ctx2.shape[0])] # n_timesteps (alpha2) rval, updates = theano.scan(_step, sequences=seqs, outputs_info=outputs_info, non_sequences=[pctx1_, pctx2_, ctx1, ctx2] + shared_vars, name=pp(prefix, '_layers'), n_steps=nsteps, strict=True) return rval
# -*- coding: utf-8 -*-
"""
theano.tensor.alloc(value, *shape): creates a symbolic tensor whose
dimensions are given by shape, with every entry filled with value.
"""
import numpy as np
import theano
import theano.tensor as T

X = T.matrix()
e = T.alloc(1, 4, 3)            # 4x3 tensor filled with 1
p = theano.function([X], e + X)

a = np.random.rand(4, 3).astype('float32')
print a
print p(a)
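# Small follow-up example (illustrative names, not from the snippets below):
# T.alloc can also broadcast an existing vector, which is how the RNN snippets
# in this collection tile an initial hidden state h0 over the batch dimension,
# e.g. T.alloc(self.h0, batch_size, n_hidden).
import numpy as np
import theano
import theano.tensor as T

n_hidden = 3
h0 = theano.shared(np.arange(n_hidden, dtype='float32'), name='h0')
batch_size = T.iscalar('batch_size')

# (batch_size, n_hidden) tensor whose every row is h0
H0 = T.alloc(h0, batch_size, n_hidden)
f = theano.function([batch_size], H0)
print f(4)   # 4 identical rows [0., 1., 2.]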
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None, **kwargs): nsteps = state_below.shape[0] dim = tparams[_p(prefix, 'U')].shape[0] # if we are dealing with a mini-batch if state_below.ndim == 3: n_samples = state_below.shape[1] init_state = tensor.alloc(0., n_samples, dim) init_memory = tensor.alloc(0., n_samples, dim) # during sampling else: n_samples = 1 init_state = tensor.alloc(0., dim) init_memory = tensor.alloc(0., dim) # if we have no mask, we assume all the inputs are valid if mask == None: mask = tensor.alloc(1., state_below.shape[0], 1) # use the slice to calculate all the different gates def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] elif _x.ndim == 2: return _x[:, n * dim:(n + 1) * dim] return _x[n * dim:(n + 1) * dim] # one time step of the lstm def _step(m_, x_, h_, c_): preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) preact += x_ i = tensor.nnet.sigmoid(_slice(preact, 0, dim)) f = tensor.nnet.sigmoid(_slice(preact, 1, dim)) o = tensor.nnet.sigmoid(_slice(preact, 2, dim)) c = tensor.tanh(_slice(preact, 3, dim)) c = f * c_ + i * c c = m_[:, None] * c + (1. - m_)[:, None] * c_ h = o * tensor.tanh(c) h = m_[:, None] * h + (1. - m_)[:, None] * h_ return h, c, i, f, o, preact state_below = tensor.dot(state_below, tparams[_p( prefix, 'W')]) + tparams[_p(prefix, 'b')] rval, updates = theano.scan( _step, sequences=[mask, state_below], outputs_info=[init_state, init_memory, None, None, None, None], name=_p(prefix, '_layers'), n_steps=nsteps, profile=False) return rval
# one step function that will be used by scan
def oneStep(x_t, h_tm1, W_x, W_h, W_o):
    h_t = tensor.tanh(tensor.dot(x_t, W_x) + tensor.dot(h_tm1, W_h))
    o_t = tensor.dot(h_t, W_o)
    return h_t, o_t

# spawn theano tensor variable, our symbolic input
# a 3D tensor (n_steps, n_samples, dim)
x = tensor.tensor3(dtype='float32')

# initial state of our rnn
init_state = tensor.alloc(0., n_samples, dim)

# create parameters that we will use,
# note that, parameters are theano shared variables
# parameters for input to hidden states
W_x_ = numpy.random.randn(input_dim, dim).astype('float32')
W_x = theano.shared(W_x_)
# parameters for hidden state transition
W_h_ = numpy.random.randn(dim, dim).astype('float32')
W_h = theano.shared(W_h_)
# parameters from hidden state to output
W_o_ = numpy.random.randn(dim, output_dim).astype('float32')
W_o = theano.shared(W_o_)
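# The fragment above stops just before the scan itself. A hedged completion is
# sketched below: the sizes are assumptions (the fragment never defines
# n_samples, dim, input_dim, output_dim), everything else follows the
# oneStep / init_state / W_* definitions above.
import numpy
import theano
from theano import tensor

n_steps, n_samples, input_dim, dim, output_dim = 7, 5, 4, 6, 2

def oneStep(x_t, h_tm1, W_x, W_h, W_o):
    h_t = tensor.tanh(tensor.dot(x_t, W_x) + tensor.dot(h_tm1, W_h))
    o_t = tensor.dot(h_t, W_o)
    return h_t, o_t

x = tensor.tensor3(dtype='float32')
init_state = tensor.alloc(numpy.float32(0.), n_samples, dim)

W_x = theano.shared(numpy.random.randn(input_dim, dim).astype('float32'))
W_h = theano.shared(numpy.random.randn(dim, dim).astype('float32'))
W_o = theano.shared(numpy.random.randn(dim, output_dim).astype('float32'))

# iterate oneStep over the leading (time) axis of x;
# only h is fed back, o is collected but not recurrent
[h, o], _ = theano.scan(fn=oneStep,
                        sequences=[x],
                        outputs_info=[init_state, None],
                        non_sequences=[W_x, W_h, W_o],
                        n_steps=x.shape[0])

f = theano.function([x], [h, o])
data = numpy.random.randn(n_steps, n_samples, input_dim).astype('float32')
h_val, o_val = f(data)
print h_val.shape, o_val.shape   # (7, 5, 6) (7, 5, 2)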
def build(self): # description string: #words x #samples x = tensor.matrix('x', dtype=INT) x_mask = tensor.matrix('x_mask', dtype=FLOAT) y1 = tensor.matrix('y1', dtype=INT) y1_mask = tensor.matrix('y1_mask', dtype=FLOAT) y2 = tensor.matrix('y2', dtype=INT) y2_mask = tensor.matrix('y2_mask', dtype=FLOAT) self.inputs = OrderedDict() self.inputs['x'] = x self.inputs['x_mask'] = x_mask self.inputs['y1'] = y1 self.inputs['y2'] = y2 self.inputs['y1_mask'] = y1_mask self.inputs['y2_mask'] = y2_mask # for the backward rnn, we just need to invert x and x_mask xr = x[::-1] xr_mask = x_mask[::-1] n_timesteps = x.shape[0] n_timesteps_trg = y1.shape[0] n_timesteps_trgmult = y2.shape[0] n_samples = x.shape[1] # word embedding for forward rnn (source) emb = dropout(self.tparams['Wemb_enc'][x.flatten()], self.trng, self.emb_dropout, self.use_dropout) emb = emb.reshape([n_timesteps, n_samples, self.embedding_dim]) proj = get_new_layer(self.enc_type)[1](self.tparams, emb, prefix='encoder', mask=x_mask, layernorm=self.lnorm) # word embedding for backward rnn (source) embr = dropout(self.tparams['Wemb_enc'][xr.flatten()], self.trng, self.emb_dropout, self.use_dropout) embr = embr.reshape([n_timesteps, n_samples, self.embedding_dim]) projr = get_new_layer(self.enc_type)[1](self.tparams, embr, prefix='encoder_r', mask=xr_mask, layernorm=self.lnorm) # context will be the concatenation of forward and backward rnns ctx = [ tensor.concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1) ] for i in range(1, self.n_enc_layers): ctx = get_new_layer(self.enc_type)[1](self.tparams, ctx[0], prefix='deepencoder_%d' % i, mask=x_mask, layernorm=self.lnorm) # Apply dropout ctx = dropout(ctx[0], self.trng, self.ctx_dropout, self.use_dropout) if self.init_cgru == 'text': # mean of the context (across time) will be used to initialize decoder rnn ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] init_state = get_new_layer('ff')[1](self.tparams, ctx_mean, prefix='ff_state', activ='tanh') else: # Assume zero-initialized decoder init_state = tensor.alloc(0., n_samples, self.rnn_dim) # word embedding (target), we will shift the target sequence one time step # to the right. This is done because of the bi-gram connections in the # readout and decoder rnn. 
The first target will be all zeros and we will emb_lem = self.tparams['Wemb_dec_lem'][y1.flatten()] emb_lem = emb_lem.reshape( [n_timesteps_trg, n_samples, self.embedding_dim]) emb_lem_shifted = tensor.zeros_like(emb_lem) emb_lem_shifted = tensor.set_subtensor(emb_lem_shifted[1:], emb_lem[:-1]) emb_lem = emb_lem_shifted emb_fact = self.tparams['Wemb_dec_fact'][y2.flatten()] emb_fact = emb_fact.reshape( [n_timesteps_trgmult, n_samples, self.embedding_dim]) emb_fact_shifted = tensor.zeros_like(emb_fact) emb_fact_shifted = tensor.set_subtensor(emb_fact_shifted[1:], emb_fact[:-1]) emb_fact = emb_fact_shifted # Concat the 2 embeddings emb_prev = tensor.concatenate([emb_lem, emb_fact], axis=2) # decoder - pass through the decoder conditional gru with attention proj = get_new_layer('gru_cond')[1](self.tparams, emb_prev, prefix='decoder', mask=y1_mask, context=ctx, context_mask=x_mask, one_step=False, init_state=init_state, layernorm=False) # hidden states of the decoder gru proj_h = proj[0] # weighted averages of context, generated by attention module ctxs = proj[1] # weights (alignment matrix) self.alphas = proj[2] # compute word probabilities logit_gru = get_new_layer('ff')[1](self.tparams, proj_h, prefix='ff_logit_gru', activ='linear') logit_ctx = get_new_layer('ff')[1](self.tparams, ctxs, prefix='ff_logit_ctx', activ='linear') logit_lem = get_new_layer('ff')[1](self.tparams, emb_lem, prefix='ff_logit_lem', activ='linear') logit_fact = get_new_layer('ff')[1](self.tparams, emb_fact, prefix='ff_logit_fact', activ='linear') logit1 = dropout(tanh(logit_gru + logit_lem + logit_ctx), self.trng, self.out_dropout, self.use_dropout) logit2 = dropout(tanh(logit_gru + logit_fact + logit_ctx), self.trng, self.out_dropout, self.use_dropout) if self.tied_trg_emb is False: logit_trg = get_new_layer('ff')[1](self.tparams, logit1, prefix='ff_logit_trg', activ='linear') logit_trgmult = get_new_layer('ff')[1](self.tparams, logit2, prefix='ff_logit_trgmult', activ='linear') else: logit_trg = tensor.dot(logit1, self.tparams['Wemb_dec_lem'].T) logit_trgmult = tensor.dot(logit2, self.tparams['Wemb_dec_fact'].T) logit_trg_shp = logit_trg.shape logit_trgmult_shp = logit_trgmult.shape # Apply logsoftmax (stable version) log_trg_probs = -tensor.nnet.logsoftmax( logit_trg.reshape( [logit_trg_shp[0] * logit_trg_shp[1], logit_trg_shp[2]])) log_trgmult_probs = -tensor.nnet.logsoftmax( logit_trgmult.reshape([ logit_trgmult_shp[0] * logit_trgmult_shp[1], logit_trgmult_shp[2] ])) # cost y1_flat = y1.flatten() y2_flat = y2.flatten() y1_flat_idx = tensor.arange( y1_flat.shape[0]) * self.n_words_trg1 + y1_flat y2_flat_idx = tensor.arange( y2_flat.shape[0]) * self.n_words_trg2 + y2_flat cost_trg = log_trg_probs.flatten()[y1_flat_idx] cost_trg = cost_trg.reshape([n_timesteps_trg, n_samples]) cost_trg = (cost_trg * y1_mask).sum(0) cost_trgmult = log_trgmult_probs.flatten()[y2_flat_idx] cost_trgmult = cost_trgmult.reshape([n_timesteps_trgmult, n_samples]) cost_trgmult = (cost_trgmult * y2_mask).sum(0) cost = cost_trg + cost_trgmult self.f_log_probs = theano.function(list(self.inputs.values()), cost) # For alpha regularization return cost
def ReplicateLayer(x, n_times):
    a = T.shape_padleft(x)
    padding = [1] * x.ndim
    b = T.alloc(numpy.float32(1), n_times, *padding)
    return a * b
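# Illustrative usage of ReplicateLayer (assumes the function defined just
# above; the variable names below are made up for the example). The function
# prepends a new axis of size n_times by multiplying a shape-padded view of x
# with an (n_times, 1, ..., 1) tensor of ones, so broadcasting does the copy.
import numpy
import theano
import theano.tensor as T

v = T.fvector('v')
rep = ReplicateLayer(v, 4)
f = theano.function([v], rep)
print f(numpy.arange(3, dtype='float32')).shape   # (4, 3)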
def recurrent_apply(brick, application, application_call, *args, **kwargs): """Iterates a transition function. Parameters ---------- iterate : bool If ``True`` iteration is made. By default ``True``. reverse : bool If ``True``, the sequences are processed in backward direction. ``False`` by default. return_initial_states : bool If ``True``, initial states are included in the returned state tensors. ``False`` by default. .. todo:: * Handle `updates` returned by the :func:`theano.scan` routine. * ``kwargs`` has a random order; check if this is a problem. """ # Extract arguments related to iteration and immediately relay the # call to the wrapped function if `iterate=False` iterate = kwargs.pop('iterate', True) if not iterate: return application_function(brick, *args, **kwargs) reverse = kwargs.pop('reverse', False) return_initial_states = kwargs.pop('return_initial_states', False) # Push everything to kwargs for arg, arg_name in zip(args, arg_names): kwargs[arg_name] = arg # Separate sequences, states and contexts scan_arguments = (application.sequences + application.states + application.contexts) # Check what is given and what is not def only_given(arg_names): return OrderedDict((arg_name, kwargs[arg_name]) for arg_name in arg_names if kwargs.get(arg_name)) sequences_given = only_given(application.sequences) contexts_given = only_given(application.contexts) # TODO Assumes 1 time dim! if len(sequences_given): shape = list(sequences_given.values())[0].shape if not iterate: batch_size = shape[0] else: n_steps = shape[0] batch_size = shape[1] else: # TODO Raise error if n_steps and batch_size not found? n_steps = kwargs.pop('n_steps') batch_size = kwargs.pop('batch_size') # Handle the rest kwargs rest_kwargs = { key: value for key, value in kwargs.items() if key not in scan_arguments } for value in rest_kwargs.values(): if (isinstance(value, Variable) and not is_shared_variable(value)): warnings.warn( 'Your function uses a non-shared variable other than' ' those given by scan explicitly. That can' ' significantly slow down `tensor.grad` call.' ' Did you forget to declare it in `contexts`?') # Ensure that all initial states are available. for state_name in application.states: dim = brick.get_dim(state_name) if state_name in kwargs: if isinstance(kwargs[state_name], NdarrayInitialization): kwargs[state_name] = tensor.alloc( kwargs[state_name].generate(brick.rng, (1, dim)), batch_size, dim) elif isinstance(kwargs[state_name], Application): kwargs[state_name] = \ kwargs[state_name](state_name, batch_size, *args, **kwargs) else: # TODO init_func returns 2D-tensor, fails for iterate=False kwargs[state_name] = \ brick.initial_state(state_name, batch_size, *args, **kwargs) assert kwargs[state_name] states_given = only_given(application.states) assert len(states_given) == len(application.states) # Theano issue 1772 for name, state in states_given.items(): states_given[name] = tensor.unbroadcast( state, *range(state.ndim)) def scan_function(*args): args = list(args) arg_names = (list(sequences_given) + list(states_given) + list(contexts_given)) kwargs = dict(zip(arg_names, args)) kwargs.update(rest_kwargs) outputs = getattr(brick, application_function.__name__)(iterate=False, **kwargs) # We want to save the computation graph returned by the # `application_function` when it is called inside the # `theano.scan`. 
application_call.inner_inputs = args application_call.inner_outputs = pack(outputs) return outputs outputs_info = ( list(states_given.values()) + [None] * (len(application.outputs) - len(application.states))) result, updates = theano.scan( scan_function, sequences=list(sequences_given.values()), outputs_info=outputs_info, non_sequences=list(contexts_given.values()), n_steps=n_steps, go_backwards=reverse) result = pack(result) if return_initial_states: # Undo Subtensor for i in range(len(states_given)): assert isinstance(result[i].owner.op, tensor.subtensor.Subtensor) result[i] = result[i].owner.inputs[0] if updates: application_call.updates = dict_union(application_call.updates, updates) return result
def lstm_cond_layer(tparams, state_below, options, prefix='lstm', mask=None, init_memory=None, init_state=None, trng=None, use_noise=None, **kwargs): """ Computation graph for the LSTM. Note that we removed 'context' and put this into 'state_below' Video frames need to be part of scan, since it changes each step """ nsteps = state_below.shape[0] n_samples = state_below.shape[1] n_annotations = state_below.shape[2] # mask if mask == None: mask = tensor.alloc(1., state_below.shape[0], 1) dim = tparams[_p(prefix, 'U')].shape[0] # initial/previous state if init_state == None: init_state = tensor.alloc(0., n_samples, dim) # initial/previous memory if init_memory == None: init_memory = tensor.alloc(0., n_samples, dim) def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] def _step(m_, x_, h_, c_, a_, ct_, dp_=None, dp_att_=None): # mask, xt, ht-1, ct-1, alpha, ctx # attention # print '\n\ncheck\n\n' pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')]) # pstate_ pctx_ = tensor.dot(x_, tparams[_p(prefix, 'Wc_att')]) + tparams[_p( prefix, 'b_att')] if options['n_layers_att'] > 1: for lidx in xrange(1, options['n_layers_att']): pctx_ = tensor.dot(pctx_, tparams[_p( prefix, 'W_att_%d' % lidx)]) + tparams[_p( prefix, 'b_att_%d' % lidx)] if lidx < options['n_layers_att'] - 1: pctx_ = tanh(pctx_) pctx_ = pctx_ + pstate_[:, None, :] pctx_list = [] pctx_list.append(pctx_) pctx_ = tanh(pctx_) alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p( prefix, 'c_tt')] alpha_pre = alpha alpha_shp = alpha.shape alpha = tensor.nnet.softmax( options['temperature_inverse'] * alpha.reshape([alpha_shp[0], alpha_shp[1]])) # softmax ctx_ = (x_ * alpha[:, :, None]).sum(1) # current context # print '\n\ncheck\n\n' if options['selector']: sel_ = tensor.nnet.sigmoid( tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) + tparams[_p(prefix, 'b_sel')]) sel_ = sel_.reshape([sel_.shape[0]]) ctx_ = sel_[:, None] * ctx_ preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) preact += tensor.dot(ctx_, tparams[_p(prefix, 'W')]) + tparams[_p( prefix, 'b')] i = _slice(preact, 0, dim) # z_it f = _slice(preact, 1, dim) # z_ft o = _slice(preact, 2, dim) # z_ot i = tensor.nnet.sigmoid(i) # it = sigmoid(z_it) f = tensor.nnet.sigmoid(f) # ft = sigmoid(z_ft) o = tensor.nnet.sigmoid(o) # ot = sigmoid(z_ot) c = tensor.tanh(_slice(preact, 3, dim)) # at = tanh(z_at) c = f * c_ + i * c # ct = ft * ct-1 + it * at c = m_[:, None] * c + (1. - m_)[:, None] * c_ h = o * tensor.tanh(c) # ht = ot * thanh(ct) h = m_[:, None] * h + (1. - m_)[:, None] * h_ rval = [h, c, alpha, ctx_] if options['selector']: rval += [sel_] rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list # print '\n\ncheck\n\n' return rval if options['selector']: _step0 = lambda m_, x_, h_, c_, a_, ct_, sel_: _step( m_, x_, h_, c_, a_, ct_) else: _step0 = lambda m_, x_, h_, c_, a_, ct_: _step(m_, x_, h_, c_, a_, ct_) seqs = [mask, state_below] outputs_info = [ init_state, init_memory, tensor.alloc(0., n_samples, n_annotations), tensor.alloc(0., n_samples, options['ctx_dim']) ] if options['selector']: outputs_info += [tensor.alloc(0., n_samples)] outputs_info += [None, None, None, None, None, None, None ] + [None] #*options['n_layers_att'] rval, updates = theano.scan(_step0, sequences=seqs, outputs_info=outputs_info, name=_p(prefix, '_layers'), n_steps=nsteps, profile=False) return rval
def sample_level_rnn(input_sequences, h0, reset): """ input_sequences.shape: (batch size, seq len) h0.shape: (batch size, N_GRUS, DIM) reset.shape: () output.shape: (batch size, seq len, Q_LEVELS) """ learned_h0 = lib.param( 'SampleLevel.h0', numpy.zeros((N_GRUS, DIM), dtype=theano.config.floatX) ) learned_h0 = T.alloc(learned_h0, h0.shape[0], N_GRUS, DIM) h0 = theano.ifelse.ifelse(reset, learned_h0, h0) # Embedded inputs ################# FRAME_SIZE = Q_LEVELS frames = lib.ops.Embedding('SampleLevel.Embedding', Q_LEVELS, Q_LEVELS, input_sequences) # Real-valued inputs #################### # 'frames' of size 1 # FRAME_SIZE = 1 # frames = input_sequences.reshape(( # input_sequences.shape[0], # input_sequences.shape[1], # 1 # )) # # Rescale frames from ints in [0, Q_LEVELS) to floats in [-2, 2] # # (a reasonable range to pass as inputs to the RNN) # frames = (frames.astype('float32') / lib.floatX(Q_LEVELS/2)) - lib.floatX(1) # frames *= lib.floatX(2) gru0 = lib.ops.LowMemGRU('SampleLevel.GRU0', FRAME_SIZE, DIM, frames, h0=h0[:, 0]) # gru0 = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU0FF', DIM, DIM, gru0, initialization='he')) grus = [gru0] for i in xrange(1, N_GRUS): gru = lib.ops.LowMemGRU('SampleLevel.GRU'+str(i), DIM, DIM, grus[-1], h0=h0[:, i]) # gru = T.nnet.relu(lib.ops.Linear('SampleLevel.GRU'+str(i)+'FF', DIM, DIM, gru, initialization='he')) grus.append(gru) # We apply the softmax later output = lib.ops.Linear( 'Output', N_GRUS*DIM, Q_LEVELS, T.concatenate(grus, axis=2) ) # output = lib.ops.Linear( # 'Output', # DIM, # Q_LEVELS, # grus[-1] # ) last_hidden = T.stack([gru[:,-1] for gru in grus], axis=1) return (output, last_hidden)
def max_pool_b01c(z, pool_shape, top_down=None, theano_rng=None): """ .. todo:: WRITEME properly An implementation of max_pool but where all 4-tensors use the ('b', 0, 1, 'c') format. """ z_name = z.name if z_name is None: z_name = 'anon_z' batch_size, zr, zc, ch = z.shape r, c = pool_shape zpart = [] mx = None if top_down is None: t = 0. else: t = -top_down for i in xrange(r): zpart.append([]) for j in xrange(c): cur_part = z[:, i:zr:r, j:zc:c, :] if z_name is not None: cur_part.name = z_name + '[%d, %d]' % (i, j) zpart[i].append(cur_part) if mx is None: mx = T.maximum(t, cur_part) if cur_part.name is not None: mx.name = 'max(-top_down,' + cur_part.name + ')' else: max_name = None if cur_part.name is not None: mx_name = 'max(' + cur_part.name + ',' + mx.name + ')' mx = T.maximum(mx, cur_part) mx.name = mx_name mx.name = 'local_max(' + z_name + ')' pt = [] for i in xrange(r): pt.append([]) for j in xrange(c): z_ij = zpart[i][j] safe = z_ij - mx safe.name = 'safe_z(%s)' % z_ij.name cur_pt = T.exp(safe) cur_pt.name = 'pt(%s)' % z_ij.name pt[-1].append(cur_pt) off_pt = T.exp(t - mx) off_pt.name = 'p_tilde_off(%s)' % z_name denom = off_pt for i in xrange(r): for j in xrange(c): denom = denom + pt[i][j] denom.name = 'denom(%s)' % z_name off_prob = off_pt / denom p = 1. - off_prob p.name = 'p(%s)' % z_name hpart = [] for i in xrange(r): hpart.append([pt_ij / denom for pt_ij in pt[i]]) h = T.alloc(0., batch_size, zr, zc, ch) for i in xrange(r): for j in xrange(c): h = T.set_subtensor(h[:, i:zr:r, j:zc:c, :], hpart[i][j]) h.name = 'h(%s)' % z_name if theano_rng is None: return p, h else: events = [] for i in xrange(r): for j in xrange(c): events.append(hpart[i][j]) events.append(off_prob) events = [event.dimshuffle(0, 1, 2, 3, 'x') for event in events] events = tuple(events) stacked_events = T.concatenate(events, axis=4) batch_size, rows, cols, channels, outcomes = stacked_events.shape reshaped_events = stacked_events.reshape( (batch_size * rows * cols * channels, outcomes)) multinomial = theano_rng.multinomial(pvals=reshaped_events, dtype=p.dtype) reshaped_multinomial = multinomial.reshape( (batch_size, rows, cols, channels, outcomes)) h_sample = T.alloc(0., batch_size, zr, zc, ch) idx = 0 for i in xrange(r): for j in xrange(c): h_sample = T.set_subtensor( h_sample[:, i:zr:r, j:zc:c, :], reshaped_multinomial[:, :, :, :, idx]) idx += 1 p_sample = 1 - reshaped_multinomial[:, :, :, :, -1] return p, h, p_sample, h_sample
def __init__(self, input, n_in, n_hidden, n_out, activation=T.tanh, output_type='real'): self.input = input self.activation = activation self.output_type = output_type self.batch_size = T.iscalar() # theta is a vector of all trainable parameters # it represents the value of W, W_in, W_out, h0, bh, by theta_shape = n_hidden ** 2 + n_in * n_hidden + n_hidden * n_out + \ n_hidden + n_hidden + n_out self.theta = theano.shared( value=np.zeros(theta_shape, dtype=theano.config.floatX)) # Parameters are reshaped views of theta param_idx = 0 # pointer to somewhere along parameter vector # recurrent weights as a shared variable self.W = self.theta[param_idx:(param_idx + n_hidden**2)].reshape( (n_hidden, n_hidden)) self.W.name = 'W' W_init = np.asarray(np.random.uniform(size=(n_hidden, n_hidden), low=-0.01, high=0.01), dtype=theano.config.floatX) param_idx += n_hidden**2 # input to hidden layer weights self.W_in = self.theta[param_idx:(param_idx + n_in * \ n_hidden)].reshape((n_in, n_hidden)) self.W_in.name = 'W_in' W_in_init = np.asarray(np.random.uniform(size=(n_in, n_hidden), low=-0.01, high=0.01), dtype=theano.config.floatX) param_idx += n_in * n_hidden # hidden to output layer weights self.W_out = self.theta[param_idx:(param_idx + n_hidden * \ n_out)].reshape((n_hidden, n_out)) self.W_out.name = 'W_out' W_out_init = np.asarray(np.random.uniform(size=(n_hidden, n_out), low=-0.01, high=0.01), dtype=theano.config.floatX) param_idx += n_hidden * n_out self.h0 = self.theta[param_idx:(param_idx + n_hidden)] self.h0.name = 'h0' h0_init = np.zeros((n_hidden, ), dtype=theano.config.floatX) param_idx += n_hidden self.bh = self.theta[param_idx:(param_idx + n_hidden)] self.bh.name = 'bh' bh_init = np.zeros((n_hidden, ), dtype=theano.config.floatX) param_idx += n_hidden self.by = self.theta[param_idx:(param_idx + n_out)] self.by.name = 'by' by_init = np.zeros((n_out, ), dtype=theano.config.floatX) param_idx += n_out assert (param_idx == theta_shape) # for convenience self.params = [ self.W, self.W_in, self.W_out, self.h0, self.bh, self.by ] # shortcut to norms (for monitoring) self.l2_norms = {} for param in self.params: self.l2_norms[param] = T.sqrt(T.sum(param**2)) # initialize parameters # DEBUG_MODE gives division by zero error when we leave parameters # as zeros self.theta.set_value( np.concatenate([ x.ravel() for x in (W_init, W_in_init, W_out_init, h0_init, bh_init, by_init) ])) self.theta_update = theano.shared( value=np.zeros(theta_shape, dtype=theano.config.floatX)) # recurrent function (using tanh activation function) and linear output # activation function def step(x_t, h_tm1): h_t = self.activation(T.dot(x_t, self.W_in) + \ T.dot(h_tm1, self.W) + self.bh) y_t = T.dot(h_t, self.W_out) + self.by return h_t, y_t # the hidden state `h` for the entire sequence, and the output for the # entire sequence `y` (first dimension is always time) # Note the implementation of weight-sharing h0 across variable-size # batches using T.ones multiplying h0 [self.h, self.y_pred], _ = theano.scan(step, sequences=self.input, outputs_info=[ T.alloc(self.h0, self.input.shape[1], n_hidden), None ]) # outputs_info=[T.ones(shape=(self.input.shape[1], # self.h0.shape[0])) * self.h0, None]) # L1 norm ; one regularization option is to enforce L1 norm to # be small self.L1 = 0 self.L1 += abs(self.W.sum()) self.L1 += abs(self.W_in.sum()) self.L1 += abs(self.W_out.sum()) # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small self.L2_sqr = 0 self.L2_sqr += (self.W**2).sum() self.L2_sqr += 
(self.W_in**2).sum() self.L2_sqr += (self.W_out**2).sum() if self.output_type == 'real': self.loss = lambda y: self.mse(y) elif self.output_type == 'binary': # push through sigmoid self.p_y_given_x = T.nnet.sigmoid(self.y_pred) # apply sigmoid self.y_out = T.round(self.p_y_given_x) # round to {0,1} self.loss = lambda y: self.nll_binary(y) elif self.output_type == 'softmax': # push through softmax, computing vector of class-membership # probabilities in symbolic form # # T.nnet.softmax will not operate on T.tensor3 types, only matrices # We take our n_steps x n_seq x n_classes output from the net # and reshape it into a (n_steps * n_seq) x n_classes matrix # apply softmax, then reshape back y_p = self.y_pred y_p_m = T.reshape(y_p, (y_p.shape[0] * y_p.shape[1], -1)) y_p_s = T.nnet.softmax(y_p_m) self.p_y_given_x = T.reshape(y_p_s, y_p.shape) # compute prediction as class whose probability is maximal self.y_out = T.argmax(self.p_y_given_x, axis=-1) self.loss = lambda y: self.nll_multiclass(y) else: raise NotImplementedError
def max_pool(z, pool_shape, top_down=None, theano_rng=None): """ .. todo:: WRITEME properly z : a theano 4-tensor representing input from below pool_shape: tuple of ints. the shape of regions to be pooled top_down: (optional) a theano 4-tensor representing input from above if None, assumes top-down input is 0 theano_rng: (optional) a MRG_RandomStreams instance returns: a theano 4-tensor for the expected value of the detector layer h a theano 4-tensor for the expected value of the pooling layer p if theano_rng is not None, also returns: a theano 4-tensor of samples of the detector layer a theano 4-tensor of samples of the pooling layer all 4-tensors are formatted with axes ('b', 'c', 0, 1). This is for maximum speed when using theano's conv2d to generate z and top_down, or when using it to infer conditionals of other layers using the return values. Detailed description: Suppose you have a variable h that lives in a Conv2DSpace h_space and you want to pool it down to a variable p that lives in a smaller Conv2DSpace p. This function does that, using non-overlapping pools. Specifically, consider one channel of h. h must have a height that is a multiple of pool_shape[0] and a width that is a multiple of pool_shape[1]. A channel of h can thus be broken down into non-overlapping rectangles of shape pool_shape. Now consider one rectangular pooled region within one channel of h. I now use 'h' to refer just to this rectangle, and 'p' to refer to just the one pooling unit associated with that rectangle. We assume that the space that h and p live in is constrained such that h and p are both binary and p = max(h). To reduce the state-space in order to make probabilistic computations cheaper we also constrain sum(h) <= 1. Suppose h contains k different units. Suppose that the only term in the model's energy function involving h is -(z*h).sum() (elemwise multiplication) and the only term in the model's energy function involving p is -(top_down*p).sum(). Then P(h[i] = 1) = softmax( [ z[1], z[2], ..., z[k], -top_down] )[i] and P(p = 1) = 1-softmax( [z[1], z[2], ..., z[k], -top_down])[k] This variation of the function assumes that z, top_down, and all return values use Conv2D axes ('b', 'c', 0, 1). This variation of the function implements the softmax using a theano graph of exp, maximum, sub, and div operations. Performance notes: It might be possible to make a faster implementation with different theano ops. rather than using set_subtensor, it might be possible to use the stuff in theano.sandbox.neighbours. Probably not possible, or at least nasty, because that code isn't written with multiple channels in mind, and I don't think just a reshape can fix it. Some work on this in galatea.cond.neighbs.py At some point images2neighbs' gradient was broken so check that it has been fixed before sinking too much time into this. Stabilizing the softmax is also another source of slowness. Here it is stabilized with several calls to maximum and sub. It might also be possible to stabilize it with T.maximum(-top_down,T.signal.downsample.max_pool(z)). Don't know if that would be faster or slower. Elsewhere in this file I implemented the softmax with a reshape and call to Softmax / SoftmaxWithBias. This is slower, even though Softmax is faster on the GPU than the equivalent max/sub/exp/div graph. Maybe the reshape is too expensive. Benchmarks show that most of the time is spent in GpuIncSubtensor when running on gpu. So it is mostly that which needs a faster implementation. 
One other way to implement this would be with a linear.Conv2D.lmul_T, where the convolution stride is equal to the pool width, and the thing to multiply with is the hparts stacked along the channel axis. Unfortunately, conv2D doesn't work right with stride > 2 and is pretty slow for stride 2. Conv3D is used to mitigat some of this, but only has CPU code. """ z_name = z.name if z_name is None: z_name = 'anon_z' batch_size, ch, zr, zc = z.shape r, c = pool_shape zpart = [] mx = None if top_down is None: t = 0. else: t = -top_down t.name = 'neg_top_down' for i in xrange(r): zpart.append([]) for j in xrange(c): cur_part = z[:, :, i:zr:r, j:zc:c] if z_name is not None: cur_part.name = z_name + '[%d,%d]' % (i, j) zpart[i].append(cur_part) if mx is None: mx = T.maximum(t, cur_part) if cur_part.name is not None: mx.name = 'max(-top_down,' + cur_part.name + ')' else: max_name = None if cur_part.name is not None: mx_name = 'max(' + cur_part.name + ',' + mx.name + ')' mx = T.maximum(mx, cur_part) mx.name = mx_name mx.name = 'local_max(' + z_name + ')' pt = [] for i in xrange(r): pt.append([]) for j in xrange(c): z_ij = zpart[i][j] safe = z_ij - mx safe.name = 'safe_z(%s)' % z_ij.name cur_pt = T.exp(safe) cur_pt.name = 'pt(%s)' % z_ij.name pt[-1].append(cur_pt) off_pt = T.exp(t - mx) off_pt.name = 'p_tilde_off(%s)' % z_name denom = off_pt for i in xrange(r): for j in xrange(c): denom = denom + pt[i][j] denom.name = 'denom(%s)' % z_name off_prob = off_pt / denom p = 1. - off_prob p.name = 'p(%s)' % z_name hpart = [] for i in xrange(r): hpart.append([pt_ij / denom for pt_ij in pt[i]]) h = T.alloc(0., batch_size, ch, zr, zc) for i in xrange(r): for j in xrange(c): h.name = 'h_interm' h = T.set_subtensor(h[:, :, i:zr:r, j:zc:c], hpart[i][j]) h.name = 'h(%s)' % z_name if theano_rng is None: return p, h else: events = [] for i in xrange(r): for j in xrange(c): events.append(hpart[i][j]) events.append(off_prob) events = [event.dimshuffle(0, 1, 2, 3, 'x') for event in events] events = tuple(events) stacked_events = T.concatenate(events, axis=4) rows = zr // pool_shape[0] cols = zc // pool_shape[1] outcomes = pool_shape[0] * pool_shape[1] + 1 assert stacked_events.ndim == 5 for se, bs, r, c, chv in get_debug_values(stacked_events, batch_size, rows, cols, ch): assert se.shape[0] == bs assert se.shape[1] == r assert se.shape[2] == c assert se.shape[3] == chv assert se.shape[4] == outcomes reshaped_events = stacked_events.reshape( (batch_size * rows * cols * ch, outcomes)) multinomial = theano_rng.multinomial(pvals=reshaped_events, dtype=p.dtype) reshaped_multinomial = multinomial.reshape( (batch_size, ch, rows, cols, outcomes)) h_sample = T.alloc(0., batch_size, ch, zr, zc) idx = 0 for i in xrange(r): for j in xrange(c): h_sample = T.set_subtensor( h_sample[:, :, i:zr:r, j:zc:c], reshaped_multinomial[:, :, :, :, idx]) idx += 1 p_sample = 1 - reshaped_multinomial[:, :, :, :, -1] return p, h, p_sample, h_sample
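# Numpy sketch (not part of the pooling code above) of the probabilities its
# docstring describes for ONE pooled group: h are the k detector units in the
# pool, p is their max, and the "off" outcome competes with them via -top_down.
# It mirrors the stabilized max/sub/exp/div graph used above; names are
# illustrative.
import numpy as np

def pool_group_probs(z, top_down=0.0):
    """z: 1-D array of the k pre-activations in one pool."""
    logits = np.concatenate([z, [-top_down]])
    logits = logits - logits.max()            # stabilization, as in the graph above
    e = np.exp(logits)
    softmax = e / e.sum()
    p_h = softmax[:-1]                        # P(h[i] = 1)
    p_p = 1.0 - softmax[-1]                   # P(p = 1) = P(some h[i] is on)
    return p_h, p_p

p_h, p_p = pool_group_probs(np.array([1.0, -0.5, 0.2]), top_down=0.3)
print p_h, p_p, p_h.sum()                     # p_h sums to p_p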
def define_layers(self): self.layers = [] self.params = [] for i in xrange(self.num_hds): if i == 0: layer_input = self.X h_shape = (self.out_size, self.hidden_size_list[0]) else: layer_input = self.layers[i - 1].activation h_shape = (self.hidden_size_list[i - 1], self.hidden_size_list[i]) if self.cell == "gru": hidden_layer = GRULayer(self.rng, self.prefix + self.layer_id + str(i), h_shape, layer_input, self.mask, self.is_train, self.batch_size, self.drop_rate) elif self.cell == "lstm": hidden_layer = LSTMLayer(self.rng, self.prefix + self.layer_id + str(i), h_shape, layer_input, self.mask, self.is_train, self.batch_size, self.drop_rate) self.layers.append(hidden_layer) self.params += hidden_layer.params #the last decoder layer for decoding if self.num_hds == 0: output_layer_input = self.X last_shape = (self.in_size, self.out_size) else: output_layer_input = self.layers[-1].activation last_shape = (self.in_size, self.layers[-1].out_size) self.W_hy = init_weights((last_shape[1], last_shape[0]), self.prefix + "W_hy" + self.layer_id) self.b_y = init_bias(last_shape[0], self.prefix + "b_y" + self.layer_id) if self.cell == "gru": self.decoder = GRULayer(self.rng, self.prefix + self.layer_id, last_shape, output_layer_input, self.mask, self.is_train, self.batch_size, self.drop_rate) def _active(m, pre_h, x): x = T.reshape(x, (self.batch_size, last_shape[0])) pre_h = T.reshape(pre_h, (self.batch_size, last_shape[1])) h = self.decoder._active(x, pre_h) y = T.nnet.softmax(T.dot(h, self.W_hy) + self.b_y) y = y * m[:, None] h = T.reshape(h, (1, self.batch_size * last_shape[1])) y = T.reshape(y, (1, self.batch_size * last_shape[0])) return h, y [h, y], updates = theano.scan( _active, #n_steps = self.words, sequences=[self.mask], outputs_info=[{ 'initial': output_layer_input, 'taps': [-1] }, T.alloc(floatX(0.), 1, self.batch_size * last_shape[0])]) elif self.cell == "lstm": self.decoder = LSTMLayer(self.rng, self.prefix + self.layer_id, last_shape, output_layer_input, self.mask, self.is_train, self.batch_size, self.drop_rate) def _active(m, pre_h, pre_c, x): x = T.reshape(x, (self.batch_size, last_shape[0])) pre_h = T.reshape(pre_h, (self.batch_size, last_shape[1])) pre_c = T.reshape(pre_c, (self.batch_size, last_shape[1])) h, c = self.decoder._active(x, pre_h, pre_c) y = T.nnet.softmax(T.dot(h, self.W_hy) + self.b_y) y = y * m[:, None] h = T.reshape(h, (1, self.batch_size * last_shape[1])) c = T.reshape(c, (1, self.batch_size * last_shape[1])) y = T.reshape(y, (1, self.batch_size * last_shape[0])) return h, c, y [h, c, y], updates = theano.scan( _active, sequences=[self.mask], outputs_info=[{ 'initial': output_layer_input, 'taps': [-1] }, { 'initial': output_layer_input, 'taps': [-1] }, T.alloc(floatX(0.), 1, self.batch_size * last_shape[0])]) y = T.reshape(y, (self.words, self.batch_size * last_shape[0])) self.activation = y self.params += self.decoder.params self.params += [self.W_hy, self.b_y] # self.layers.append(self.decoder) self.hhhh = h
def build_sampler(tparams, options, trng): x = tensor.tensor3('x', dtype='float32') xr = x[::-1] n_timesteps = x.shape[0] n_samples = x.shape[1] # word embedding (source), forward and backward h=x hr=xr hidden_sizes=options['dim_enc'] for i in range(len(hidden_sizes)): proj = get_layer(options['encoder'])[1](tparams, h, options, prefix='encoder'+str(i)) # word embedding for backward rnn (source) projr = get_layer(options['encoder'])[1](tparams, hr, options, prefix='encoder_r'+str(i)) h=concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) if options['down_sample'][i]==1: h=h[0::2] hr=h[::-1] ctx = h # get the input for decoder rnn initializer mlp ctx_mean = ctx.mean(0) # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') print('Building f_init...',) outs = [init_state, ctx] f_init = theano.function([x], outs, name='f_init', profile=profile) print('Done') # x: 1 x 1 y = tensor.vector('y_sampler', dtype='int64') init_state = tensor.matrix('init_state', dtype='float32') alpha_past = tensor.matrix('alpha_past', dtype='float32') # if it's the first word, emb should be all zero and it is indicated by -1 emb = tensor.switch(y[:, None] < 0, tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), tparams['Wemb_dec'][y]) # apply one step of conditional gru with attention proj = get_layer(options['decoder'])[1](tparams, emb, options, prefix='decoder', mask=None, context=ctx, one_step=True, init_state=init_state, alpha_past = alpha_past) # get the next hidden state next_state = proj[0] # get the weighted averages of context for this target word y ctxs = proj[1] next_alpha_past = proj[3] logit_lstm = get_layer('ff')[1](tparams, next_state, options, prefix='ff_logit_lstm', activ='linear') logit_prev = get_layer('ff')[1](tparams, emb, options, prefix='ff_logit_prev', activ='linear') logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') logit = logit_lstm+logit_prev+logit_ctx # maxout layer shape = logit.shape shape1 = tensor.cast(shape[1] // 2, 'int64') shape2 = tensor.cast(2, 'int64') logit = logit.reshape([shape[0], shape1, shape2]) # batch*256 -> batch*128*2 logit=logit.max(2) # batch*500 logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') # compute the softmax probability next_probs = tensor.nnet.softmax(logit) # sample from softmax distribution to get the sample next_sample = trng.multinomial(pvals=next_probs).argmax(1) # compile a function to do the whole thing above, next word probability, # sampled word for the next target, next hidden state to be used print('Building f_next..') inps = [y, ctx, init_state, alpha_past] outs = [next_probs, next_sample, next_state, next_alpha_past] f_next = theano.function(inps, outs, name='f_next', profile=profile, on_unused_input='ignore') print('Done') return f_init, f_next
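# Standalone sketch of the maxout trick used in the sampler above: reshape the
# (batch, 2k) logit into (batch, k, 2) and take the max over the two pieces.
# Sizes and names below are made up for the illustration.
import numpy
import theano
import theano.tensor as tensor

logit = tensor.fmatrix('logit')                         # (batch, 2k)
shape = logit.shape
k = tensor.cast(shape[1] // 2, 'int64')
pieces = tensor.cast(2, 'int64')
maxout = logit.reshape([shape[0], k, pieces]).max(2)    # (batch, k)

f = theano.function([logit], maxout)
print f(numpy.arange(12, dtype='float32').reshape(3, 4))   # shape (3, 2)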
def max_pool_channels(z, pool_size, top_down=None, theano_rng=None): """ .. todo:: WRITEME properly Unlike Honglak's convolutional max pooling, which pools over spatial locations within each channels, this does max pooling in a densely connected model. Here we pool groups of channels together. z : a theano matrix representing a batch of input from below pool_size: int. the number of features to combine into one pooled unit top_down: (optional) a theano matrix representing input from above if None, assumes top-down input is 0 theano_rng: (optional) a MRG_RandomStreams instance returns: a theano matrix for the expected value of the detector layer h a theano matrix for the expected value of the pooling layer p if theano_rng is not None, also returns: a theano matrix of samples of the detector layer a theano matrix of samples of the pooling layer all matrices are formatted as (num_example, num_features) """ z_name = z.name if z_name is None: z_name = 'anon_z' if pool_size == 1: if top_down is None: top_down = 0. total_input = z + top_down p = T.nnet.sigmoid(total_input) h = p if theano_rng is None: return p, h else: t1 = time.time() p_samples = theano_rng.binomial(p=p, size=p.shape, dtype=p.dtype, n=1) t2 = time.time() if t2 - t1 > 0.5: warnings.warn("TODO: speed up theano's random number seeding. " "max pooling spent " + str(t2 - t1) + "in a call to theano_rng.binomial.") h_samples = p_samples return p_samples, h_samples, p_samples, h_samples else: batch_size, n = z.shape mx = None if top_down is None: t = 0. else: t = -top_down t.name = 'neg_top_down' zpart = [] for i in xrange(pool_size): cur_part = z[:, i:n:pool_size] if z_name is not None: cur_part.name = z_name + '[%d]' % (i) zpart.append(cur_part) if mx is None: mx = T.maximum(t, cur_part) if cur_part.name is not None: mx.name = 'max(-top_down,' + cur_part.name + ')' else: max_name = None if cur_part.name is not None: mx_name = 'max(' + cur_part.name + ',' + mx.name + ')' mx = T.maximum(mx, cur_part) mx.name = mx_name mx.name = 'local_max(' + z_name + ')' pt = [] for i in xrange(pool_size): z_i = zpart[i] safe = z_i - mx safe.name = 'safe_z(%s)' % z_i.name cur_pt = T.exp(safe) cur_pt.name = 'pt(%s)' % z_i.name assert cur_pt.ndim == 2 pt.append(cur_pt) off_pt = T.exp(t - mx) assert off_pt.ndim == 2 off_pt.name = 'p_tilde_off(%s)' % z_name denom = off_pt for i in xrange(pool_size): denom = denom + pt[i] assert denom.ndim == 2 denom.name = 'denom(%s)' % z_name off_prob = off_pt / denom p = 1. - off_prob assert p.dtype == z.dtype hpart = [pt_i / denom for pt_i in pt] h = T.alloc(0., batch_size, n) for i in xrange(pool_size): h.name = 'h_interm' hp = hpart[i] sub_h = h[:, i:n:pool_size] assert sub_h.ndim == 2 assert hp.ndim == 2 for hv, hsv, hpartv in get_debug_values(h, sub_h, hp): print hv.shape print hsv.shape print hpartv.shape h = T.set_subtensor(sub_h, hp) p.name = 'p(%s)' % z_name h.name = 'h(%s)' % z_name if theano_rng is None: return p, h else: events = [] for i in xrange(pool_size): events.append(hpart[i]) events.append(off_prob) events = [event.dimshuffle(0, 1, 'x') for event in events] events = tuple(events) stacked_events = T.concatenate(events, axis=2) outcomes = pool_size + 1 reshaped_events = stacked_events.reshape( (batch_size * n // pool_size, outcomes)) t1 = time.time() multinomial = theano_rng.multinomial(pvals=reshaped_events, dtype=p.dtype) t2 = time.time() if t2 - t1 > 0.5: warnings.warn("TODO: speed up theano's random number seeding." 
"max pooling spent " + str(t2 - t1) + " in a call to theano_rng.multinomial.") reshaped_multinomial = multinomial.reshape( (batch_size, n // pool_size, outcomes)) h_sample = T.zeros_like(z) idx = 0 for i in xrange(pool_size): h_sample = T.set_subtensor(h_sample[:, i:n:pool_size], reshaped_multinomial[:, :, idx]) idx += 1 p_sample = 1 - reshaped_multinomial[:, :, -1] assert h_sample.dtype == z.dtype return p, h, p_sample, h_sample
def train(self, savefile, task, recover=True): """ Train the RNN. Parameters ---------- savefile : str task : function recover : bool, optional If `True`, will attempt to recover from a previously saved run. """ N = self.p['N'] Nin = self.p['Nin'] Nout = self.p['Nout'] alpha = self.p['dt']/self.p['tau'] # Initialize settings settings = OrderedDict() # Check if file already exists if not recover: if os.path.isfile(savefile): os.remove(savefile) #--------------------------------------------------------------------------------- # Are we using GPUs? #--------------------------------------------------------------------------------- if theanotools.get_processor_type() == 'gpu': settings['GPU'] = 'enabled' else: settings['GPU'] = 'no' #--------------------------------------------------------------------------------- # Random number generator #--------------------------------------------------------------------------------- settings['init seed'] = self.p['seed'] rng = np.random.RandomState(self.p['seed']) #--------------------------------------------------------------------------------- # Weight initialization #--------------------------------------------------------------------------------- settings['distribution (Win)'] = self.p['distribution_in'] settings['distribution (Wrec)'] = self.p['distribution_rec'] settings['distribution (Wout)'] = self.p['distribution_out'] if Nin > 0: Win_0 = self.init_weights(rng, self.p['Cin'], N, Nin, self.p['distribution_in']) Wrec_0 = self.init_weights(rng, self.p['Crec'], N, N, self.p['distribution_rec']) Wout_0 = self.init_weights(rng, self.p['Cout'], Nout, N, self.p['distribution_out']) #--------------------------------------------------------------------------------- # Enforce Dale's law on the initial weights #--------------------------------------------------------------------------------- settings['Nin/N/Nout'] = '{}/{}/{}'.format(Nin, N, Nout) if self.p['ei'] is not None: Nexc = len(np.where(self.p['ei'] > 0)[0]) Ninh = len(np.where(self.p['ei'] < 0)[0]) settings['Dale\'s law'] = 'E/I = {}/{}'.format(Nexc, Ninh) if Nin > 0: Win_0 = abs(Win_0) # If Dale, assume inputs are excitatory Wrec_0 = abs(Wrec_0) Wout_0 = abs(Wout_0) else: settings['Dale\'s law'] = 'no' #--------------------------------------------------------------------------------- # Fix spectral radius #--------------------------------------------------------------------------------- # Compute spectral radius C = self.p['Crec'] if C is not None: Wrec_0_full = C.mask_plastic*Wrec_0 + C.mask_fixed else: Wrec_0_full = Wrec_0 if self.p['ei'] is not None: Wrec_0_full = Wrec_0_full*self.p['ei'] rho = RNN.spectral_radius(Wrec_0_full) # Scale Wrec to have fixed spectral radius if self.p['ei'] is not None: R = self.p['rho0']/rho else: R = 1.1/rho Wrec_0 *= R if C is not None: C.mask_fixed *= R # Check spectral radius if C is not None: Wrec_0_full = C.mask_plastic*Wrec_0 + C.mask_fixed else: Wrec_0_full = Wrec_0 if self.p['ei'] is not None: Wrec_0_full = Wrec_0_full*self.p['ei'] rho = RNN.spectral_radius(Wrec_0_full) settings['initial spectral radius'] = '{:.2f}'.format(rho) #--------------------------------------------------------------------------------- # Others #--------------------------------------------------------------------------------- brec_0 = self.p['brec']*np.ones(N) bout_0 = self.p['bout']*np.ones(Nout) x0_0 = self.p['x0']*np.ones(N) #--------------------------------------------------------------------------------- # RNN parameters 
#--------------------------------------------------------------------------------- if Nin > 0: Win = theanotools.shared(Win_0, name='Win') else: Win = None Wrec = theanotools.shared(Wrec_0, name='Wrec') Wout = theanotools.shared(Wout_0, name='Wout') brec = theanotools.shared(brec_0, name='brec') bout = theanotools.shared(bout_0, name='bout') x0 = theanotools.shared(x0_0, name='x0') #--------------------------------------------------------------------------------- # Parameters to train #--------------------------------------------------------------------------------- trainables = [] if Win is not None: trainables += [Win] trainables += [Wrec] if Wout is not None: trainables += [Wout] if self.p['train_brec']: settings['train recurrent bias'] = 'yes' trainables += [brec] else: settings['train recurrent bias'] = 'no' if self.p['train_bout']: settings['train output bias'] = 'yes' trainables += [bout] else: settings['train output bias'] = 'no' # In continuous mode it doesn't make sense to train x0, which is forgotten if self.p['mode'] == 'continuous': self.p['train_x0'] = False if self.p['train_x0']: settings['train initial conditions'] = 'yes' trainables += [x0] else: settings['train initial conditions'] = 'no' #--------------------------------------------------------------------------------- # Weight matrices #--------------------------------------------------------------------------------- # Input if Nin > 0: if self.p['Cin'] is not None: C = self.p['Cin'] settings['sparseness (Win)'] = ('p = {:.2f}, p_plastic = {:.2f}' .format(C.p, C.p_plastic)) Cin_mask_plastic = theanotools.shared(C.mask_plastic) Cin_mask_fixed = theanotools.shared(C.mask_fixed) Win_ = Cin_mask_plastic*Win + Cin_mask_fixed Win_.name = 'Win_' else: Win_ = Win # Recurrent if self.p['Crec'] is not None: C = self.p['Crec'] settings['sparseness (Wrec)'] = ('p = {:.2f}, p_plastic = {:.2f}' .format(C.p, C.p_plastic)) Crec_mask_plastic = theanotools.shared(C.mask_plastic) Crec_mask_fixed = theanotools.shared(C.mask_fixed) Wrec_ = Crec_mask_plastic*Wrec + Crec_mask_fixed Wrec_.name = 'Wrec_' else: Wrec_ = Wrec # Output if self.p['Cout'] is not None: C = self.p['Cout'] settings['sparseness (Wout)'] = ('p = {:.2f}, p_plastic = {:.2f}' .format(C.p, C.p_plastic)) Cout_mask_plastic = theanotools.shared(C.mask_plastic) Cout_mask_fixed = theanotools.shared(C.mask_fixed) Wout_ = Cout_mask_plastic*Wout + Cout_mask_fixed Wout_.name = 'Wout_' else: Wout_ = Wout #--------------------------------------------------------------------------------- # Dale's law #--------------------------------------------------------------------------------- if self.p['ei'] is not None: # Function to keep matrix elements positive if self.p['ei_positive_func'] == 'abs': settings['E/I positivity function'] = 'absolute value' make_positive = abs elif self.p['ei_positive_func'] == 'rectify': settings['E/I positivity function'] = 'rectify' make_positive = theanotools.rectify else: raise ValueError("Unknown ei_positive_func.") # Assume inputs are excitatory if Nin > 0: Win_ = make_positive(Win_) # E/I ei = theanotools.shared(self.p['ei'], name='ei') Wrec_ = make_positive(Wrec_)*ei Wout_ = make_positive(Wout_)*ei #--------------------------------------------------------------------------------- # Variables to save #--------------------------------------------------------------------------------- if Nin > 0: save_values = [Win_] else: save_values = [None] save_values += [Wrec_, Wout_, brec, bout, x0] 
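# Note on the block above: Win_/Wrec_/Wout_ are the *effective* weight matrices that enter the
# graph -- trainable entries selected by mask_plastic plus a frozen mask_fixed component -- and,
# when Dale's law is enabled, their magnitudes are kept nonnegative (abs or rectify) while the
# ei vector supplies the sign of each presynaptic column; save_values therefore collects these
# effective matrices (plus the biases and x0) rather than the raw shared variables.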
#--------------------------------------------------------------------------------- # Activation functions #--------------------------------------------------------------------------------- f_hidden, d_f_hidden = theanotools.hidden_activations[self.p['hidden_activation']] settings['hidden activation'] = self.p['hidden_activation'] act = self.p['output_activation'] f_output = theanotools.output_activations[act] if act == 'sigmoid': settings['output activation/loss'] = 'sigmoid/binary cross entropy' f_loss = theanotools.binary_crossentropy elif act == 'softmax': settings['output activation/loss'] = 'softmax/categorical cross entropy' f_loss = theanotools.categorical_crossentropy else: settings['output activation/loss'] = act + '/squared' f_loss = theanotools.L2 #--------------------------------------------------------------------------------- # RNN #--------------------------------------------------------------------------------- # Dims: time, trials, units # u[:,:,:Nin] contains the inputs (including baseline and noise), # u[:,:,Nin:] contains the recurrent noise u = T.tensor3('u') x0_ = T.alloc(x0, u.shape[1], x0.shape[0]) if Nin > 0: def rnn(u_t, x_tm1, r_tm1, WinT, WrecT): x_t = ((1 - alpha)*x_tm1 + alpha*(T.dot(r_tm1, WrecT) # Recurrent + brec # Bias + T.dot(u_t[:,:Nin], WinT) # Input + u_t[:,Nin:]) # Recurrent noise ) r_t = f_hidden(x_t) return [x_t, r_t] [x, r], _ = theano.scan(fn=rnn, outputs_info=[x0_, f_hidden(x0_)], sequences=u, non_sequences=[Win_.T, Wrec_.T]) else: def rnn(u_t, x_tm1, r_tm1, WrecT): x_t = ((1 - alpha)*x_tm1 + alpha*(T.dot(r_tm1, WrecT) # Recurrent + brec # Bias + u_t[:,Nin:]) # Recurrent noise ) r_t = f_hidden(x_t) return [x_t, r_t] [x, r], _ = theano.scan(fn=rnn, outputs_info=[x0_, f_hidden(x0_)], sequences=u, non_sequences=[Wrec_.T]) #--------------------------------------------------------------------------------- # Running mode #--------------------------------------------------------------------------------- if self.p['mode'] == 'continuous': settings['mode'] = 'continuous' if self.p['n_gradient'] != 1: print("[ Trainer.train ] In continuous mode," " so we're setting n_gradient to 1.") self.p['n_gradient'] = 1 x0_ = x[-1] else: settings['mode'] = 'batch' #--------------------------------------------------------------------------------- # Readout #--------------------------------------------------------------------------------- z = f_output(T.dot(r, Wout_.T) + bout) #--------------------------------------------------------------------------------- # Deduce whether the task specification contains an output mask -- use a # temporary dataset so it doesn't affect the training. #--------------------------------------------------------------------------------- dataset = Dataset(1, task, self.floatX, self.p, name='gradient') if dataset.has_output_mask(): settings['output mask'] = 'yes' else: settings['output mask'] = 'no' #--------------------------------------------------------------------------------- # Loss #--------------------------------------------------------------------------------- # (time, trials, outputs) target = T.tensor3('target') # Set mask mask = target[:,:,Nout:] masknorm = T.sum(mask) # Input-output pairs inputs = [u, target] # target[:,:,:Nout] contains the target outputs, & # target[:,:,Nout:] contains the mask. 
# Loss, not including the regularization terms loss = T.sum(f_loss(z, target[:,:,:Nout])*mask)/masknorm # Root-mean-squared error error = T.sqrt(T.sum(theanotools.L2(z, target[:,:,:Nout])*mask)/masknorm) #--------------------------------------------------------------------------------- # Regularization terms #--------------------------------------------------------------------------------- regs = 0 #--------------------------------------------------------------------------------- # L1 weight regularization #--------------------------------------------------------------------------------- lambda1 = self.p['lambda1_in'] if lambda1 > 0: settings['L1 weight regularization (Win)'] = ('lambda1_in = {}' .format(lambda1)) regs += lambda1 * T.mean(abs(Win)) lambda1 = self.p['lambda1_rec'] if lambda1 > 0: settings['L1 weight regularization (Wrec)'] = ('lambda1_rec = {}' .format(lambda1)) regs += lambda1 * T.mean(abs(Wrec)) lambda1 = self.p['lambda1_out'] if lambda1 > 0: settings['L1 weight regularization (Wout)'] = ('lambda1_out = {}' .format(lambda1)) regs += lambda1 * T.mean(abs(Wout)) #--------------------------------------------------------------------------------- # L2 weight regularization #--------------------------------------------------------------------------------- if Nin > 0: lambda2 = self.p['lambda2_in'] if lambda2 > 0: settings['L2 weight regularization (Win)'] = ('lambda2_in = {}' .format(lambda2)) regs += lambda2 * T.mean(Win**2) lambda2 = self.p['lambda2_rec'] if lambda2 > 0: settings['L2 weight regularization (Wrec)'] = ('lambda2_rec = {}' .format(lambda2)) regs += lambda2 * T.mean(Wrec**2) lambda2 = self.p['lambda2_out'] if lambda2 > 0: settings['L2 weight regularization (Wout)'] = ('lambda2_out = {}' .format(lambda2)) regs += lambda2 * T.mean(Wout**2) #--------------------------------------------------------------------------------- # L2 rate regularization #--------------------------------------------------------------------------------- lambda2 = self.p['lambda2_r'] if lambda2 > 0: settings['L2 rate regularization'] = 'lambda2_r = {}'.format(lambda2) regs += lambda2 * T.mean(r**2) #--------------------------------------------------------------------------------- # Final costs #--------------------------------------------------------------------------------- costs = [loss, error] #--------------------------------------------------------------------------------- # Datasets #--------------------------------------------------------------------------------- gradient_data = Dataset(self.p['n_gradient'], task, self.floatX, self.p, batch_size=self.p['gradient_batch_size'], seed=self.p['gradient_seed'], name='gradient') validation_data = Dataset(self.p['n_validation'], task, self.floatX, self.p, batch_size=self.p['validation_batch_size'], seed=self.p['validation_seed'], name='validation') # Input noise if np.isscalar(self.p['var_in']): if Nin > 0: settings['sigma_in'] = '{}'.format(np.sqrt(self.p['var_in'])) else: settings['sigma_in'] = 'array' # Recurrent noise if np.isscalar(self.p['var_rec']): settings['sigma_rec'] = '{}'.format(np.sqrt(self.p['var_rec'])) else: settings['sigma_rec'] = 'array' # Dataset settings settings['rectify inputs'] = self.p['rectify_inputs'] settings['gradient minibatch size'] = gradient_data.minibatch_size settings['validation minibatch size'] = validation_data.minibatch_size #--------------------------------------------------------------------------------- # Other settings #--------------------------------------------------------------------------------- 
settings['dt'] = '{} ms'.format(self.p['dt']) if np.isscalar(self.p['tau']): settings['tau'] = '{} ms'.format(self.p['tau']) else: settings['tau'] = 'custom' settings['tau_in'] = '{} ms'.format(self.p['tau_in']) settings['learning rate'] = '{}'.format(self.p['learning_rate']) settings['lambda_Omega'] = '{}'.format(self.p['lambda_Omega']) settings['max gradient norm'] = '{}'.format(self.p['max_gradient_norm']) #--------------------------------------------------------------------------------- # A few important Theano settings #--------------------------------------------------------------------------------- settings['(Theano) floatX'] = self.floatX settings['(Theano) allow_gc'] = theano.config.allow_gc #--------------------------------------------------------------------------------- # Train! #--------------------------------------------------------------------------------- print_settings(settings) sgd = SGD(trainables, inputs, costs, regs, x, z, self.p, save_values, {'Wrec_': Wrec_, 'd_f_hidden': d_f_hidden}) sgd.train(gradient_data, validation_data, savefile)
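# --- Standalone NumPy sketch (hypothetical sizes and noise scale, not the Theano graph built in
# --- train) of what the scan step integrates: forward Euler for
# ---     tau * dx/dt = -x + Wrec r + brec + Win u + noise,
# --- i.e. x_t = (1 - alpha) x_{t-1} + alpha (Wrec r_{t-1} + brec + Win u_t + xi_t), alpha = dt/tau,
# --- with the initial Wrec rescaled to a chosen spectral radius as in the trainer above.
import numpy as np

def sketch_leaky_rnn(N=50, Nin=3, T_steps=200, dt=10.0, tau=100.0, rho0=1.5, seed=0):
    rng = np.random.RandomState(seed)
    alpha = dt / tau
    Wrec = rng.randn(N, N) / np.sqrt(N)
    rho = np.max(np.abs(np.linalg.eigvals(Wrec)))   # spectral radius of the initial weights
    Wrec *= rho0 / rho                              # rescale to the target radius rho0
    Win = 0.1 * rng.randn(N, Nin)
    brec = np.zeros(N)
    relu = lambda v: np.maximum(v, 0.)              # stand-in for f_hidden
    x = np.zeros(N)
    r = relu(x)
    xs = []
    for _ in range(T_steps):
        u_t = 0.1 * rng.randn(Nin)                  # external input
        xi_t = 0.05 * rng.randn(N)                  # recurrent noise; the real scale comes from var_rec
        x = (1 - alpha) * x + alpha * (Wrec @ r + brec + Win @ u_t + xi_t)
        r = relu(x)
        xs.append(x.copy())
    return np.array(xs)                             # (T_steps, N) trajectory of hidden states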
def max_pool_c01b(z, pool_shape, top_down=None, theano_rng=None): """ .. todo:: WRITEME properly Like max_pool but with all 4-tensors formatted with axes ('c', 0, 1, 'b'). This is for maximum speed when using-cuda convnet. Performance notes: Stabilizing the softmax is one source slowness. Here it is stabilized with several calls to maximum and sub. It might also be possible to stabilize it with T.maximum(-top_down,<cuda convnet max pooling>). Don't know if that would be faster or slower. Benchmarks show that most of the time is spent in GpuIncSubtensor when running on gpu. So it is mostly that which needs a faster implementation. One other way to implement this would be with cuda convnet convolution, where the convolution stride is equal to the pool width, and the thing to multiply with is the hparts stacked along the channel axis. This isn't a feasible solution for max_pool because of theano convolution's poor support for strides, but for cuda convnet it could give a speedup. """ z_name = z.name if z_name is None: z_name = 'anon_z' ch, zr, zc, batch_size = z.shape r, c = pool_shape zpart = [] mx = None if top_down is None: t = 0. else: t = -top_down t.name = 'neg_top_down' for i in xrange(r): zpart.append([]) for j in xrange(c): cur_part = z[:, i:zr:r, j:zc:c, :] if z_name is not None: cur_part.name = z_name + '[%d, %d]' % (i, j) zpart[i].append(cur_part) if mx is None: mx = T.maximum(t, cur_part) if cur_part.name is not None: mx.name = 'max(-top_down,' + cur_part.name + ')' else: max_name = None if cur_part.name is not None: mx_name = 'max(' + cur_part.name + ',' + mx.name + ')' mx = T.maximum(mx, cur_part) mx.name = mx_name mx.name = 'local_max(' + z_name + ')' pt = [] for i in xrange(r): pt.append([]) for j in xrange(c): z_ij = zpart[i][j] safe = z_ij - mx safe.name = 'safe_z(%s)' % z_ij.name cur_pt = T.exp(safe) cur_pt.name = 'pt(%s)' % z_ij.name pt[-1].append(cur_pt) off_pt = T.exp(t - mx) off_pt.name = 'p_tilde_off(%s)' % z_name denom = off_pt for i in xrange(r): for j in xrange(c): denom = denom + pt[i][j] denom.name = 'denom(%s)' % z_name off_prob = off_pt / denom p = 1. - off_prob p.name = 'p(%s)' % z_name hpart = [] for i in xrange(r): hpart.append([pt_ij / denom for pt_ij in pt[i]]) h = T.alloc(0., ch, zr, zc, batch_size) for i in xrange(r): for j in xrange(c): h.name = 'h_interm' h = T.set_subtensor(h[:, i:zr:r, j:zc:c, :], hpart[i][j]) h.name = 'h(%s)' % z_name if theano_rng is None: return p, h else: events = [] for i in xrange(r): for j in xrange(c): events.append(hpart[i][j]) events.append(off_prob) events = [event.dimshuffle(0, 1, 2, 3, 'x') for event in events] events = tuple(events) stacked_events = T.concatenate(events, axis=4) ch, rows, cols, batch_size, outcomes = stacked_events.shape reshaped_events = stacked_events.reshape( (ch * rows * cols * batch_size, outcomes)) multinomial = theano_rng.multinomial(pvals=reshaped_events, dtype=p.dtype) reshaped_multinomial = multinomial.reshape( (ch, rows, cols, batch_size, outcomes)) h_sample = T.alloc(0., ch, zr, zc, batch_size) idx = 0 for i in xrange(r): for j in xrange(c): h_sample = T.set_subtensor( h_sample[:, i:zr:r, j:zc:c, :], reshaped_multinomial[:, :, :, :, idx]) idx += 1 p_sample = 1 - reshaped_multinomial[:, :, :, :, -1] return p, h, p_sample, h_sample
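# --- Standalone NumPy sketch (single pooling region, hypothetical values; the `sketch_` helper
# --- is illustrative only) of the stabilized softmax that max_pool_c01b evaluates per r x c
# --- region: every detector unit competes with an "off" outcome driven by -top_down, and
# --- subtracting the running maximum before exp() is what keeps the computation from overflowing.
import numpy as np

def sketch_prob_max_pool_region(z_region=None, top_down=0.3, seed=0):
    rng = np.random.RandomState(seed)
    if z_region is None:
        z_region = rng.randn(4)                     # pre-activations of one r*c pooling region
    mx = max(np.max(z_region), -top_down)           # shared stabilizer, as in the maximum/sub calls
    pt = np.exp(z_region - mx)                      # unnormalized "this detector unit is on"
    off_pt = np.exp(-top_down - mx)                 # unnormalized "every unit in the region is off"
    denom = off_pt + pt.sum()
    h = pt / denom                                  # P(detector unit on)
    p = 1. - off_pt / denom                         # P(pooled unit on), equal to h.sum()
    return p, h
# e.g.  p, h = sketch_prob_max_pool_region()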
def __init__(self, We, params): num_inputs = We.shape[1] lstm_layers_num = 1 self.eta = params.eta self.num_labels = params.num_labels self.en_hidden_size = params.en_hidden_size self.de_hidden_size = params.de_hidden_size self.lstm_layers_num = params.lstm_layers_num self._train = None self._utter = None self.params = [] self.encoder_lstm_layers = [] self.decoder_lstm_layers = [] self.hos = [] self.Cos = [] encoderInputs = tensor.imatrix() decoderInputs, decoderTarget = tensor.imatrices(2) encoderMask, TF, decoderMask, decoderInputs0 = tensor.fmatrices(4) self.lookuptable = theano.shared(We) #### the last one is for the stary symbole self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform( self.num_labels + 1, self.de_hidden_size), borrow=True) self.linear = theano.shared( name="Linear", value=init_xavier_uniform( self.de_hidden_size + 2 * self.en_hidden_size, self.num_labels), borrow=True) self.linear_bias = theano.shared( name="Hidden to Bias", value=np.asarray(np.random.randn(self.num_labels, ) * 0., dtype=theano.config.floatX), borrow=True) #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*en_hidden_size, self.de_hidden_size), borrow = True) #self.hidden_bias = theano.shared( # name="Hidden to Bias", # value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) , # borrow=True # ) #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias] #concatenate self.params += [self.linear, self.linear_bias, self.de_lookuptable ] #the initial hidden state of decoder lstm is zeros #(max_sent_size, batch_size, hidden_size) state_below = self.lookuptable[encoderInputs.flatten()].reshape( (encoderInputs.shape[0], encoderInputs.shape[1], num_inputs)) for _ in range(self.lstm_layers_num): enclstm_f = LSTM(num_inputs, self.en_hidden_size) enclstm_b = LSTM(num_inputs, self.en_hidden_size, True) self.encoder_lstm_layers.append(enclstm_f) #append self.encoder_lstm_layers.append(enclstm_b) #append self.params += enclstm_f.params + enclstm_b.params #concatenate hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask) hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask) hs = tensor.concatenate([hs_f, hs_b], axis=2) Cs = tensor.concatenate([Cs_f, Cs_b], axis=2) hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1) Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1) #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias), #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias), self.hos += tensor.alloc( np.asarray(0., dtype=theano.config.floatX), encoderInputs.shape[1], self.de_hidden_size), self.Cos += tensor.alloc( np.asarray(0., dtype=theano.config.floatX), encoderInputs.shape[1], self.de_hidden_size), state_below = hs Encoder = state_below ei, di, dt = tensor.imatrices(3) #place holders em, dm, tf, di0 = tensor.fmatrices(4) self.encoder_function = theano.function(inputs=[ei, em], outputs=Encoder, givens={ encoderInputs: ei, encoderMask: em }) ##################################################### ##################################################### state_below = self.de_lookuptable[decoderInputs.flatten()].reshape( (decoderInputs.shape[0], decoderInputs.shape[1], self.de_hidden_size)) for i in range(self.lstm_layers_num): declstm = LSTM(self.de_hidden_size, self.de_hidden_size) self.decoder_lstm_layers += declstm, #append self.params += declstm.params #concatenate ho, Co = self.hos[i], self.Cos[i] state_below, Cs = 
declstm.forward(state_below, decoderMask, ho, Co) ##### Here we include the representation from the decoder decoder_lstm_outputs = tensor.concatenate([state_below, Encoder], axis=2) linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :] softmax_outputs, _ = theano.scan( fn=lambda x: tensor.nnet.softmax(x), sequences=[linear_outputs], ) def _NLL(pred, y, m): return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]), y]) costs, _ = theano.scan( fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask]) loss = costs.sum() / decoderMask.sum() + params.L2 * sum( lasagne.regularization.l2(x) for x in self.params) updates = lasagne.updates.adam(loss, self.params, self.eta) #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9) ################################################### #### using the ground truth when training ################################################## self._train = theano.function(inputs=[ei, em, di, dm, dt], outputs=[loss, softmax_outputs], updates=updates, givens={ encoderInputs: ei, encoderMask: em, decoderInputs: di, decoderMask: dm, decoderTarget: dt }) ######################################################################### ### For schedule sampling ######################################################################### ###### always use privous predict as next input def _step2(ctx_, state_, hs_, Cs_): ### ctx_: b x h ### state_ : b x h ### hs_ : 1 x b x h the first dimension is the number of the decoder layers ### Cs_ : 1 x b x h the first dimension is the number of the decoder layers hs, Cs = [], [] token_idxs = tensor.cast(state_.argmax(axis=-1), "int32") msk_ = tensor.fill( (tensor.zeros_like(token_idxs, dtype="float32")), 1) msk_ = msk_.dimshuffle('x', 0) state_below0 = self.de_lookuptable[token_idxs].reshape( (1, ctx_.shape[0], self.de_hidden_size)) for i, lstm in enumerate(self.decoder_lstm_layers): h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i]) #mind msk hs += h[-1], Cs += C[-1], state_below0 = h hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable( Cs) state_below0 = state_below0.reshape( (ctx_.shape[0], self.de_hidden_size)) state_below0 = tensor.concatenate([ctx_, state_below0], axis=1) newpred = tensor.dot(state_below0, self.linear) + self.linear_bias[None, :] state_below = tensor.nnet.softmax(newpred) ##### the beging symbole probablity is 0 extra_p = tensor.zeros_like(hs[:, :, 0]) state_below = tensor.concatenate([state_below, extra_p.T], axis=1) return state_below, hs, Cs ctx_0, state_0 = tensor.fmatrices(2) hs_0 = tensor.ftensor3() Cs_0 = tensor.ftensor3() state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0) self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0], [state_below_tmp, hs_tmp, Cs_tmp], name='f_next') hs0, Cs0 = tensor.as_tensor_variable( self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos, name="Cs0") train_outputs, _ = theano.scan(fn=_step2, sequences=[Encoder], outputs_info=[decoderInputs0, hs0, Cs0], n_steps=encoderInputs.shape[0]) train_predict = train_outputs[0] train_costs, _ = theano.scan( fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask]) train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum( lasagne.regularization.l2(x) for x in self.params) ##from adam import adam ##train_updates = adam(train_loss, self.params, self.eta) #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9) #train_updates = lasagne.updates.sgd(train_loss, self.params, self.eta) 
#train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9) from momentum import momentum train_updates = momentum(train_loss, self.params, params.eta, momentum=0.9) self._train2 = theano.function( inputs=[ei, em, di0, dm, dt], outputs=[train_loss, train_predict], updates=train_updates, givens={ encoderInputs: ei, encoderMask: em, decoderInputs0: di0, decoderMask: dm, decoderTarget: dt } #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf} ) listof_token_idx = train_predict.argmax(axis=-1) self._utter = theano.function(inputs=[ei, em, di0], outputs=listof_token_idx, givens={ encoderInputs: ei, encoderMask: em, decoderInputs0: di0 })
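# --- Standalone NumPy sketch (hypothetical batch and label count) of the masked per-step loss
# --- _NLL used above: gather the predicted probability of the gold label for every batch element
# --- and zero out padded time steps; theano.scan sums these over time before dividing by
# --- decoderMask.sum().
import numpy as np

def sketch_masked_nll(seed=0):
    rng = np.random.RandomState(seed)
    batch, n_labels = 3, 5
    pred = rng.dirichlet(np.ones(n_labels), size=batch)   # one softmax_outputs step, rows sum to 1
    y = np.array([2, 0, 4])                               # gold labels at this time step
    m = np.array([1., 1., 0.])                            # 0 marks a padded position
    nll = -m * np.log(pred[np.arange(batch), y])          # same gather as pred[tensor.arange(...), y]
    return nll.sum()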
def gru_cond_layer(tparams, state_below, options, prefix='gru', mask=None, context=None, one_step=False, init_memory=None, init_state=None, alpha_past=None, context_mask=None, **kwargs): assert context, 'Context must be provided' if one_step: assert init_state, 'previous state must be provided' nsteps = state_below.shape[0] if state_below.ndim == 3: n_samples = state_below.shape[1] else: n_samples = 1 # mask if mask is None: mask = tensor.alloc(1., state_below.shape[0], 1) dim = tparams[_p(prefix, 'Wcx')].shape[1] dimctx = tparams[_p(prefix, 'Wcx')].shape[0] pad = (tparams[_p(prefix, 'conv_Q')].shape[2]-1)//2 # initial/previous state if init_state is None: init_state = tensor.alloc(0., n_samples, dim) # projected context assert context.ndim == 3, \ 'Context must be 3-d: #annotation x #sample x dim' if alpha_past is None: alpha_past = tensor.alloc(0., n_samples, context.shape[0]) pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\ tparams[_p(prefix, 'b_att')] def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n*dim:(n+1)*dim] return _x[:, n*dim:(n+1)*dim] # projected x state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\ tparams[_p(prefix, 'bx')] state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\ tparams[_p(prefix, 'b')] state_belowyg = tensor.dot(state_below, tparams[_p(prefix, 'Wyg')]) +\ tparams[_p(prefix, 'byg')] # state_below_ : x_ 2*dim ; state_belowx : xx_ 1*dim ; represents E*y def _step_slice(m_, x_, xx_, yg, h_, ctx_, alpha_, alpha_past_, beta, pctx_, cc_, U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl, conv_Q, conv_Uf, conv_b, Whg, bhg, Umg, W_m_att, U_when_att, c_when_att): preact1 = tensor.dot(h_, U) preact1 += x_ preact1 = tensor.nnet.sigmoid(preact1) r1 = _slice(preact1, 0, dim) # reset gate u1 = _slice(preact1, 1, dim) # update gate preactx1 = tensor.dot(h_, Ux) preactx1 *= r1 preactx1 += xx_ h1 = tensor.tanh(preactx1) h1 = u1 * h_ + (1. - u1) * h1 h1 = m_[:, None] * h1 + (1. 
- m_)[:, None] * h_ g_m = tensor.dot(h_, Whg) + bhg g_m += yg g_m = tensor.nnet.sigmoid(g_m) mt = tensor.dot(h1, Umg) mt = tensor.tanh(mt) mt *= g_m # attention pstate_ = tensor.dot(h1, W_comb_att) # converage vector cover_F = theano.tensor.nnet.conv2d(alpha_past_[:,None,:,None],conv_Q,border_mode='half') # batch x dim x SeqL x 1 cover_F = cover_F.dimshuffle(1,2,0,3) # dim x SeqL x batch x 1 cover_F = cover_F.reshape([cover_F.shape[0],cover_F.shape[1],cover_F.shape[2]]) assert cover_F.ndim == 3, \ 'Output of conv must be 3-d: #dim x SeqL x batch' #cover_F = cover_F[:,pad:-pad,:] cover_F = cover_F.dimshuffle(1, 2, 0) # cover_F must be SeqL x batch x dimctx cover_vector = tensor.dot(cover_F, conv_Uf) + conv_b # cover_vector = cover_vector * context_mask[:,:,None] pctx__ = pctx_ + pstate_[None, :, :] + cover_vector #pctx__ += xc_ pctx__ = tensor.tanh(pctx__) alpha = tensor.dot(pctx__, U_att)+c_tt # compute alpha_when pctx_when = tensor.dot(mt, W_m_att) pctx_when += pstate_ pctx_when = tensor.tanh(pctx_when) alpha_when = tensor.dot(pctx_when, U_when_att)+c_when_att # batch * 1 alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) # SeqL * batch alpha = tensor.exp(alpha) alpha_when = tensor.exp(alpha_when) if context_mask: alpha = alpha * context_mask if context_mask: alpha_mean = alpha.sum(0, keepdims=True) / context_mask.sum(0, keepdims=True) else: alpha_mean = alpha.mean(0, keepdims=True) alpha_when = concatenate([alpha_mean, alpha_when.T], axis=0) # (SeqL+1)*batch alpha = alpha / alpha.sum(0, keepdims=True) alpha_when = alpha_when / alpha_when.sum(0, keepdims=True) beta = alpha_when[-1, :] alpha_past = alpha_past_ + alpha.T ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context ctx_ = beta[:, None] * mt + (1. - beta)[:, None] * ctx_ preact2 = tensor.dot(h1, U_nl)+b_nl preact2 += tensor.dot(ctx_, Wc) preact2 = tensor.nnet.sigmoid(preact2) r2 = _slice(preact2, 0, dim) u2 = _slice(preact2, 1, dim) preactx2 = tensor.dot(h1, Ux_nl)+bx_nl preactx2 *= r2 preactx2 += tensor.dot(ctx_, Wcx) h2 = tensor.tanh(preactx2) h2 = u2 * h1 + (1. - u2) * h2 h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1 return h2, ctx_, alpha.T, alpha_past, beta # pstate_, preact, preactx, r, u seqs = [mask, state_below_, state_belowx, state_belowyg] #seqs = [mask, state_below_, state_belowx, state_belowc] _step = _step_slice shared_vars = [tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Wc')], tparams[_p(prefix, 'W_comb_att')], tparams[_p(prefix, 'U_att')], tparams[_p(prefix, 'c_tt')], tparams[_p(prefix, 'Ux')], tparams[_p(prefix, 'Wcx')], tparams[_p(prefix, 'U_nl')], tparams[_p(prefix, 'Ux_nl')], tparams[_p(prefix, 'b_nl')], tparams[_p(prefix, 'bx_nl')], tparams[_p(prefix, 'conv_Q')], tparams[_p(prefix, 'conv_Uf')], tparams[_p(prefix, 'conv_b')], tparams[_p(prefix, 'Whg')], tparams[_p(prefix, 'bhg')], tparams[_p(prefix, 'Umg')], tparams[_p(prefix, 'W_m_att')], tparams[_p(prefix, 'U_when_att')], tparams[_p(prefix, 'c_when_att')]] if one_step: rval = _step(*(seqs + [init_state, None, None, alpha_past, None, pctx_, context] + shared_vars)) else: rval, updates = theano.scan(_step, sequences=seqs, outputs_info=[init_state, tensor.alloc(0., n_samples, context.shape[2]), tensor.alloc(0., n_samples, context.shape[0]), tensor.alloc(0., n_samples, context.shape[0]), tensor.alloc(0., n_samples,)], non_sequences=[pctx_, context]+shared_vars, name=_p(prefix, '_layers'), n_steps=nsteps, profile=profile, strict=True) return rval
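# --- Standalone NumPy sketch (one sample, a 1-d filter standing in for the multi-channel conv_Q;
# --- all names here are illustrative) of the coverage-augmented additive attention inside
# --- _step_slice: the accumulated past attention is convolved into a per-position coverage
# --- feature, added to tanh(pctx_ + dot(h1, W_comb_att)) before the energies are normalized,
# --- and alpha_past then grows by the new weights.
import numpy as np

def sketch_coverage_attention(SeqL=7, dimctx=6, K=3, seed=0):
    rng = np.random.RandomState(seed)
    pctx = rng.randn(SeqL, dimctx)                  # projected annotations (pctx_) for one sample
    pstate = rng.randn(dimctx)                      # projected decoder state (dot(h1, W_comb_att))
    alpha_past = rng.rand(SeqL)                     # attention accumulated over earlier steps
    conv_w = 0.1 * rng.randn(K)                     # 1-d stand-in for conv_Q
    U_f = 0.1 * rng.randn(dimctx)                   # stand-in for conv_Uf (coverage -> attention space)
    U_att = rng.randn(dimctx)
    cover = np.convolve(alpha_past, conv_w, mode='same')        # coverage feature per source position
    energy = np.tanh(pctx + pstate[None, :] + cover[:, None] * U_f[None, :]) @ U_att
    alpha = np.exp(energy - energy.max())
    alpha /= alpha.sum()                                        # attention weights over positions
    alpha_past = alpha_past + alpha                             # running coverage, as in the code
    return alpha, alpha_past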