Example #1
class HighwayLayer(HiddenLayer):
    def __init__(self,
                 in_dim,
                 activation,
                 hidden_dim=None,
                 transform_gate="sigmoid",
                 prefix="",
                 initializer=default_initializer,
                 dropout=0,
                 verbose=True):
        # By construction the dimensions of in_dim and out_dim have to match, and hence W_T and W_H are square matrices.
        if hidden_dim is not None:
            assert in_dim == hidden_dim
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        super(HighwayLayer, self).__init__(in_dim, in_dim, activation, prefix,
                                           initializer, dropout, verbose)
        self.transform_gate = Activation(transform_gate)
        self.W_H, self.W_H.name = self.W, prefix + "W_H"
        self.b_H, self.b_H.name = self.b, prefix + "b_H"
        self.W_T = shared_rand_matrix((self.hidden_dim, self.in_dim),
                                      prefix + 'W_T', initializer)
        self.b_T = shared_zero_matrix((self.hidden_dim, ), prefix + 'b_T')
        self.params = [self.W_H, self.W_T, self.b_H, self.b_T]
        self.norm_params = [self.W_H, self.W_T]
        self.l1_norm = T.sum(
            [T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = T.sum([T.sum(param**2) for param in self.norm_params])

        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Transform Gate:   %s' % self.transform_gate.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)

    def forward(self, x):
        """
        :param x: (in, )
        """
        # (in, in) (in, ) + (in, ) -> (in)
        t = self.transform_gate.activate(T.dot(self.W_T, x) + self.b_T)
        # (in, in) (in, ) + (in, ) -> (in)
        z_t = self.act.activate(T.dot(self.W_H, x) + self.b_H)
        # (in, ) * (in, ) + (in, ) * (in, ) -> (in, )
        return t * z_t + (1 - t) * x

    def forward_batch(self, x):
        """
        :param x: (batch, in)
        """
        # (batch, in) (in, in) + (in, ) -> (batch, in)
        t = self.transform_gate.activate(T.dot(x, self.W_T.T) + self.b_T)
        # (batch, in) (in, in) + (in, ) -> (batch, in)
        z_t = self.act.activate(T.dot(x, self.W_H.T) + self.b_H)
        # (batch, in) * (batch, in) + (batch, in) * (batch, in) -> (batch, in)
        return t * z_t + (1 - t) * x
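
A minimal NumPy sketch of the computation forward() performs, added for orientation and not part of the original code: a sigmoid transform gate t interpolates between the transformed input H(x) and the untouched input x. tanh stands in for the configurable activation, and dropout is omitted.

import numpy as np

def highway_forward(x, W_H, b_H, W_T, b_T):
    # Transform gate: t = sigmoid(W_T x + b_T)
    t = 1.0 / (1.0 + np.exp(-(W_T.dot(x) + b_T)))
    # Candidate output: H(x) = tanh(W_H x + b_H)
    z = np.tanh(W_H.dot(x) + b_H)
    # Highway mix: carry the raw input where the gate is closed
    return t * z + (1.0 - t) * x

rng = np.random.RandomState(0)
in_dim = 4
x = rng.randn(in_dim)
W_H, W_T = rng.randn(in_dim, in_dim), rng.randn(in_dim, in_dim)
b_H, b_T = np.zeros(in_dim), np.zeros(in_dim)
print(highway_forward(x, W_H, b_H, W_T, b_T))   # shape (in_dim,)
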
Example #2
class HiddenLayer(object):
    def __init__(self,
                 in_dim,
                 hidden_dim,
                 activation,
                 prefix="",
                 initializer=default_initializer,
                 dropout=0,
                 verbose=True):
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = hidden_dim
        self.act = Activation(activation)
        self.dropout = dropout
        self.W = shared_rand_matrix((self.hidden_dim, self.in_dim),
                                    prefix + 'W', initializer)
        self.b = shared_zero_matrix((self.hidden_dim, ), prefix + 'b')
        self.params = [self.W, self.b]
        self.norm_params = [self.W]
        self.l1_norm = T.sum(
            [T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = T.sum([T.sum(param**2) for param in self.norm_params])

        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)

    def forward(self, x):
        """
        :param x: (dim, )
        """
        output = self.act.activate(T.dot(self.W, x) + self.b)
        return dropout_from_layer(output, self.dropout)

    def forward_batch(self, x):
        """
        :param x: (batch, dim)
        """
        # (batch, in) (in, hidden) + (None, hidden) -> (batch, hidden)
        output = self.act.activate(T.dot(x, self.W.T) + self.b)
        return dropout_from_layer(output, self.dropout)
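
The single-vector and batch paths above differ only in how the (hidden, in) weight matrix is applied. A small NumPy illustration of that convention, not part of the original code (tanh stands in for the configured activation; dropout is omitted):

import numpy as np

rng = np.random.RandomState(0)
in_dim, hidden_dim, batch = 3, 5, 4
W = rng.randn(hidden_dim, in_dim)   # stored as (hidden, in), as in the class
b = np.zeros(hidden_dim)

x = rng.randn(in_dim)               # single sample: (in,)
single = np.tanh(W.dot(x) + b)      # (hidden,)

X = rng.randn(batch, in_dim)        # batch: (batch, in)
batched = np.tanh(X.dot(W.T) + b)   # (batch, hidden)

# Row 0 of the batch result matches the single-sample path
assert np.allclose(batched[0], np.tanh(W.dot(X[0]) + b))
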
Example #3
class BRNN(object):
	'''
	Bidirectional RNN. A first attempt at using a BRNN as a tool
	for sentence modeling, applied first to the task of sentiment analysis.
	'''
	def __init__(self, configs, verbose=True):
		if verbose: pprint('Build Tied weights Bidirectional Recurrent Neural Network')
		self.input = T.matrix(name='input')
		self.truth = T.ivector(name='label')
		self.learn_rate = T.scalar(name='learn rate')
		# Configure Activation function
		self.act = Activation(configs.activation)
		# Build bidirectional RNN with tied weights
		num_input, num_hidden, num_class = configs.num_input, configs.num_hidden, configs.num_class
		# Stack all the variables together into a single vector in order to apply the batch updating algorithm.
		# Since the RNN has two directions, every weight matrix associated with the RNN is duplicated.
		num_params = 2 * (num_input * num_hidden + \
					 num_hidden * num_hidden + \
					 num_hidden) + \
					 2 * num_hidden * num_class + \
					 num_class
		self.num_params = num_params
		self.theta = theano.shared(value=np.zeros(num_params, dtype=floatX), name='theta', borrow=True)
		# Incremental index
		param_idx = 0
		# 1, Feed-forward matrix for forward direction: W_forward
		self.W_forward = self.theta[param_idx: param_idx+num_input*num_hidden].reshape((num_input, num_hidden))
		self.W_forward.name = 'W_forward_RNN'
		W_forward_init = np.asarray(np.random.uniform(low=-np.sqrt(6.0/(num_input+num_hidden)),
									  		  		  high=np.sqrt(6.0/(num_input+num_hidden)),
									  		  		  size=(num_input, num_hidden)), dtype=floatX)
		param_idx += num_input * num_hidden
		# 1, Feed-forward matrix for backward direction: W_backward
		self.W_backward = self.theta[param_idx: param_idx+num_input*num_hidden].reshape((num_input, num_hidden))
		self.W_backward.name = 'W_backward_RNN'
		W_backward_init = np.asarray(np.random.uniform(low=-np.sqrt(6.0/(num_input+num_hidden)),
													   high=np.sqrt(6.0/(num_input+num_hidden)),
													   size=(num_input, num_hidden)), dtype=floatX)
		param_idx += num_input * num_hidden
		# 2, Recurrent matrix for forward direction: U_forward
		self.U_forward = self.theta[param_idx: param_idx+num_hidden*num_hidden].reshape((num_hidden, num_hidden))
		self.U_forward.name = 'U_forward_RNN'
		U_forward_init = np.asarray(np.random.uniform(low=-np.sqrt(6.0/(num_hidden+num_hidden)),
													  high=np.sqrt(6.0/(num_hidden+num_hidden)),
													  size=(num_hidden, num_hidden)), dtype=floatX)
		param_idx += num_hidden * num_hidden
		# 2, Recurrent matrix for backward direction: U_backward
		self.U_backward = self.theta[param_idx: param_idx+num_hidden*num_hidden].reshape((num_hidden, num_hidden))
		self.U_backward.name = 'U_backward_RNN'
		U_backward_init = np.asarray(np.random.uniform(low=-np.sqrt(6.0/(num_hidden+num_hidden)),
													   high=np.sqrt(6.0/(num_hidden+num_hidden)),
													   size=(num_hidden, num_hidden)), dtype=floatX)
		param_idx += num_hidden * num_hidden
		# 3, Bias parameter for the hidden-layer forward direction RNN
		self.b_forward = self.theta[param_idx: param_idx+num_hidden]
		self.b_forward.name = 'b_forward_RNN'
		b_forward_init = np.zeros(num_hidden, dtype=floatX)		
		param_idx += num_hidden
		# 3, Bias parameter for the hidden-layer backward direction RNN
		self.b_backward = self.theta[param_idx: param_idx+num_hidden]
		self.b_backward.name = 'b_backward_RNN'
		b_backward_init = np.zeros(num_hidden, dtype=floatX)
		param_idx += num_hidden
		# Weight matrix for softmax function
		self.W_softmax = self.theta[param_idx: param_idx+2*num_hidden*num_class].reshape((2*num_hidden, num_class))
		self.W_softmax.name = 'W_softmax'
		W_softmax_init = np.asarray(np.random.uniform(low=-np.sqrt(6.0/(2*num_hidden+num_class)), 
													  high=np.sqrt(6.0/(2*num_hidden+num_class)),
													  size=(2*num_hidden, num_class)), dtype=floatX)
		param_idx += 2*num_hidden*num_class
		# Bias vector for softmax function
		self.b_softmax = self.theta[param_idx: param_idx+num_class]
		self.b_softmax.name = 'b_softmax'
		b_softmax_init = np.zeros(num_class, dtype=floatX)
		param_idx += num_class
		# Set all the default parameters into theta
		self.theta.set_value(np.concatenate([x.ravel() for x in 
			(W_forward_init, W_backward_init, U_forward_init, U_backward_init, 
			 b_forward_init, b_backward_init, W_softmax_init, b_softmax_init)]))
		assert param_idx == num_params
		# h[0], zero vector, treated as constants
		self.h_start = theano.shared(value=np.zeros(num_hidden, dtype=floatX), name='h_start', borrow=True)
		self.h_end = theano.shared(value=np.zeros(num_hidden, dtype=floatX), name='h_end', borrow=True)
		# recurrent function used to compress a sequence of input vectors
		# the first dimension should correspond to time
		def forward_step(x_t, h_tm1):
			h_t = self.act.activate(T.dot(x_t, self.W_forward) + \
									T.dot(h_tm1, self.U_forward) + self.b_forward)
			return h_t
		def backward_step(x_t, h_tm1):
			h_t = self.act.activate(T.dot(x_t, self.W_backward) + \
									T.dot(h_tm1, self.U_backward) + self.b_backward)
			return h_t
		# Forward and backward representation over time
		self.forward_h, _ = theano.scan(fn=forward_step, sequences=self.input, outputs_info=[self.h_start],
										truncate_gradient=configs.bptt)
		self.backward_h, _ = theano.scan(fn=backward_step, sequences=self.input, outputs_info=[self.h_end], 
										 truncate_gradient=configs.bptt, go_backwards=True)
		# Store the final value
		# self.h_start_star = self.forward_h[-1]
		# self.h_end_star = self.backward_h[-1]
		self.h_start_star = T.mean(self.forward_h, axis=0)
		self.h_end_star = T.mean(self.backward_h, axis=0)
		# L1, L2 regularization
		self.L1_norm = T.sum(T.abs_(self.W_forward) + T.abs_(self.W_backward) + \
							 T.abs_(self.U_forward) + T.abs_(self.U_backward) + \
							 T.abs_(self.W_softmax))
		self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
					   T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2) + \
					   T.sum(self.W_softmax ** 2)
		# Build function to show the learned representation for different sentences
		self.show_forward = theano.function(inputs=[self.input], outputs=self.h_start_star)
		self.show_backward = theano.function(inputs=[self.input], outputs=self.h_end_star)
		##################################################################################
		# Correlated BRNN
		##################################################################################
		# Concatenate these two vectors into one
		self.h = T.concatenate([self.h_start_star, self.h_end_star], axis=0)
		# Dropout parameter
		srng = T.shared_randomstreams.RandomStreams(configs.random_seed)
		mask = srng.binomial(n=1, p=1-configs.dropout, size=self.h.shape)
		self.h *= T.cast(mask, floatX)
		# Use concatenated vector as input to the Softmax/MLP classifier
		self.output = T.nnet.softmax(T.dot(self.h, self.W_softmax) + self.b_softmax)		
		self.pred = T.argmax(self.output, axis=1)
		# Build cost function
		self.cost = -T.mean(T.log(self.output)[T.arange(self.truth.shape[0]), self.truth])
		if configs.regularization:
			self.cost += configs.lambda1 * self.L2_norm
		# Compute gradient
		self.gradtheta = T.grad(self.cost, self.theta)
		self.gradinput = T.grad(self.cost, self.input)
		# Build objective function
		# Compute the gradients to parameters
		self.compute_cost_and_gradient = theano.function(inputs=[self.input, self.truth], 
												outputs=[self.cost, self.gradtheta])
		# Compute the gradients to inputs
		self.compute_input_gradient = theano.function(inputs=[self.input, self.truth],
												outputs=self.gradinput)
		# Build prediction function
		self.predict = theano.function(inputs=[self.input], outputs=self.pred)
		if verbose:
			pprint('*' * 50)
			pprint('Finished constructing Bidirectional Recurrent Neural Network (BRNN)')
			pprint('Size of input dimension: %d' % configs.num_input)
			pprint('Size of hidden/recurrent dimension: %d' % configs.num_hidden)
			pprint('Size of output dimension: %d' % configs.num_class)
			pprint('Is regularization applied? %s' % ('yes' if configs.regularization else 'no'))
			if configs.regularization:
				pprint('Coefficient of regularization term: %f' % configs.lambda1)
			pprint('BPTT step: %d' % configs.bptt)
			pprint('Number of free parameters in BRNN: %d' % self.num_params)
			pprint('*' * 50)

	# This method is used to implement the batch updating algorithm
	def update_params(self, gradtheta, learn_rate):
		# gradparams is a single long vector which can be used to update self.theta
		# Learning algorithm: simple stochastic gradient descent
		theta = self.theta.get_value(borrow=True)
		self.theta.set_value(theta - learn_rate * gradtheta, borrow=True)

	@staticmethod
	def save(fname, model):
		with file(fname, 'wb') as fout:
			cPickle.dump(model, fout)

	@staticmethod
	def load(fname):
		with file(fname, 'rb') as fin:
			return cPickle.load(fin)
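
A rough NumPy sketch of what the tied-weight BRNN computes at prediction time, illustrative only and not part of the original code (dropout, regularization and training are ignored; tanh stands in for the configured activation): run a recurrence in each direction, average the hidden states over time, concatenate the two averages, and classify with a softmax.

import numpy as np

def rnn_pass(X, W, U, b, reverse=False):
    """Plain tanh recurrence over time; X is (T, num_input)."""
    steps = reversed(range(len(X))) if reverse else range(len(X))
    h = np.zeros(U.shape[0])
    hs = []
    for t in steps:
        h = np.tanh(X[t].dot(W) + h.dot(U) + b)
        hs.append(h)
    return np.stack(hs)                          # (T, num_hidden)

rng = np.random.RandomState(0)
T_len, num_input, num_hidden, num_class = 6, 4, 5, 3
X = rng.randn(T_len, num_input)
Wf, Wb = rng.randn(num_input, num_hidden), rng.randn(num_input, num_hidden)
Uf, Ub = rng.randn(num_hidden, num_hidden), rng.randn(num_hidden, num_hidden)
bf, bb = np.zeros(num_hidden), np.zeros(num_hidden)
W_softmax = rng.randn(2 * num_hidden, num_class)
b_softmax = np.zeros(num_class)

# Mean over time in each direction, then concatenate (as in h_start_star / h_end_star)
h_fwd = rnn_pass(X, Wf, Uf, bf).mean(axis=0)
h_bwd = rnn_pass(X, Wb, Ub, bb, reverse=True).mean(axis=0)
h = np.concatenate([h_fwd, h_bwd])               # (2 * num_hidden,)

logits = h.dot(W_softmax) + b_softmax
probs = np.exp(logits - logits.max())
probs /= probs.sum()
print(probs.argmax())                            # predicted class
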
Example #4
class BRNNEncoder(object):
	'''
	Bidirectional RNN for sequence encoding. 
	'''
	def __init__(self, config, verbose=True):
		if verbose: logger.debug('Building Bidirectional RNN Encoder...')
		self.input = T.matrix(name='BRNNEncoder_input')
		# Configure Activation function
		self.act = Activation(config.activation)
		# Build Bidirectional RNN
		num_input, num_hidden = config.num_input, config.num_hidden
		self.num_params = 2 * (num_input * num_hidden + num_hidden * num_hidden + num_hidden)
		# Initialize model parameters
		np.random.seed(config.random_seed)
		# 1, Feed-forward matrix for forward direction: W_forward
		W_forward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_input, num_hidden))
		W_forward_val = W_forward_val.astype(floatX)
		self.W_forward = theano.shared(value=W_forward_val, name='W_forward', borrow=True)
		# 1, Feed-forward matrix for backward direction: W_backward
		W_backward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_input, num_hidden))
		W_backward_val = W_backward_val.astype(floatX)
		self.W_backward = theano.shared(value=W_backward_val, name='W_backward', borrow=True)
		# 2, Recurrent matrix for forward direction: U_forward
		U_forward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_hidden, num_hidden))
		U_forward_val = U_forward_val.astype(floatX)
		U_forward_val, _, _ = np.linalg.svd(U_forward_val)
		self.U_forward = theano.shared(value=U_forward_val, name='U_forward', borrow=True)
		# 2, Recurrent matrix for backward direction: U_backward
		U_backward_val = np.random.uniform(low=-1.0, high=1.0, size=(num_hidden, num_hidden))
		U_backward_val = U_backward_val.astype(floatX)
		U_backward_val, _, _ = np.linalg.svd(U_backward_val)
		self.U_backward = theano.shared(value=U_backward_val, name='U_backward', borrow=True)
		# 3, Bias parameter for the hidden-layer forward direction RNN
		b_forward_val = np.zeros(num_hidden, dtype=floatX)
		self.b_forward = theano.shared(value=b_forward_val, name='b_forward', borrow=True)
		# 3, Bias parameter for the hidden-layer backward direction RNN
		b_backward_val = np.zeros(num_hidden, dtype=floatX)
		self.b_backward = theano.shared(value=b_backward_val, name='b_backward', borrow=True)
		# h[0], zero vectors, treated as constants
		self.h0_forward = theano.shared(value=np.zeros(num_hidden, dtype=floatX), name='h0_forward', borrow=True)
		self.h0_backward = theano.shared(value=np.zeros(num_hidden, dtype=floatX), name='h0_backward', borrow=True)
		# Stack all the parameters
		self.params = [self.W_forward, self.W_backward, self.U_forward, self.U_backward, 
					   self.b_forward, self.b_backward]
		# Compute the forward and backward representation over time
		self.h_forwards, _ = theano.scan(fn=self._forward_step, 
										 sequences=self.input, 
										 outputs_info=[self.h0_forward],
										 truncate_gradient=config.bptt)
		self.h_backwards, _ = theano.scan(fn=self._backward_step,
										  sequences=self.input,
										  outputs_info=[self.h0_backward],
										  truncate_gradient=config.bptt,
										  go_backwards=True)
		# Average compressing
		self.h_forward = T.mean(self.h_forwards, axis=0)
		self.h_backward = T.mean(self.h_backwards, axis=0)
		# Concatenate
		self.output = T.concatenate([self.h_forward, self.h_backward], axis=0)
		# L1, L2 regularization
		self.L1_norm = T.sum(T.abs_(self.W_forward) + T.abs_(self.W_backward) + 
							 T.abs_(self.U_forward) + T.abs_(self.U_backward))
		self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
					   T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2)
		if verbose:
			logger.debug('Finished constructing the structure of BRNN Encoder: ')
			logger.debug('Size of the input dimension: %d' % num_input)
			logger.debug('Size of the hidden dimension: %d' % num_hidden)
			logger.debug('Activation function: %s' % config.activation)

	def _forward_step(self, x_t, h_tm1):
		h_t = self.act.activate(T.dot(x_t, self.W_forward) + \
								T.dot(h_tm1, self.U_forward) + \
								self.b_forward)
		return h_t

	def _backward_step(self, x_t, h_tm1):
		h_t = self.act.activate(T.dot(x_t, self.W_backward) + \
								T.dot(h_tm1, self.U_backward) + \
								self.b_backward)
		return h_t				

	def encode(self, inputM):
		'''
		@inputM: Theano symbol matrix. Compress the input matrix into output vector.
		'''
		h_forwards, _ = theano.scan(fn=self._forward_step, 
									sequences=inputM,
									outputs_info=[self.h0_forward])
		h_backwards, _ = theano.scan(fn=self._backward_step, 
									 sequences=inputM,
									 outputs_info=[self.h0_backward],
									 go_backwards=True)
		# Averaging
		h_forward = T.mean(h_forwards, axis=0)
		h_backward = T.mean(h_backwards, axis=0)
		# Concatenate
		h = T.concatenate([h_forward, h_backward], axis=0)
		return h
Example #5
class RNN(object):
	'''
	Basic component for Recurrent Neural Network
	'''
	def __init__(self, configs=None, verbose=True):
		'''
		Basic RNN is an unsupervised component, where the input is a sequence and the 
		output is a vector with fixed length
		'''
		if verbose: pprint('Build Recurrent Neural Network...')
		self.input = T.matrix(name='input', dtype=floatX)
		self.learn_rate = T.scalar(name='learn rate')		
		# Configure activation function
		self.act = Activation(configs.activation)
		fan_in = configs.num_input
		fan_out = configs.num_hidden
		# Initialize all the variables in RNN, including:
		# 1, Feed-forward matrix, feed-forward bias, W, W_b
		# 2, Recurrent matrix, recurrent bias, U, U_b
		self.W = theano.shared(value=np.asarray(
					np.random.uniform(low=-np.sqrt(6.0/(fan_in+fan_out)),
									  high=np.sqrt(6.0/(fan_in+fan_out)), 
									  size=(fan_in, fan_out)), dtype=floatX),
					name='W', borrow=True)
		self.U = theano.shared(value=np.asarray(
					np.random.uniform(low=-np.sqrt(6.0/(fan_out+fan_out)),
									  high=np.sqrt(6.0/(fan_out+fan_out)),
									  size=(fan_out, fan_out)), dtype=floatX),
					name='U', borrow=True)
		# Bias parameter for the hidden-layer encoder of RNN
		self.b = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='b', borrow=True)
		# h[0], zero vector
		self.h0 = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='h0', borrow=True)
		# Save all the parameters
		self.params = [self.W, self.U, self.b, self.h0]
		# recurrent function used to compress a sequence of input vectors
		# the first dimension should correspond to time
		def step(x_t, h_tm1):
			h_t = self.act.activate(T.dot(x_t, self.W) + \
									T.dot(h_tm1, self.U) + self.b)
			return h_t
		# h is the hidden representation over a time sequence
		self.hs, _ = theano.scan(fn=step, sequences=self.input, outputs_info=[self.h0],
								truncate_gradient=configs.bptt)
		self.h = self.hs[-1]
		# L1, L2 regularization
		self.L1_norm = T.sum(T.abs_(self.W) + T.abs_(self.U))
		self.L2_norm = T.sum(self.W ** 2) + T.sum(self.U ** 2)
		# Compress function
		self.compress = theano.function(inputs=[self.input], outputs=self.h)

	@staticmethod
	def save(fname, model):
		'''
		Save current RNN model into fname
		@fname: String. Filename to save the model.
		@model: RNN. An instance of RNN class.
		'''
		with file(fname, 'wb') as fout:
			cPickle.dump(model, fout)

	@staticmethod
	def load(fname):
		'''
		Load an RNN model from fname
		@fname: String. Filename to load the model.
		'''
		with file(fname, 'rb') as fin:
			return cPickle.load(fin)
Example #6
class ConvolutionLayer(object):
    def __init__(self, in_dim, hidden_dim, kernel_size=3, padding='same', pooling='max', dilation_rate=1.0,
                 activation='relu', prefix="", initializer=GlorotUniformInitializer(), dropout=0.0, verbose=True):
        """
        Init Function for ConvolutionLayer
        :param in_dim:
        :param hidden_dim:
        :param kernel_size:
        :param padding: 'same', 'valid'
        :param pooling: 'max', 'mean', 'min'
        :param dilation_rate:
        :param activation:
        :param prefix:
        :param initializer:
        :param dropout:
        :param verbose:
        """
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))

        self.in_dim = in_dim
        self.out_dim = hidden_dim
        self.hidden_dim = hidden_dim
        self.kernel_size = kernel_size
        self.padding = padding
        self.dilation_rate = dilation_rate
        self.pooling = pooling
        self.dropout = dropout
        self.act = Activation(activation)
        self.padding_size = int(self.dilation_rate * (self.kernel_size - 1))
        # Composition Function Weight
        # Kernel Matrix (kernel_size, hidden, in)
        self.W = shared_rand_matrix((self.kernel_size, self.hidden_dim, self.in_dim), prefix + 'W', initializer)
        # Bias Term (hidden)
        self.b = shared_zero_matrix((self.hidden_dim,), prefix + 'b')

        self.params = [self.W, self.b]
        self.norm_params = [self.W]

        # L1, L2 Norm
        self.l1_norm = T.sum(T.abs_(self.W))
        self.l2_norm = T.sum(self.W ** 2)

        if verbose:
            logger.debug('Architecture of {} built finished'.format(self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Filter Num  (Hidden): %d' % self.hidden_dim)
            logger.debug('Kernel Size (Windows): %d' % self.kernel_size)
            logger.debug('Padding method :  %s' % self.padding)
            logger.debug('Dilation Rate  :  %s' % self.dilation_rate)
            logger.debug('Padding Size   :  %s' % self.padding_size)
            logger.debug('Pooling method :  %s' % self.pooling)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)

    def forward_conv(self, x):
        """
        #TODO
        :param x: (length, dim)
        :return:  (length - kernel + 2*padding_size + 1, hidden_dim)
        """
        # T.nn.conv2d (batch size, input channels, input rows, input columns)
        # dl4nlp      (batch size, 1,              length,     in_dim)
        x = x.dimshuffle(['x', 'x', 0, 1])
        # T.nn.conv2d (output channels, input channels, filter rows, filter columns)
        # dl4nlp      (hidden_dim,      1,              kernel_size, in_dim)
        filter_w = self.W.dimshuffle([1, 'x', 0, 2])
        # T.nn.conv2d (batch size, output channels, output rows,     output columns)
        # dl4nlp      (batch size, hidden_dim,      length+kernel-1, 1)
        conv_result = T.nnet.conv2d(x, filter_w,
                                    border_mode='valid',)
        # (batch size, hidden_dim, length+kernel-1, 1) -> (length+kernel-1, hidden_dim)
        conv_result = T.transpose(conv_result[0, :, :, 0], (1, 0))
        return conv_result

    def forward_conv_batch(self, x):
        """
        :param x: (batch, length, dim)
        :return:  (batch, length - kernel + 2*padding_size + 1, hidden_dim)
        """
        # T.nn.conv2d (batch size, input channels, input rows, input columns)
        # dl4nlp      (batch size, 1,              length,     in_dim)
        x = x.dimshuffle([0, 'x', 1, 2])
        # T.nn.conv2d (output channels, input channels, filter rows, filter columns)
        # dl4nlp      (hidden_dim,      1,              kernel_size, in_dim)
        filter_w = self.W.dimshuffle([1, 'x', 0, 2])
        # T.nn.conv2d (batch size, output channels, output rows,     output columns)
        # dl4nlp      (batch size, hidden_dim,      length+kernel-1, 1)
        conv_result = T.nnet.conv2d(x, filter_w,
                                    border_mode='valid',)
        # from theano.printing import Print
        # conv_result = Print()(conv_result)
        # (batch size, hidden_dim, length - kernel + 2*padding_size + 1, 1)
        #   -> (batch, length - kernel + 2*padding_size + 1, hidden_dim)
        conv_result = T.transpose(conv_result[:, :, :, 0], (0, 2, 1))
        return conv_result

    def forward(self, x):
        """
        :param x: (length, dim)
        :return: (hidden_dim, )
        """
        if self.padding_size > 0:
            # (padding_size + length + padding_size, dim)
            x = temporal_padding_2d(x, (self.padding_size, self.padding_size))
        # If the kernel size is greater than the sentence length, pad at the end of the sentence
        safe_x = temporal_padding_2d(x, (0, self.kernel_size - x.shape[0]))
        x = ifelse(T.gt(self.kernel_size - x.shape[0], 0),
                   safe_x,
                   x)
        conv_result = self.forward_conv(x)
        pooling_result = get_pooling(conv_result, self.pooling)
        dropout_out = dropout_from_layer(pooling_result, self.dropout)
        return self.act.activate(dropout_out + self.b)

    def forward_batch(self, x, mask):
        """
        :param x: (batch, length, dim)
        :param mask: (batch, length, )
        :return: (batch, length, hidden_dim)
        """
        # conv_after_length = length - kernel + 2 * padding_size + 1
        new_x = x
        if self.padding_size > 0:
            # (padding_size + length + padding_size, dim)
            new_x = temporal_padding_3d(x, (self.padding_size, self.padding_size))
            # (batch, conv_after_length)
            mask = temporal_padding_mask(mask, kernel_size=self.kernel_size, padding_size=self.padding_size)
        elif self.padding_size == 0:
            # (batch, conv_after_length)
            mask = temporal_padding_mask(mask, kernel_size=self.kernel_size, padding_size=0)
        else:
            raise RuntimeError("padding_size must be non-negative (dilation_rate >= 0)")
        # safe_x = temporal_padding_3d(x, (0, self.kernel_size - x.shape[1]))
        # safe_mask = T.ones((x.shape[0], ), dtype=theano.config.floatX).dimshuffle([0, 'x'])
        # !!! convert safe_mask from col to matrix
        # safe_mask = T.unbroadcast(safe_mask, 1)
        # x, mask = ifelse(T.gt(self.kernel_size - x.shape[1], 0),
        #                  (safe_x, safe_mask),
        #                  (new_x, mask))
        # (batch, conv_after_length, hidden_dim)
        conv_result = self.forward_conv_batch(new_x)
        # new_x = Print(new_x)
        # mask = Print()(mask)
        pooling_result = get_pooling_batch(conv_result, mask, self.pooling)
        dropout_out = dropout_from_layer(pooling_result, self.dropout)
        return self.act.activate(dropout_out + self.b)
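
A simplified NumPy sketch of what forward() computes for one sentence, illustrative only and not part of the original code: pad the time axis, slide a kernel_size window over it, apply the (kernel_size, hidden_dim, in_dim) filter bank, pool over the windows, then add the bias and apply the activation (ReLU, the class default) in the same order as above. T.nnet.conv2d flips the kernel, while the sketch uses a plain sliding dot product; for learned weights this is only a fixed re-indexing.

import numpy as np

def conv_forward(x, W, b, padding_size, pooling='max'):
    """x: (length, in_dim); W: (kernel_size, hidden_dim, in_dim); b: (hidden_dim,)."""
    kernel_size, hidden_dim, in_dim = W.shape
    # Pad both ends of the time axis with zeros
    x = np.pad(x, ((padding_size, padding_size), (0, 0)))
    n_windows = x.shape[0] - kernel_size + 1
    conv = np.empty((n_windows, hidden_dim))
    for i in range(n_windows):
        window = x[i:i + kernel_size]                    # (kernel_size, in_dim)
        conv[i] = np.einsum('ki,khi->h', window, W)      # (hidden_dim,)
    pooled = conv.max(axis=0) if pooling == 'max' else conv.mean(axis=0)
    return np.maximum(pooled + b, 0.0)                   # bias added after pooling, then ReLU

rng = np.random.RandomState(0)
length, in_dim, hidden_dim, kernel_size = 7, 4, 6, 3
x = rng.randn(length, in_dim)
W = rng.randn(kernel_size, hidden_dim, in_dim)
b = np.zeros(hidden_dim)
print(conv_forward(x, W, b, padding_size=kernel_size - 1).shape)  # (hidden_dim,)
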
Example #7
class GrCNNEncoder(object):
    '''
    (Binary) Gated Recursive Convolutional Neural Network Encoder.
    '''
    def __init__(self, config=None, verbose=True):
        '''
        @config: GRCNNConfiger. Configer used to set the architecture of GRCNNEncoder.
        '''
        if verbose:
            logger.debug(
                'Building Gated Recursive Convolutional Neural Network Encoder...'
            )
        # Scale factor for initializing parameters
        self.scale = config.scale
        # Make theano symbolic tensor for input and model parameters
        self.input = T.matrix(name='GrCNN Encoder input', dtype=floatX)
        # Configure activation function
        self.act = Activation(config.activation)
        fan_in, fan_out = config.num_input, config.num_hidden
        # Initialize model parameters
        # Set seed of the random generator
        np.random.seed(config.random_seed)
        # Projection matrix U
        # Initialize all the matrices using orthogonal matrices
        U_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_in, fan_out))
        U_val = U_val.astype(floatX)
        U_val *= self.scale
        self.U = theano.shared(value=U_val, name='U', borrow=True)
        self.hidden0 = T.dot(self.input, self.U)

        # W^l, W^r, parameters used to construct the central hidden representation
        Wl_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out))
        Wl_val = Wl_val.astype(floatX)
        Wl_val, _, _ = np.linalg.svd(Wl_val)
        # Wl_val *= self.scale
        self.Wl = theano.shared(value=Wl_val, name='W_l', borrow=True)

        Wr_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out))
        Wr_val = Wr_val.astype(floatX)
        Wr_val, _, _ = np.linalg.svd(Wr_val)
        # Wr_val *= self.scale
        self.Wr = theano.shared(value=Wr_val, name='W_r', borrow=True)

        self.Wb = theano.shared(value=np.zeros(fan_out, dtype=floatX),
                                name='Wb',
                                borrow=True)

        # G^l, G^r, parameters used to construct the three-way coefficients
        Gl_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, 3))
        Gl_val = Gl_val.astype(floatX)
        self.Gl = theano.shared(value=Gl_val, name='G_l', borrow=True)

        Gr_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, 3))
        Gr_val = Gr_val.astype(floatX)
        self.Gr = theano.shared(value=Gr_val, name='G_r', borrow=True)

        self.Gb = theano.shared(value=np.zeros(3, dtype=floatX),
                                name='Gb',
                                borrow=True)
        # Save all the parameters into one batch
        self.params = [
            self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb
        ]
        # Compute the total number of parameters
        self.num_params = reduce(lambda x, y: x + np.prod(y.get_value().shape),
                                 self.params, 0)
        # Length of the time sequence
        self.nsteps = self.input.shape[0]
        self.pyramids, _ = theano.scan(fn=self._step_prop,
                                       sequences=T.arange(self.nsteps - 1),
                                       non_sequences=self.nsteps,
                                       outputs_info=[self.hidden0],
                                       n_steps=self.nsteps - 1)
        self.output = self.pyramids[-1][0].dimshuffle('x', 0)
        # Compression -- Encoding function
        self.compress = theano.function(inputs=[self.input],
                                        outputs=self.output)
        if verbose:
            logger.debug(
                'Finished constructing the structure of grCNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % fan_in)
            logger.debug('Size of the hidden dimension: %d' % fan_out)
            logger.debug('Activation function: %s' % config.activation)

    def _step_prop(self, iter, current_level, nsteps):
        '''
        @current_level: Input matrix at current level. The first dimension corresponds to 
        the timestamp while the second dimension corresponds to the dimension of hidden representation
        '''
        # Build shifted matrix, due to the constraints of Theano.scan, we have to keep the shape of the
        # input and output matrix
        left_current_level = current_level[:nsteps - iter - 1]
        right_current_level = current_level[1:nsteps - iter]
        # Compute temporary central hidden representation, of size Txd, but we only care about the first
        # T-1 rows, i.e., we only focus on the (T-1)xd sub-matrix.
        central_current_level = self.act.activate(
            T.dot(left_current_level, self.Wl) +
            T.dot(right_current_level, self.Wr) + self.Wb)
        # Compute gating function, of size Tx3. Again, due to the internal limitation of Theano.scan, we cannot
        # reduce the size of the matrix and have to keep the same size, but actually we only want the first (T-1)x3
        # sub-matrix.
        current_gates = T.nnet.softmax(
            T.dot(left_current_level, self.Gl) +
            T.dot(right_current_level, self.Gr) + self.Gb)
        left_gate, central_gate, right_gate = current_gates[:, 0], current_gates[:, 1], current_gates[:, 2]
        # Reshape for broadcasting
        left_gate = left_gate.dimshuffle(0, 'x')
        central_gate = central_gate.dimshuffle(0, 'x')
        right_gate = right_gate.dimshuffle(0, 'x')
        # Build next level of hidden representation using soft combination,
        # matrix of size (T-1)xd
        next_level = left_gate * left_current_level + \
                     right_gate * right_current_level + \
                     central_gate * central_current_level
        return T.set_subtensor(current_level[:nsteps - iter - 1], next_level)

    def _step_prop_reduce(self, current_level):
        '''
        @current_level: Input matrix at current level. The first dimension corresponds to 
        the timestamp while the second dimension corresponds to the dimension of hidden representation

        Reduced version of level propagation, much more memory and time efficient implementation, but cannot
        be used inside theano.scan because theano.scan requires that the input and output through timestamps should
        have the same shape.
        '''
        # Build shifted matrix, due to the constraints of Theano.scan, we have to keep the shape of the
        # input and output matrix
        right_current_level = current_level[1:]
        left_current_level = current_level[:-1]
        # Compute temporary central hidden representation, of size Txd, but we only care about the first
        # T-1 rows, i.e., we only focus on the (T-1)xd sub-matrix.
        central_current_level = self.act.activate(
            T.dot(left_current_level, self.Wl) +
            T.dot(right_current_level, self.Wr) + self.Wb)
        # Compute gating function, of size Tx3. Again, due to the internal limitation of Theano.scan, we cannot
        # reduce the size of the matrix and have to keep the same size, but actually we only want the first (T-1)x3
        # sub-matrix.
        current_gates = T.nnet.softmax(
            T.dot(left_current_level, self.Gl) +
            T.dot(right_current_level, self.Gr) + self.Gb)
        left_gate, central_gate, right_gate = current_gates[:, 0], current_gates[:, 1], current_gates[:, 2]
        # Reshape for broadcasting
        left_gate = left_gate.dimshuffle(0, 'x')
        central_gate = central_gate.dimshuffle(0, 'x')
        right_gate = right_gate.dimshuffle(0, 'x')
        # Build next level of hidden representation using soft combination,
        # matrix of size (T-1)xd
        next_level = left_gate * left_current_level + \
                     right_gate * right_current_level + \
                     central_gate * central_current_level
        return next_level

    def encode(self, inputM):
        '''
        @input: Theano symbol matrix. Compress the input matrix into output vector.
        '''
        hidden = T.dot(inputM, self.U)
        # Length of the time sequence
        nsteps = inputM.shape[0]
        pyramids, _ = theano.scan(fn=self._step_prop,
                                  sequences=T.arange(nsteps - 1),
                                  non_sequences=nsteps,
                                  outputs_info=[hidden],
                                  n_steps=nsteps - 1)
        output = pyramids[-1][0].dimshuffle('x', 0)
        return output

    def L2_loss(self):
        '''
        Return L2 norm of the model parameters.
        '''
        return T.sum(self.U ** 2) + T.sum(self.Wl ** 2) + T.sum(self.Wr ** 2) + \
               T.sum(self.Gl ** 2) + T.sum(self.Gr ** 2)
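
The pyramid construction is easiest to read in its reduced form (_step_prop_reduce): each level merges adjacent pairs with a three-way softmax gate until a single vector remains. A standalone NumPy sketch of that reduction, not part of the original code (tanh stands in for the configured activation):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def grcnn_step(level, Wl, Wr, Wb, Gl, Gr, Gb):
    """One pyramid level: (T, d) -> (T - 1, d)."""
    left, right = level[:-1], level[1:]
    central = np.tanh(left.dot(Wl) + right.dot(Wr) + Wb)     # candidate merge of each pair
    gates = softmax(left.dot(Gl) + right.dot(Gr) + Gb)       # (T - 1, 3) gating weights
    gl, gc, gr = gates[:, 0:1], gates[:, 1:2], gates[:, 2:3]
    return gl * left + gr * right + gc * central

def grcnn_encode(X, U, Wl, Wr, Wb, Gl, Gr, Gb):
    level = X.dot(U)                                         # project input to hidden space
    while level.shape[0] > 1:                                # shrink by one row per level
        level = grcnn_step(level, Wl, Wr, Wb, Gl, Gr, Gb)
    return level[0]                                          # top of the pyramid, (d,)

rng = np.random.RandomState(0)
T_len, fan_in, fan_out = 5, 4, 6
X = rng.randn(T_len, fan_in)
U = rng.randn(fan_in, fan_out)
Wl, Wr = rng.randn(fan_out, fan_out), rng.randn(fan_out, fan_out)
Gl, Gr = rng.randn(fan_out, 3), rng.randn(fan_out, 3)
Wb, Gb = np.zeros(fan_out), np.zeros(3)
print(grcnn_encode(X, U, Wl, Wr, Wb, Gl, Gr, Gb).shape)      # (fan_out,)
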
Example #8
class ExtGrCNNEncoder(object):
    '''
    An extension of the canonical GrCNN, with more than 1 gate at each local binary window.
    '''
    def __init__(self, config, verbose=True):
        '''
        @config: GrCNNConfiger. Configer used to set the architecture of ExtGrCNNEncoder.
        '''
        if verbose: logger.debug('Building Extended Gated Recursive Convolutional Neural Network Encoder...')
        # Scale factor for initializing model parameters
        self.scale = config.scale
        # Make theano symbolic tensor for input and model parameters
        self.input = T.matrix(name='ExtGrCNNEncoder input', dtype=floatX)
        # Configure activation function
        self.act = Activation(config.activation)
        fan_in, fan_out = config.num_input, config.num_hidden
        # Initialize model parameter
        np.random.seed(config.random_seed)
        # Projection matrix U
        U_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_in, fan_out))
        U_val = U_val.astype(floatX)
        U_val *= self.scale
        self.U = theano.shared(value=U_val, name='U', borrow=True)
        self.hidden0 = T.dot(self.input, self.U)
        # 3rd-tensor to implement the multi-gate GrCNN Encoders, where the first dimension corresponds
        # to the number of gates
        Wl_vals = [np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out)).astype(floatX) for _ in xrange(config.num_gates)]
        Wl_vals = [np.linalg.svd(Wl_val)[0] for Wl_val in Wl_vals]
        Wl_vals = np.asarray(Wl_vals)
        self.Wl = theano.shared(value=Wl_vals, name='W_l', borrow=True)

        Wr_vals = [np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out)).astype(floatX) for _ in xrange(config.num_gates)]
        Wr_vals = [np.linalg.svd(Wr_val)[0] for Wr_val in Wr_vals]
        Wr_vals = np.asarray(Wr_vals)
        self.Wr = theano.shared(value=Wr_vals, name='W_r', borrow=True)

        self.Wb = theano.shared(value=np.zeros((config.num_gates, fan_out), dtype=floatX), name='W_b', borrow=True)
        # Multi-gate choosing functions
        Gl_vals = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, config.num_gates+2)).astype(floatX)
        self.Gl = theano.shared(value=Gl_vals, name='G_l', borrow=True)

        Gr_vals = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, config.num_gates+2)).astype(floatX)
        self.Gr = theano.shared(value=Gr_vals, name='G_r', borrow=True)

        self.Gb = theano.shared(value=np.zeros(config.num_gates+2, dtype=floatX), name='G_b', borrow=True)
        # Stack all the model parameters
        self.params = [self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb]
        self.num_params = fan_in * fan_out + 2 * config.num_gates * fan_out * fan_out + config.num_gates * fan_out + \
                          2 * (config.num_gates+2) * fan_out + config.num_gates + 2
        # Length of the time sequence
        self.nsteps = self.input.shape[0]
        # Building ExtGrCNNEncoder pyramids
        self.pyramids, _ = theano.scan(fn=self._step_prop, 
                                    sequences=T.arange(self.nsteps-1),
                                    non_sequences=self.nsteps,
                                    outputs_info=[self.hidden0],
                                    n_steps=self.nsteps-1)
        self.output = self.pyramids[-1][0].dimshuffle('x', 0)
        # Compression -- Encoding function
        self.compress = theano.function(inputs=[self.input], outputs=self.output)
        if verbose:
            logger.debug('Finished constructing the structure of ExtGrCNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % fan_in)
            logger.debug('Size of the hidden dimension: %d' % fan_out)
            logger.debug('Number of gating functions: %d' % config.num_gates)
            logger.debug('Number of parameters in ExtGrCNN: %d' % self.num_params)
            logger.debug('Activation function: %s' % config.activation)

    def _step_prop(self, iter, current_level, nsteps):
        '''
        @current_level: Input matrix at current level. The first dimension corresponds to the time dimension 
        while the second dimension corresponds to the dimension of hidden representation
        '''
        # Building shifted matrix, due to the constraints of Theano.scan, we have to keep the shape of the 
        # input and output matrix, of size Txd
        left_current_level = current_level[:nsteps-iter-1]
        right_current_level = current_level[1:nsteps-iter]
        # Compute the temporary central multi-representation, of size TxKxd, where T is the dimension of 
        # time, K is the dimension of number of gates and d is the dimension of hidden representation
        multi_centrals = self.act.activate(T.dot(left_current_level, self.Wl) + 
                                           T.dot(right_current_level, self.Wr) + 
                                           self.Wb)
        # Compute the gating function, of size Tx(K+2)
        multi_gates = T.nnet.softmax(T.dot(left_current_level, self.Gl) + 
                                     T.dot(right_current_level, self.Gr) + 
                                     self.Gb)
        # Softmax-Gating combination
        multi_gates = multi_gates.dimshuffle(0, 1, 'x')
        next_level = multi_gates[:, 1:-1, :] * multi_centrals
        next_level = T.sum(next_level, axis=1)
        next_level += multi_gates[:, 0] * left_current_level + multi_gates[:, -1] * right_current_level
        return T.set_subtensor(current_level[:nsteps-iter-1], next_level)
 
    def encode(self, inputM):
        '''
        @input: Theano symbolic matrix. Compress the input matrix into output vector. The first dimension
                of inputM should correspond to the time dimension.
        '''
        hidden = T.dot(inputM, self.U)
        nsteps = inputM.shape[0]
        pyramids, _ = theano.scan(fn=self._step_prop, 
                                sequences=T.arange(nsteps-1),
                                non_sequences=nsteps, 
                                outputs_info=[hidden],
                                n_steps=nsteps-1)
        output = pyramids[-1][0].dimshuffle('x', 0)
        return output
Example #9
class TransEModel(EntityScorer):
    def __init__(self,
                 entity_dim,
                 relation_num,
                 activation='iden',
                 initializer=default_initializer,
                 prefix='',
                 verbose=True):
        super(TransEModel, self).__init__()
        self.entity_dim = entity_dim
        self.relation_num = relation_num
        # (relation_num, entity_dim)
        self.W = shared_rand_matrix((relation_num, self.entity_dim),
                                    prefix + 'TransE_R', initializer)
        self.act = Activation(activation)
        self.params = [self.W]
        self.norm_params = [self.W]
        self.l1_norm = T.sum(T.abs_(self.W))
        self.l2_norm = T.sum(self.W**2)

        if verbose:
            logger.debug(
                'Architecture of TransE Model built finished, summarized as below:'
            )
            logger.debug('Entity Dimension: %d' % self.entity_dim)
            logger.debug('Relation Number:  %d' % self.relation_num)
            logger.debug('Initializer:      %s' % initializer)
            logger.debug('Activation:       %s' % activation)

    def score(self, e1, e2, r_index):
        """
        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param r_index: scalar
        :return: scalar
        """
        # (entity_dim, ) + (entity_dim, ) - (entity_dim, ) -> (entity_dim, )
        hidden = e1 + self.W[r_index] - e2
        # (entity_dim, ) -> scalar
        d = T.sum(hidden**2)
        return self.act.activate(d)

    def score_batch(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: (batch, )
        :return: (batch, )
        """
        # (batch, entity_dim, ) + (batch, entity_dim, ) - (batch, entity_dim, ) -> (batch, entity_dim, )
        hidden = e1 + self.W[r_index] - e2
        d = T.sum(hidden**2, axis=1)
        return self.act.activate(d)

    def score_one_relation(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: scalar
        :return: (batch, )
        """
        # (batch, entity_dim, ) + (batch, entity_dim, ) - (batch, entity_dim, ) -> (batch, entity_dim, )
        hidden = e1 + self.W[r_index][None, :] - e2
        d = T.sum(hidden**2, axis=1)
        return self.act.activate(d)
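
The scoring rule is the TransE distance: each relation is a translation vector, and the score is the squared L2 norm of e1 + r - e2 (passed through the activation, which defaults to the identity). A standalone NumPy sketch, not part of the original code:

import numpy as np

def transe_score(e1, e2, r):
    """Squared L2 distance of the translation e1 + r from e2 (lower = more plausible)."""
    diff = e1 + r - e2
    return np.sum(diff ** 2, axis=-1)

rng = np.random.RandomState(0)
entity_dim, relation_num, batch = 5, 3, 4
R = rng.randn(relation_num, entity_dim)          # one translation vector per relation, like self.W
e1 = rng.randn(batch, entity_dim)
e2 = rng.randn(batch, entity_dim)
r_index = rng.randint(relation_num, size=batch)

print(transe_score(e1, e2, R[r_index]))          # batched: one score per triple
print(transe_score(e1[0], e2[0], R[r_index[0]])) # single triple: a scalar
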
Example #10
class RecurrentNormEncoder(object):
    def __init__(self,
                 in_dim,
                 hidden_dim,
                 pooling,
                 activation,
                 prefix="",
                 initializer=default_initializer,
                 dropout=0,
                 verbose=True):
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        self.in_dim = in_dim
        self.out_dim = hidden_dim
        self.hidden_dim = hidden_dim
        self.pooling = pooling
        self.dropout = dropout
        self.act = Activation(activation)
        # Composition Function Weight
        # Feed-Forward Matrix, hard-coded to (8, 8): _step_batch reshapes the 64-dim states into 8 x 8 blocks
        self.W = shared_rand_matrix((8, 8), prefix + 'W_forward', initializer)
        # Bias Term, hard-coded to (8, 8)
        self.b = shared_zero_matrix((8, 8), prefix + 'b_forward')
        # Recurrent Matrix, hard-coded to (8, 8)
        self.U = shared_rand_matrix((8, 8), prefix + 'U_forward', initializer)

        self.params = [self.W, self.U, self.b]
        self.norm_params = [self.W, self.U]

        # L1, L2 Norm
        self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.l2_norm = T.sum(self.W**2 + self.U**2)

        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Pooling methods:  %s' % self.pooling)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)

    def _step(self, x_t, h_t_1, w, u, b):
        """
        step function of forward
        :param x_t:   (in, )
        :param h_t_1: (hidden, )
        :param w:     (hidden, in)
        :param u:     (hidden, hidden)
        :param b:     (hidden, )
        :return:      (hidden)
        """
        # (hidden, in) (in, ) + (hidden, hidden) (hidden, ) + (hidden, ) -> hidden
        h_t = self.act.activate(T.dot(w, x_t) + T.dot(u, h_t_1) + b)
        return h_t

    def _step_batch(self, x_t, mask, h_t_1, w, u, b):
        """
        step function of forward in batch version
        :param x_t:   (batch, in)
        :param mask:  (batch, )
        :param h_t_1: (batch, hidden)
        :param w:     (hidden, in)
        :param u:     (hidden, hidden)
        :param b:     (hidden)
        :return:      (batch, hidden)
        """
        # (batch, in) (in, hidden) -> (batch, hidden)
        h_t_1 = T.reshape(h_t_1, (h_t_1.shape[0], 8, 8))
        x_t = T.reshape(x_t, (x_t.shape[0], 8, 8))
        x_t = x_t / x_t.norm(2, axis=1)[:, None, :]
        h_t = self.act.activate(T.dot(x_t, w.T) + T.dot(h_t_1, u.T) + b)
        h_t = h_t / h_t.norm(2, axis=1)[:, None, :]
        h_t_1 = T.reshape(h_t_1, (h_t_1.shape[0], 64))
        h_t = T.reshape(h_t, (h_t.shape[0], 64))
        # (batch, hidden) * (batch, None) + (batch, hidden) * (batch, None) -> (batch, hidden)
        return h_t * mask[:, None] + h_t_1 * (1 - mask[:, None])

    def forward_sequence(self, x):
        h0 = shared_zero_matrix((self.hidden_dim, ), 'h0')
        hs, _ = theano.scan(
            fn=self._step,
            sequences=x,
            outputs_info=[h0],
            non_sequences=[self.W, self.U, self.b],
        )
        return hs

    def forward_sequence_batch(self, x, mask, batch_size):
        """
        :param x: (batch, max_len, dim)
        :param mask:  (batch, max_len)
        :param batch_size:
        """
        h0 = shared_zero_matrix((batch_size, self.hidden_dim), 'h0')
        hs, _ = theano.scan(
            fn=self._step_batch,
            sequences=[
                T.transpose(
                    x, (1, 0,
                        2)),  # (batch, max_len, dim) -> (max_len, batch, dim)
                T.transpose(mask, (1, 0))
            ],  # (batch, max_len) -> (max_len, batch)
            outputs_info=[h0],
            non_sequences=[self.W, self.U, self.b],
        )
        # (max_len, batch, dim) -> (batch, max_len, dim)
        return T.transpose(hs, (1, 0, 2))

    def forward(self, x):
        """
        :param x: (len, dim)
        """
        # Use Pooling to reduce into a fixed-length representation
        return get_pooling(self.forward_sequence(x), self.pooling)

    def forward_batch(self, x, mask, batch_size):
        """
        :param x: (batch, max_len, dim)
        :param mask:  (batch, max_len)
        :param batch_size:
        """
        # Use Pooling to reduce into a fixed-length representation
        # (max_len, batch, dim) -> (batch, max_len, dim) -> (batch, dim)
        hidden = self.forward_sequence_batch(x, mask, batch_size)
        return get_pooling_batch(hidden, mask, self.pooling)
Example #11
class RecurrentEncoder(AbstractRecurrentEncoder):
    def __init__(self,
                 in_dim,
                 hidden_dim,
                 pooling,
                 activation='tanh',
                 prefix="",
                 initializer=default_initializer,
                 dropout=0,
                 verbose=True):
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        super(RecurrentEncoder, self).__init__(in_dim, hidden_dim, pooling,
                                               activation, dropout)

        self.in_dim = in_dim
        self.out_dim = hidden_dim
        self.hidden_dim = hidden_dim
        self.pooling = pooling
        self.dropout = dropout
        self.act = Activation(activation)
        # Composition Function Weight
        # Feed-Forward Matrix (hidden, in)
        self.W = shared_rand_matrix((self.hidden_dim, self.in_dim),
                                    prefix + 'W_forward', initializer)
        # Bias Term (hidden)
        self.b = shared_zero_matrix((self.hidden_dim, ), prefix + 'b_forward')
        # Recurrent Matrix (hidden, hidden)
        self.U = shared_rand_matrix((self.hidden_dim, self.hidden_dim),
                                    prefix + 'U_forward', initializer)

        self.params = [self.W, self.U, self.b]
        self.norm_params = [self.W, self.U]

        # L1, L2 Norm
        self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.l2_norm = T.sum(self.W**2) + T.sum(self.U**2)

        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Pooling methods:  %s' % self.pooling)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)

    def _step(self, x_t, h_t_1, w, u, b):
        """
        step function of forward
        :param x_t:   (in, )
        :param h_t_1: (hidden, )
        :param w:     (hidden, in)
        :param u:     (hidden, hidden)
        :param b:     (hidden, )
        :return:      (hidden)
        """
        # (hidden, in) (in, ) + (hidden, hidden) (hidden, ) + (hidden, ) -> hidden
        h_t = self.act.activate(T.dot(w, x_t) + T.dot(u, h_t_1) + b)
        return h_t

    def _step_batch(self, x_t, mask, h_t_1, w, u, b):
        """
        step function of forward in batch version
        :param x_t:   (batch, in)
        :param mask:  (batch, )
        :param h_t_1: (batch, hidden)
        :param w:     (hidden, in)
        :param u:     (hidden, hidden)
        :param b:     (hidden)
        :return:      (batch, hidden)
        """
        # (batch, in) (in, hidden) -> (batch, hidden)
        h_t = self.act.activate(T.dot(x_t, w.T) + T.dot(h_t_1, u.T) + b)
        # (batch, hidden) * (batch, None) + (batch, hidden) * (batch, None) -> (batch, hidden)
        return h_t * mask[:, None] + h_t_1 * (1 - mask[:, None])

    def forward_scan(self, x):
        h0 = T.zeros((self.hidden_dim, ))
        hs, _ = theano.scan(
            fn=self._step,
            sequences=x,
            outputs_info=[h0],
            non_sequences=[self.W, self.U, self.b],
        )
        return hs

    def forward_scan_batch(self, x, mask):
        """
        :param x: (batch, max_len, dim)
        :param mask:  (batch, max_len)
        """
        h0 = T.zeros((x.shape[0], self.hidden_dim))
        hs, _ = theano.scan(
            fn=self._step_batch,
            sequences=[
                T.transpose(
                    x, (1, 0,
                        2)),  # (batch, max_len, dim) -> (max_len, batch, dim)
                T.transpose(mask, (1, 0))
            ],  # (batch, max_len) -> (max_len, batch)
            outputs_info=[h0],
            non_sequences=[self.W, self.U, self.b],
        )
        # (max_len, batch, dim) -> (batch, max_len, dim)
        return T.transpose(hs, (1, 0, 2))
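
The key detail in _step_batch is the mask: at time steps where the mask is 0 the previous hidden state is carried forward unchanged, so padded positions do not disturb the encoding. A standalone NumPy sketch of that masked recurrence, not part of the original code (tanh stands in for the configured activation):

import numpy as np

def masked_rnn_batch(X, mask, W, U, b):
    """X: (batch, max_len, in); mask: (batch, max_len) with 1 for real tokens, 0 for padding."""
    batch, max_len, _ = X.shape
    hidden_dim = W.shape[0]
    h = np.zeros((batch, hidden_dim))
    hs = []
    for t in range(max_len):
        h_new = np.tanh(X[:, t].dot(W.T) + h.dot(U.T) + b)
        m = mask[:, t][:, None]
        h = m * h_new + (1.0 - m) * h        # padded positions keep the previous state
        hs.append(h)
    return np.stack(hs, axis=1)              # (batch, max_len, hidden_dim)

rng = np.random.RandomState(0)
batch, max_len, in_dim, hidden_dim = 2, 5, 3, 4
X = rng.randn(batch, max_len, in_dim)
mask = np.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]], dtype=float)
W, U = rng.randn(hidden_dim, in_dim), rng.randn(hidden_dim, hidden_dim)
b = np.zeros(hidden_dim)
H = masked_rnn_batch(X, mask, W, U, b)
# After its last real token, a padded sequence's state stays frozen
assert np.allclose(H[0, 2], H[0, 4])
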
Example #12
class GRUEncoder(AbstractRecurrentEncoder):
    def __init__(self,
                 in_dim,
                 hidden_dim,
                 pooling,
                 activation='tanh',
                 gates=("sigmoid", "sigmoid"),
                 prefix="",
                 initializer=default_initializer,
                 dropout=0,
                 verbose=True):
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        super(GRUEncoder, self).__init__(in_dim, hidden_dim, pooling,
                                         activation, dropout)
        self.reset_gate, self.update_gate = Activation(gates[0]), Activation(
            gates[1])

        # W [reset, update, recurrent] (3 * hidden, in)
        self.W = shared_rand_matrix((self.hidden_dim * 3, self.in_dim),
                                    prefix + 'W', initializer)
        # U [reset, update, recurrent] (3 * hidden, hidden)
        self.U = shared_rand_matrix((self.hidden_dim * 3, self.hidden_dim),
                                    prefix + 'U', initializer)
        # b [reset, update, recurrent] (3 * hidden,)
        # self.b = shared_zero_matrix((self.hidden_dim * 3,), prefix + 'b')

        self.params = [self.W, self.U]  # , self.b]
        self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.l2_norm = T.sum(self.W**2) + T.sum(self.U**2)

        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Pooling methods:  %s' % self.pooling)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Reset Gate:       %s' % self.reset_gate.method)
            logger.debug('Update Gate:      %s' % self.update_gate.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)

    def _step(self, x_t, h_t_1, w, u):
        # (hidden * 2, in)
        reset_update_w = w[:self.hidden_dim * 2, :]
        # (hidden * 2, hidden)
        reset_update_u = u[:self.hidden_dim * 2, :]
        # (hidden, in)
        recurrent_w = w[self.hidden_dim * 2:, :]
        # (hidden, hidden)
        recurrent_u = u[self.hidden_dim * 2:, :]
        # (in,) dot (in, hidden * 2) + (hidden,) dot (hidden, hidden * 2) -> (hidden * 2)
        pre_calc = T.dot(x_t, reset_update_w.T) + T.dot(
            h_t_1, reset_update_u.T)
        # (hidden * 2) -> (hidden) (hidden)
        reset_t = self.reset_gate.activate(
            ndarray_slice(pre_calc, 0, self.hidden_dim))
        update_t = self.update_gate.activate(
            ndarray_slice(pre_calc, 1, self.hidden_dim))
        # (in,) dot (in, hidden) + [(hidden,) * (hidden,)] dot (hidden, hidden) -> (hidden, )
        g_t = self.act.activate(
            T.dot(x_t, recurrent_w.T) + T.dot(h_t_1 * reset_t, recurrent_u.T))
        # (hidden,) * (hidden,) + (hidden,) * (hidden,) -> (hidden,)
        h_t = update_t * h_t_1 + (1 - update_t) * g_t
        return h_t

    def _step_batch(self, x_t, m_t, h_t_1, w, u):
        # (hidden * 2, in)
        reset_update_w = w[:self.hidden_dim * 2, :]
        # (hidden * 2, hidden)
        reset_update_u = u[:self.hidden_dim * 2, :]
        # (hidden, in)
        recurrent_w = w[self.hidden_dim * 2:, :]
        # (hidden, hidden)
        recurrent_u = u[self.hidden_dim * 2:, :]
        # (batch, in) dot (in, hidden * 2) + (batch, hidden) dot (hidden, hidden * 2) -> (batch, hidden * 2)
        pre_calc = T.dot(x_t, reset_update_w.T) + T.dot(
            h_t_1, reset_update_u.T)
        # (batch, hidden * 2) -> (batch, hidden) (batch, hidden)
        reset_t = self.reset_gate.activate(
            ndarray_slice(pre_calc, 0, self.hidden_dim))
        update_t = self.update_gate.activate(
            ndarray_slice(pre_calc, 1, self.hidden_dim))
        # (batch, in) dot (in, hidden) + [(batch, hidden) * (batch, hidden)] dot (hidden, hidden) -> (batch, hidden)
        g_t = self.act.activate(
            T.dot(x_t, recurrent_w.T) + T.dot(h_t_1 * reset_t, recurrent_u.T))
        # (batch, hidden) * (batch, hidden) + (batch, hidden) * (batch, hidden) -> (batch, hidden)
        h_t = update_t * h_t_1 + (1 - update_t) * g_t
        # (batch, 1) * (batch, hidden) + (batch, 1) * (batch, hidden) -> (batch, hidden)
        h_t = m_t[:, None] * h_t + (1. - m_t)[:, None] * h_t_1
        # (batch, hidden)
        return h_t

    def forward_scan(self, x):
        h0 = shared_zero_matrix((self.hidden_dim, ), 'h0_forward')
        hs, _ = theano.scan(
            fn=self._step,
            sequences=x,
            outputs_info=[h0],
            non_sequences=[self.W, self.U],
        )
        return hs

    def forward_scan_batch(self, x, mask):
        h0 = T.zeros((x.shape[0], self.hidden_dim))
        hs, _ = theano.scan(
            fn=self._step_batch,
            sequences=[T.transpose(x, (1, 0, 2)),
                       T.transpose(mask, (1, 0))],
            outputs_info=[
                h0,
            ],
            non_sequences=[self.W, self.U],
        )
        return T.transpose(hs, (1, 0, 2))
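
# Illustration only (not part of the original snippet): a minimal NumPy sketch
# of one GRU batch step with the same weight layout as above, W and U stacked
# as [reset, update, candidate] along the first axis. Sigmoid gates and a tanh
# candidate activation are assumed; the helper names are hypothetical.
import numpy as np

def np_sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def np_gru_step_batch(x_t, m_t, h_prev, w, u, hidden_dim):
    # x_t: (batch, in), m_t: (batch, ), h_prev: (batch, hidden)
    # w: (3 * hidden, in), u: (3 * hidden, hidden)
    pre = x_t.dot(w[:2 * hidden_dim].T) + h_prev.dot(u[:2 * hidden_dim].T)
    r_t = np_sigmoid(pre[:, :hidden_dim])                # reset gate
    z_t = np_sigmoid(pre[:, hidden_dim:2 * hidden_dim])  # update gate
    g_t = np.tanh(x_t.dot(w[2 * hidden_dim:].T) +
                  (h_prev * r_t).dot(u[2 * hidden_dim:].T))
    h_t = z_t * h_prev + (1 - z_t) * g_t  # same interpolation convention as _step_batch
    return m_t[:, None] * h_t + (1. - m_t)[:, None] * h_prev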
Пример #13
0
class LSTMEncoder(AbstractRecurrentEncoder):
    def __init__(self,
                 in_dim,
                 hidden_dim,
                 pooling,
                 activation='tanh',
                 gates=("sigmoid", "sigmoid", "sigmoid"),
                 prefix="",
                 initializer=OrthogonalInitializer(),
                 dropout=0,
                 verbose=True):
        if verbose:
            logger.debug('Building {}...'.format(self.__class__.__name__))
        super(LSTMEncoder, self).__init__(in_dim, hidden_dim, pooling,
                                          activation, dropout)
        self.in_gate, self.forget_gate, self.out_gate = Activation(
            gates[0]), Activation(gates[1]), Activation(gates[2])

        # W [in, forget, output, recurrent] (4 * hidden, in)
        self.W = shared_rand_matrix((self.hidden_dim * 4, self.in_dim),
                                    prefix + 'W', initializer)
        # U [in, forget, output, recurrent] (4 * hidden, hidden)
        self.U = shared_rand_matrix((self.hidden_dim * 4, self.hidden_dim),
                                    prefix + 'U', initializer)
        # b [in, forget, output, recurrent] (4 * hidden,)
        self.b = shared_zero_matrix((self.hidden_dim * 4, ), prefix + 'b')

        self.params = [self.W, self.U, self.b]
        self.l1_norm = T.sum(T.abs_(self.W)) + T.sum(T.abs_(self.U))
        self.l2_norm = T.sum(self.W**2) + T.sum(self.U**2)

        if verbose:
            logger.debug('Architecture of {} built finished'.format(
                self.__class__.__name__))
            logger.debug('Input dimension:  %d' % self.in_dim)
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Pooling methods:  %s' % self.pooling)
            logger.debug('Activation Func:  %s' % self.act.method)
            logger.debug('Input Gate:       %s' % self.in_gate.method)
            logger.debug('Forget Gate:      %s' % self.forget_gate.method)
            logger.debug('Output Gate:      %s' % self.out_gate.method)
            logger.debug('Dropout Rate:     %f' % self.dropout)

    def _step(self, x_t, h_t_1, c_t_1, w, u, b):
        pre_calc = T.dot(w, x_t) + T.dot(u, h_t_1) + b
        i_t = self.in_gate.activate(ndarray_slice(pre_calc, 0,
                                                  self.hidden_dim))
        f_t = self.forget_gate.activate(
            ndarray_slice(pre_calc, 1, self.hidden_dim))
        o_t = self.out_gate.activate(
            ndarray_slice(pre_calc, 2, self.hidden_dim))
        g_t = self.act.activate(ndarray_slice(pre_calc, 3, self.hidden_dim))
        c_t = f_t * c_t_1 + i_t * g_t
        h_t = o_t * self.act.activate(c_t)
        return h_t, c_t

    def _step_batch(self, x_t, m_t, h_t_1, c_t_1, w, u, b):
        # (batch, in) (in, hidden * 4) + (batch, hidden) (hidden, hidden * 4) + (hidden * 4, )
        #   -> (batch, hidden * 4)
        pre_calc = T.dot(x_t, w.T) + T.dot(h_t_1, u.T) + b
        # (batch, hidden * 4) -> (batch, hidden) (batch, hidden) (batch, hidden) (batch, hidden)
        i_t = self.in_gate.activate(ndarray_slice(pre_calc, 0,
                                                  self.hidden_dim))
        f_t = self.forget_gate.activate(
            ndarray_slice(pre_calc, 1, self.hidden_dim))
        o_t = self.out_gate.activate(
            ndarray_slice(pre_calc, 2, self.hidden_dim))
        g_t = self.act.activate(ndarray_slice(pre_calc, 3, self.hidden_dim))
        # (batch, hidden) * (batch, hidden) + (batch, hidden) * (batch, hidden)
        # -> (batch, hidden)
        c_t = f_t * c_t_1 + i_t * g_t
        # (batch, hidden) * (batch, hidden) -> (batch, hidden)
        h_t = o_t * self.act.activate(c_t)
        c_t = m_t[:, None] * c_t + (1. - m_t)[:, None] * c_t_1
        h_t = m_t[:, None] * h_t + (1. - m_t)[:, None] * h_t_1
        # (batch, hidden) (batch, hidden)
        return h_t, c_t

    def forward_scan(self, x):
        h0 = shared_zero_matrix((self.hidden_dim, ), 'h0_forward')
        c0 = shared_zero_matrix((self.hidden_dim, ), 'c0_forward')
        hs, _ = theano.scan(
            fn=self._step,
            sequences=x,
            outputs_info=[h0, c0],
            non_sequences=[self.W, self.U, self.b],
        )
        return hs[0]

    def forward_scan_batch(self, x, mask):
        h0 = T.zeros((x.shape[0], self.hidden_dim))
        c0 = T.zeros((x.shape[0], self.hidden_dim))
        hs, _ = theano.scan(
            fn=self._step_batch,
            sequences=[T.transpose(x, (1, 0, 2)),
                       T.transpose(mask, (1, 0))],
            outputs_info=[h0, c0],
            non_sequences=[self.W, self.U, self.b],
        )
        return T.transpose(hs[0], (1, 0, 2))
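
# Illustration only (not part of the original snippet): a minimal NumPy sketch
# of one LSTM batch step with the [input, forget, output, candidate] weight
# layout used above. Sigmoid gates and tanh activations are assumed; the
# helper names are hypothetical.
import numpy as np

def np_sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def np_lstm_step_batch(x_t, m_t, h_prev, c_prev, w, u, b, hidden_dim):
    # x_t: (batch, in), m_t: (batch, ), h_prev/c_prev: (batch, hidden)
    # w: (4 * hidden, in), u: (4 * hidden, hidden), b: (4 * hidden, )
    pre = x_t.dot(w.T) + h_prev.dot(u.T) + b
    i_t = np_sigmoid(pre[:, 0 * hidden_dim:1 * hidden_dim])  # input gate
    f_t = np_sigmoid(pre[:, 1 * hidden_dim:2 * hidden_dim])  # forget gate
    o_t = np_sigmoid(pre[:, 2 * hidden_dim:3 * hidden_dim])  # output gate
    g_t = np.tanh(pre[:, 3 * hidden_dim:4 * hidden_dim])     # candidate cell
    c_t = f_t * c_prev + i_t * g_t
    h_t = o_t * np.tanh(c_t)
    c_t = m_t[:, None] * c_t + (1. - m_t)[:, None] * c_prev  # skip padded rows
    h_t = m_t[:, None] * h_t + (1. - m_t)[:, None] * h_prev
    return h_t, c_t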
Пример #14
0
class RecursiveEncoder(object):
    def __init__(self,
                 in_dim,
                 hidden_dim,
                 initializer=default_initializer,
                 normalize=True,
                 dropout=0,
                 reconstructe=True,
                 activation="tanh",
                 verbose=True):
        """
        :param in_dim:          输入维度
        :param hidden_dim:      隐层维度
        :param initializer:     随机初始化器
        :param normalize:       是否归一化
        :param dropout:         dropout率
        :param activation:      激活函数
        :param verbose:         是否输出Debug日志内容
        :return:
        """
        self.in_dim = in_dim
        self.out_dim = hidden_dim
        self.hidden_dim = hidden_dim
        assert self.in_dim == self.hidden_dim

        self.initializer = initializer
        self.normalize = normalize
        self.dropout = dropout
        self.verbose = verbose
        self.act = Activation(activation)
        # Composition Function Weight
        # (dim, 2 * dim)
        self.W = shared_rand_matrix((self.hidden_dim, 2 * self.in_dim),
                                    'W',
                                    initializer=initializer)
        # (dim, )
        self.b = shared_zero_matrix((self.hidden_dim, ), 'b')
        # Reconstruction Function Weight
        # (2 * dim, dim)
        self.Wr = shared_rand_matrix((2 * self.in_dim, self.hidden_dim),
                                     'Wr',
                                     initializer=initializer)
        # (2 * dim, )
        self.br = shared_zero_matrix((self.in_dim * 2, ), 'br')
        self.params = [self.W, self.b, self.Wr, self.br]
        self.norm_params = [self.W, self.Wr]

        self.l1_norm = sum(
            [T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = sum([T.sum(param**2) for param in self.norm_params])

        if verbose:
            logger.debug(
                'Architecture of RAE built finished, summarized as below: ')
            logger.debug('Hidden dimension: %d' % self.hidden_dim)
            logger.debug('Normalize:        %s' % self.normalize)
            logger.debug('Activation:       %s' % self.act.method)
            logger.debug('Dropout Rate:     %s' % self.dropout)

    def compose(self, left_v, right_v):
        v = T.concatenate([left_v, right_v])
        z = self.act.activate(self.b + T.dot(self.W, v))
        if self.normalize:
            z = z / z.norm(2)
        r = self.act.activate(self.br + T.dot(self.Wr, z))
        w_left_r, w_right_r = r[:self.hidden_dim], r[self.hidden_dim:]
        if self.normalize:
            w_left_r = w_left_r / w_left_r.norm(2)
            w_right_r = w_right_r / w_right_r.norm(2)
        loss_rec = T.sum((w_left_r - left_v)**2) + T.sum(
            (w_right_r - right_v)**2)
        return z, loss_rec

    def encode(self, seq, vecs, loss_rec):
        # vecs[seq[0]] and vecs[seq[1]] ==> vecs[seq[2]]
        w_left, w_right = vecs[seq[0]], vecs[seq[1]]
        z, loss_rec = self.compose(w_left, w_right)
        return T.set_subtensor(vecs[seq[2]], z), loss_rec

    def forward(self, x, seq):
        """
        :param x:   (length, dim)
        :param seq: (length - 1, 3)
        :return:
        """
        # (length, dim) -> (2 * length - 1, dim)
        vector = T.concatenate([x, T.zeros_like(x)[:-1, :]], axis=0)
        # vector = theano.printing.Print()(vector)
        # scan length-1 times
        hs, _ = theano.scan(fn=self.encode,
                            sequences=seq,
                            outputs_info=[vector, shared_scalar(0)],
                            name="compose_phrase")
        comp_vec_init = hs[0][-1][-1]
        comp_rec_init = T.sum(hs[1])
        if self.normalize:
            hidden = x[0] / x[0].norm(2)
        else:
            hidden = x[0]
        comp_vec = ifelse(x.shape[0] > 1, comp_vec_init, hidden)
        comp_rec = ifelse(x.shape[0] > 1, comp_rec_init, shared_zero_scalar())
        return comp_vec, comp_rec

    def compose_batch(self, left, right, W, b, Wr, br):
        """
        合成函数代表一个Batch中的其中一个合成过程
        :param left:  (batch, dim)
        :param right: (batch, dim)
        :param W:     (dim, dim)
        :param b:     (dim, )
        :param Wr:    (dim, dim)
        :param br:    (dim,)
        :return:
        """
        v = T.concatenate(
            [left, right],
            axis=1)  # [(batch, dim) (batch, dim)] -> (batch, 2 * dim)
        z = self.act.activate(b + T.dot(
            v, W.T))  # (batch, 2 * dim) dot (dim, 2 * dim)T -> (batch, dim)
        if self.normalize:
            z = z / (z.norm(2, axis=1)[:, None] + epsilon
                     )  # (batch, dim) -> (batch, dim) normalize by row
        r = self.act.activate(br + T.dot(
            z, Wr.T))  # (batch, dim) dot (2 * dim, dim)T -> (batch, 2 * dim)
        # (batch, 2 * dim) -> [(batch, dim) (batch. dim)]
        left_r, right_r = r[:, :self.hidden_dim], r[:, self.hidden_dim:]
        if self.normalize:
            # (batch, dim) -> (batch, dim) normalize by row
            left_r /= (left_r.norm(2, axis=1)[:, None] + epsilon)
            # (batch, dim) -> (batch, dim) normalize by row
            right_r /= (right_r.norm(2, axis=1)[:, None] + epsilon)
        # (batch, )
        loss_rec = T.sum((left_r - left)**2, axis=1) + T.sum(
            (right_r - right)**2, axis=1)
        # (batch, dim) (batch)
        return z, loss_rec

    def encode_batch(self, _seq, _mask, _input, _pre, loss_rec, W, b, Wr, br,
                     range_index):
        """
        batch合成短语表示过程中 单词循环执行的函数
        :param _seq:   (batch, 3)
        :param _mask:  (batch, )
        :param _input: (batch, word * 2 - 1, dim)
        :param _pre:   (batch, dim)
        :param loss_rec: (batch, )
        :param W:      (dim, dim)
        :param b:      (dim, )
        :param Wr:     (dim, dim)
        :param br:     (dim,)
        :return:       (batch, dim)
        """
        left = _seq[:, 0]
        right = _seq[:, 1]
        # (batch, dim)
        # left_vec = _input[T.arange(self.batch), left]
        left_vec = _input[range_index, left]
        # (batch, dim)
        right_vec = _input[range_index, right]
        # (batch, dim) (batch, dim) -> (batch, 2 * dim), (batch, )
        left_right, loss_rec = self.compose_batch(left_vec, right_vec, W, b,
                                                  Wr, br)
        # (batch, 2 * dim)
        # If the mask is already 0, this step lies beyond the original phrase length,
        # so the computation is redundant: keep the previous step's result instead
        left_right = _mask[:, None] * left_right + (1. - _mask[:, None]) * _pre
        # (batch, )
        # If the mask is already 0, the redundant reconstruction loss is zeroed out by the mask
        loss_rec *= _mask
        # (batch, word * 2 - 1, dim), (batch, dim), (batch, )
        return T.set_subtensor(_input[range_index, _seq[:, 2]],
                               left_right), left_right, loss_rec

    def forward_batch(self, x, mask, seqs):
        """
        :param x:    (batch, length, dim)
        :param mask: (batch, length)
        :param seqs: (batch, length - 1, 3)
        :return:
        """
        zeros_rec = T.zeros((x.shape[0], ))
        # (batch, length, dim) -> (batch, 2 * length - 1, dim)
        vector = T.concatenate([x, T.zeros_like(x)[:, :-1, :]], axis=1)
        # scan can only iterate over the first dimension of a tensor, so transpose the inputs
        # (batch, length - 1, 3) -> (length - 1, batch, 3)
        seqs = T.transpose(seqs, axes=(1, 0, 2))
        # (batch, length - 1) -> (length - 1, batch)
        mask = T.transpose(mask, axes=(1, 0))
        range_index = T.arange(x.shape[0])
        result, _ = theano.scan(
            fn=self.encode_batch,  # step function: composes a batch of phrases one merge at a time
            sequences=[seqs, mask[1:]],  # scan over the composition paths and the masks
            # the number of merges is (phrase length - 1), so for a length-1 phrase the mask
            # is 0 from the first iteration; slot 0 of `vector` (the first word) therefore
            # serves as the initial value and is returned directly
            outputs_info=[vector, vector[:, 0, :], zeros_rec],
            non_sequences=[self.W, self.b, self.Wr, self.br, range_index],
            name="compose_scan")
        phrases, pres, loss_recs = result
        # (length - 1, batch, dim) -> (batch, dim)
        # the result returned by the last merge step is the final phrase representation
        phrases = pres[-1]
        sum_loss_recs = T.sum(loss_recs, axis=0)
        # (batch, dim)
        # normalize row-wise
        if self.normalize:
            phrases = phrases / phrases.norm(2, axis=1)[:, None]
        return phrases, sum_loss_recs
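
# Illustration only (not part of the original snippet): a minimal NumPy sketch
# of how `seq` drives the recursive composition above. For a 3-word phrase the
# buffer has 2 * 3 - 1 = 5 slots; each row of seq is (left, right, target),
# e.g. [[0, 1, 3], [3, 2, 4]] first merges words 0 and 1 into slot 3, then
# merges slot 3 with word 2 into slot 4. A tanh composition and L2
# normalization are assumed; the function name is hypothetical.
import numpy as np

def np_rae_forward(x, seq, W, b):
    # x: (length, dim), seq: (length - 1, 3), W: (dim, 2 * dim), b: (dim, )
    vecs = np.concatenate([x, np.zeros_like(x)[:-1]], axis=0)
    for left, right, target in seq:
        z = np.tanh(b + W.dot(np.concatenate([vecs[left], vecs[right]])))
        vecs[target] = z / np.linalg.norm(z)
    return vecs[seq[-1][2]]  # the last target slot holds the phrase vector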
Пример #15
0
class NeuralTensorModel(EntityScorer):
    def __init__(self,
                 entity_dim,
                 relation_num,
                 activation='tanh',
                 hidden=5,
                 keep_normal=False,
                 initializer=default_initializer,
                 prefix='',
                 verbose=True):
        super(NeuralTensorModel, self).__init__()
        self.entity_dim = entity_dim
        self.relation_num = relation_num
        self.hidden = hidden
        self.slice_seq = T.arange(hidden)
        self.keep_normal = keep_normal
        # (relation_num, entity_dim, entity_dim, hidden)
        self.W = shared_rand_matrix(
            (relation_num, self.entity_dim, self.entity_dim, self.hidden),
            prefix + 'NTN_W', initializer)
        # (relation_num, hidden)
        self.U = shared_ones_matrix((relation_num, self.hidden),
                                    name=prefix + 'NTN_U')
        if keep_normal:
            # (relation_num, 2 * entity_dim, hidden)
            self.V = shared_rand_matrix(
                (relation_num, self.entity_dim * 2, self.hidden),
                prefix + 'NTN_V', initializer)
            # (relation_num, hidden)
            self.b = shared_zero_matrix((relation_num, self.hidden),
                                        name=prefix + 'NTN_B')
            self.params = [self.W, self.V, self.U, self.b]
            self.norm_params = [self.W, self.V, self.U, self.b]
        else:
            self.params = [self.W]
            self.norm_params = [self.W]
        self.act = Activation(activation)
        self.l1_norm = T.sum(
            [T.sum(T.abs_(param)) for param in self.norm_params])
        self.l2_norm = T.sum([T.sum(param**2) for param in self.norm_params])

        if verbose:
            logger.debug(
                'Architecture of Tensor Model built finished, summarized as below:'
            )
            logger.debug('Entity Dimension: %d' % self.entity_dim)
            logger.debug('Hidden Dimension: %d' % self.hidden)
            logger.debug('Relation Number:  %d' % self.relation_num)
            logger.debug('Initializer:      %s' % initializer)
            logger.debug('Activation:       %s' % activation)

    @staticmethod
    def step(_slice, e1, e2, w):
        """
        :param _slice: scalar
        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param w : (entity_dim, entity_dim, hidden)
        :return: 
        """
        # (entity_dim, ) dot (entity_dim, entity_dim) dot (entity_dim, ) -> scalar
        return T.dot(e1, T.dot(w[:, :, _slice], e2))

    @staticmethod
    def step_relation(_slice, e1, e2, w):
        """
        :param _slice: scalar
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param w : (entity_dim, entity_dim, hidden)
        :return: 
        """
        # (batch, entity_dim, ) dot (entity_dim, entity_dim) -> (batch, entity_dim)
        hidden = T.dot(e1, w[:, :, _slice])
        # (batch, entity_dim) dot (batch, entity_dim, ) -> (batch, )
        hidden = T.sum(hidden * e2, axis=1)
        return hidden

    @staticmethod
    def step_batch(_slice, e1, e2, w):
        """
        :param _slice: scalar
        :param e1: (batch, entity_dim)
        :param e2: (batch, entity_dim)
        :param w : (batch, entity_dim, entity_dim, hidden)
        :return: 
        """
        # (batch, entity_dim, ) dot (batch, entity_dim, entity_dim) -> (batch, entity_dim)
        hidden = T.batched_dot(e1, w[:, :, :, _slice])
        # (batch, entity_dim) dot (batch, entity_dim, ) -> (batch, )
        hidden = T.sum(hidden * e2, axis=1)
        return hidden

    def score(self, e1, e2, r_index):
        """
        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param r_index: scalar
        :return: 
        """
        # (entity_dim, ) dot (entity_dim, entity_dim, hidden) dot (entity_dim, ) -> (hidden, )
        hidden1_sep, _ = theano.scan(fn=self.step,
                                     sequences=[self.slice_seq],
                                     non_sequences=[e1, e2, self.W[r_index]],
                                     name='single_scan')
        hidden1 = T.concatenate([hidden1_sep])
        if self.keep_normal:
            # (2 * entity_dim, ) dot (2 * entity_dim, hidden) -> (hidden, )
            hidden2 = T.dot(T.concatenate([e1, e2]), self.V[r_index])
            # (hidden, ) + (hidden, ) + (hidden, ) -> (hidden, )
            hidden = hidden1 + hidden2 + self.b[r_index]
        else:
            hidden = hidden1
        # (hidden, ) -> (hidden, )
        act_hidden = self.act.activate(hidden)
        # (hidden, ) dot (hidden, ) -> scalar
        return T.dot(act_hidden, self.U[r_index])

    def score_batch(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: (batch, )
        :return: 
        """
        # (batch, entity_dim) dot (batch, entity_dim, entity_dim, hidden) dot (batch, entity_dim) -> hidden * (batch, )
        hidden1_sep, _ = theano.scan(fn=self.step_batch,
                                     sequences=[self.slice_seq],
                                     non_sequences=[e1, e2, self.W[r_index]],
                                     name='batch_scan')
        # hidden * (batch, ) -> (batch, hidden)
        hidden1 = T.concatenate([hidden1_sep], axis=1).transpose()
        if self.keep_normal:
            # (batch, 2 * entity_dim) dot (batch, 2 * entity_dim, hidden) -> (batch, hidden, )
            hidden2 = T.batched_dot(T.concatenate([e1, e2], axis=1),
                                    self.V[r_index])
            # (batch, hidden) + (batch, hidden) + (batch, hidden) -> (batch, hidden)
            hidden = hidden1 + hidden2 + self.b[r_index]
        else:
            hidden = hidden1
        # (batch, hidden) -> (batch, hidden)
        act_hidden = self.act.activate(hidden)
        # (batch, hidden) dot (batch, hidden) -> (batch, )
        return T.sum(act_hidden * self.U[r_index], axis=1)

    def score_one_relation(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: scalar
        :return: 
        """
        # (batch, entity_dim) dot (entity_dim, entity_dim, hidden) dot (batch, entity_dim) -> hidden * (batch, )
        hidden1_sep, _ = theano.scan(fn=self.step_relation,
                                     sequences=self.slice_seq,
                                     non_sequences=[e1, e2, self.W[r_index]],
                                     name='relation_scan')
        # hidden * (batch, ) -> (batch, hidden)
        hidden1 = T.concatenate([hidden1_sep], axis=1).transpose()
        if self.keep_normal:
            # (batch, 2 * entity_dim) dot (2 * entity_dim, hidden) -> (batch, hidden)
            hidden2 = T.dot(T.concatenate([e1, e2], axis=1), self.V[r_index])
            # (batch, hidden) + (batch, hidden) + (hidden) -> (batch, hidden)
            hidden = hidden1 + hidden2 + self.b[r_index][None, :]
        else:
            hidden = hidden1
        # (batch, hidden) -> (batch, hidden)
        act_hidden = self.act.activate(hidden)
        # (batch, hidden) dot (batch, hidden) -> (batch, )
        return T.sum(act_hidden * self.U[r_index], axis=1)
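
# Illustration only (not part of the original snippet): a minimal NumPy sketch
# of the Neural Tensor Network score for a single (e1, r, e2) triple,
# mirroring `score` with keep_normal=True. A tanh activation is assumed and
# the function name is hypothetical.
import numpy as np

def np_ntn_score(e1, e2, W_r, V_r, b_r, u_r):
    # e1, e2: (entity_dim, ), W_r: (entity_dim, entity_dim, hidden)
    # V_r: (2 * entity_dim, hidden), b_r: (hidden, ), u_r: (hidden, )
    bilinear = np.array([e1.dot(W_r[:, :, k]).dot(e2)
                         for k in range(W_r.shape[2])])  # (hidden, )
    standard = np.concatenate([e1, e2]).dot(V_r)         # (hidden, )
    return u_r.dot(np.tanh(bilinear + standard + b_r))   # scalar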
Пример #16
0
class BRNN(object):
    '''
    Bidirectional RNN. A first trial at using a BRNN for sentence modeling,
    applied to the task of sentiment analysis.
    '''
    def __init__(self, configs, verbose=True):
        if verbose:
            pprint('Build Tied weights Bidirectional Recurrent Neural Network')
        self.input = T.matrix(name='input')
        self.truth = T.ivector(name='label')
        self.learn_rate = T.scalar(name='learn rate')
        # Configure Activation function
        self.act = Activation(configs.activation)
        # Build bidirectional RNN with tied weights
        num_input, num_hidden, num_class = configs.num_input, configs.num_hidden, configs.num_class
        # Stack all the variables together into a vector in order to apply the batch updating algorithm
        # Since there are two directions for the RNN, all the weight matrix associated with RNN will be
        # duplicated
        num_params = 2 * (num_input * num_hidden + \
            num_hidden * num_hidden + \
            num_hidden) + \
            2 * num_hidden * num_class + \
            num_class
        self.num_params = num_params
        self.theta = theano.shared(value=np.zeros(num_params, dtype=floatX),
                                   name='theta',
                                   borrow=True)
        # Incremental index
        param_idx = 0
        # 1, Feed-forward matrix for forward direction: W_forward
        self.W_forward = self.theta[param_idx:param_idx +
                                    num_input * num_hidden].reshape(
                                        (num_input, num_hidden))
        self.W_forward.name = 'W_forward_RNN'
        W_forward_init = np.asarray(np.random.uniform(
            low=-np.sqrt(6.0 / (num_input + num_hidden)),
            high=np.sqrt(6.0 / (num_input + num_hidden)),
            size=(num_input, num_hidden)),
                                    dtype=floatX)
        param_idx += num_input * num_hidden
        # 1, Feed-forward matrix for backward direction: W_backward
        self.W_backward = self.theta[param_idx:param_idx +
                                     num_input * num_hidden].reshape(
                                         (num_input, num_hidden))
        self.W_backward.name = 'W_backward_RNN'
        W_backward_init = np.asarray(np.random.uniform(
            low=-np.sqrt(6.0 / (num_input + num_hidden)),
            high=np.sqrt(6.0 / (num_input + num_hidden)),
            size=(num_input, num_hidden)),
                                     dtype=floatX)
        param_idx += num_input * num_hidden
        # 2, Recurrent matrix for forward direction: U_forward
        self.U_forward = self.theta[param_idx:param_idx +
                                    num_hidden * num_hidden].reshape(
                                        (num_hidden, num_hidden))
        self.U_forward.name = 'U_forward_RNN'
        U_forward_init = np.asarray(np.random.uniform(
            low=-np.sqrt(6.0 / (num_hidden + num_hidden)),
            high=np.sqrt(6.0 / (num_hidden + num_hidden)),
            size=(num_hidden, num_hidden)),
                                    dtype=floatX)
        param_idx += num_hidden * num_hidden
        # 2, Recurrent matrix for backward direction: U_backward
        self.U_backward = self.theta[param_idx:param_idx +
                                     num_hidden * num_hidden].reshape(
                                         (num_hidden, num_hidden))
        self.U_backward.name = 'U_backward_RNN'
        U_backward_init = np.asarray(np.random.uniform(
            low=-np.sqrt(6.0 / (num_hidden + num_hidden)),
            high=np.sqrt(6.0 / (num_hidden + num_hidden)),
            size=(num_hidden, num_hidden)),
                                     dtype=floatX)
        param_idx += num_hidden * num_hidden
        # 3, Bias parameter for the hidden-layer forward direction RNN
        self.b_forward = self.theta[param_idx:param_idx + num_hidden]
        self.b_forward.name = 'b_forward_RNN'
        b_forward_init = np.zeros(num_hidden, dtype=floatX)
        param_idx += num_hidden
        # 3, Bias parameter for the hidden-layer backward direction RNN
        self.b_backward = self.theta[param_idx:param_idx + num_hidden]
        self.b_backward.name = 'b_backward_RNN'
        b_backward_init = np.zeros(num_hidden, dtype=floatX)
        param_idx += num_hidden
        # Weight matrix for softmax function
        self.W_softmax = self.theta[param_idx:param_idx +
                                    2 * num_hidden * num_class].reshape(
                                        (2 * num_hidden, num_class))
        self.W_softmax.name = 'W_softmax'
        W_softmax_init = np.asarray(np.random.uniform(
            low=-np.sqrt(6.0 / (2 * num_hidden + num_class)),
            high=np.sqrt(6.0 / (2 * num_hidden + num_class)),
            size=(2 * num_hidden, num_class)),
                                    dtype=floatX)
        param_idx += 2 * num_hidden * num_class
        # Bias vector for softmax function
        self.b_softmax = self.theta[param_idx:param_idx + num_class]
        self.b_softmax.name = 'b_softmax'
        b_softmax_init = np.zeros(num_class, dtype=floatX)
        param_idx += num_class
        # Set all the default parameters into theta
        self.theta.set_value(
            np.concatenate([
                x.ravel()
                for x in (W_forward_init, W_backward_init, U_forward_init,
                          U_backward_init, b_forward_init, b_backward_init,
                          W_softmax_init, b_softmax_init)
            ]))
        assert param_idx == num_params
        # h[0], zero vector, treated as constants
        self.h_start = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                     name='h_start',
                                     borrow=True)
        self.h_end = theano.shared(value=np.zeros(num_hidden, dtype=floatX),
                                   name='h_end',
                                   borrow=True)

        # recurrent function used to compress a sequence of input vectors
        # the first dimension should correspond to time
        def forward_step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W_forward) + \
                  T.dot(h_tm1, self.U_forward) + self.b_forward)
            return h_t

        def backward_step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W_backward) + \
                  T.dot(h_tm1, self.U_backward) + self.b_backward)
            return h_t

        # Forward and backward representation over time
        self.forward_h, _ = theano.scan(fn=forward_step,
                                        sequences=self.input,
                                        outputs_info=[self.h_start],
                                        truncate_gradient=configs.bptt)
        self.backward_h, _ = theano.scan(fn=backward_step,
                                         sequences=self.input,
                                         outputs_info=[self.h_end],
                                         truncate_gradient=configs.bptt,
                                         go_backwards=True)
        # Store the final value
        # self.h_start_star = self.forward_h[-1]
        # self.h_end_star = self.backward_h[-1]
        self.h_start_star = T.mean(self.forward_h, axis=0)
        self.h_end_star = T.mean(self.backward_h, axis=0)
        # L1, L2 regularization
        self.L1_norm = T.sum(T.abs_(self.W_forward) + T.abs_(self.W_backward) + \
              T.abs_(self.U_forward) + T.abs_(self.U_backward) + \
              T.abs_(self.W_softmax))
        self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
              T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2) + \
              T.sum(self.W_softmax ** 2)
        # Build function to show the learned representation for different sentences
        self.show_forward = theano.function(inputs=[self.input],
                                            outputs=self.h_start_star)
        self.show_backward = theano.function(inputs=[self.input],
                                             outputs=self.h_end_star)
        ##################################################################################
        # Correlated BRNN
        ##################################################################################
        # Concatenate these two vectors into one
        self.h = T.concatenate([self.h_start_star, self.h_end_star], axis=0)
        # Dropout parameter
        srng = T.shared_randomstreams.RandomStreams(configs.random_seed)
        mask = srng.binomial(n=1, p=1 - configs.dropout, size=self.h.shape)
        self.h *= T.cast(mask, floatX)
        # Use concatenated vector as input to the Softmax/MLP classifier
        self.output = T.nnet.softmax(
            T.dot(self.h, self.W_softmax) + self.b_softmax)
        self.pred = T.argmax(self.output, axis=1)
        # Build cost function
        self.cost = -T.mean(
            T.log(self.output)[T.arange(self.truth.shape[0]), self.truth])
        if configs.regularization:
            self.cost += configs.lambda1 * self.L2_norm
        # Compute gradient
        self.gradtheta = T.grad(self.cost, self.theta)
        self.gradinput = T.grad(self.cost, self.input)
        # Build objective function
        # Compute the gradients to parameters
        self.compute_cost_and_gradient = theano.function(
            inputs=[self.input, self.truth],
            outputs=[self.cost, self.gradtheta])
        # Compute the gradients to inputs
        self.compute_input_gradient = theano.function(
            inputs=[self.input, self.truth], outputs=self.gradinput)
        # Build prediction function
        self.predict = theano.function(inputs=[self.input], outputs=self.pred)
        if verbose:
            pprint('*' * 50)
            pprint(
                'Finished constructing Bidirectional Recurrent Neural Network (BRNN)'
            )
            pprint('Size of input dimension: %d' % configs.num_input)
            pprint('Size of hidden/recurrent dimension: %d' %
                   configs.num_hidden)
            pprint('Size of output dimension: %d' % configs.num_class)
            pprint('Is regularization applied? %s' %
                   ('yes' if configs.regularization else 'no'))
            if configs.regularization:
                pprint('Coefficient of regularization term: %f' %
                       configs.lambda1)
            pprint('BPTT step: %d' % configs.bptt)
            pprint('Number of free parameters in BRNN: %d' % self.num_params)
            pprint('*' * 50)

    # This method is used to implement the batch updating algorithm
    def update_params(self, gradtheta, learn_rate):
        # gradparams is a single long vector which can be used to update self.theta
        # Learning algorithm: simple stochastic gradient descent
        theta = self.theta.get_value(borrow=True)
        self.theta.set_value(theta - learn_rate * gradtheta, borrow=True)

    @staticmethod
    def save(fname, model):
        with open(fname, 'wb') as fout:
            cPickle.dump(model, fout)

    @staticmethod
    def load(fname):
        with open(fname, 'rb') as fin:
            return cPickle.load(fin)
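
# Illustration only (not part of the original snippet): a minimal NumPy sketch
# of the flat-parameter scheme used by BRNN above, where every weight lives in
# one long vector `theta` and is read back through reshaped slices. The helper
# names are hypothetical.
import numpy as np

def pack_params(*arrays):
    # Concatenate raveled parameter arrays into a single flat vector,
    # matching theta.set_value(np.concatenate([x.ravel() for x in ...]))
    return np.concatenate([a.ravel() for a in arrays])

def unpack_param(theta, offset, shape):
    # Slice `theta` starting at `offset`, reshape it to `shape`, and return
    # the parameter together with the next offset (the param_idx bookkeeping)
    size = int(np.prod(shape))
    return theta[offset:offset + size].reshape(shape), offset + size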
Пример #17
0
class SingleLayerModel(EntityScorer):
    def __init__(self,
                 entity_dim,
                 relation_num,
                 hidden=50,
                 activation='tanh',
                 initializer=default_initializer,
                 prefix='',
                 verbose=True):
        super(SingleLayerModel, self).__init__()
        self.hidden = hidden
        self.entity_dim = entity_dim
        self.relation_num = relation_num
        # (relation_num, k, entity_dim)
        self.W_1 = shared_rand_matrix(
            (relation_num, self.hidden, self.entity_dim),
            prefix + 'SingleLayer_W1', initializer)
        # (relation_num, k, entity_dim)
        self.W_2 = shared_rand_matrix(
            (relation_num, self.hidden, self.entity_dim),
            prefix + 'SingleLayer_W2', initializer)
        # (relation_num, k, )
        self.u = shared_ones_matrix((
            relation_num,
            self.hidden,
        ), prefix + 'SingleLayer_u')
        self.act = Activation(activation)
        self.params = [self.W_1, self.W_2, self.u]
        self.norm_params = [self.W_1, self.W_2, self.u]
        self.l1_norm = T.sum(T.abs_(self.W_1)) + T.sum(T.abs_(
            self.W_2)) + T.sum(T.abs_(self.u))
        self.l2_norm = T.sum(self.W_1**2) + T.sum(self.W_2**2) + T.sum(self.u**
                                                                       2)

        if verbose:
            logger.debug(
                'Architecture of Single Layer Model built finished, summarized as below:'
            )
            logger.debug('Entity Dimension: %d' % self.entity_dim)
            logger.debug('Hidden Dimension: %d' % self.hidden)
            logger.debug('Relation Number:  %d' % self.relation_num)
            logger.debug('Initializer:      %s' % initializer)
            logger.debug('Activation:       %s' % activation)

    def score(self, e1, e2, r_index):
        """
        :param e1: (entity_dim, )
        :param e2: (entity_dim, )
        :param r_index: scalar
        :return: 
        """
        # (hidden, entity_dim) dot (entity_dim) + (hidden, entity_dim) dot (entity_dim) -> (hidden, )
        hidden = T.dot(self.W_1[r_index], e1) + T.dot(self.W_2[r_index], e2)
        # (hidden, ) -> (hidden, )
        act_hidden = self.act.activate(hidden)
        # (hidden, ) dot (hidden, ) -> 1
        return T.dot(self.u[r_index], act_hidden)

    def score_batch(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: (batch, )
        :return: 
        """
        # (batch, hidden, entity_dim) dot (batch, entity_dim) + (batch, hidden, entity_dim) dot (batch, entity_dim)
        hidden = T.batched_dot(self.W_1[r_index], e1)
        hidden += T.batched_dot(self.W_2[r_index], e2)
        # (batch, hidden) -> (batch, hidden)
        act_hidden = self.act.activate(hidden)
        # (batch, hidden) dot (batch, hidden) -> (batch, )
        return T.sum(act_hidden * self.u[r_index], axis=1)

    def score_one_relation(self, e1, e2, r_index):
        """
        :param e1: (batch, entity_dim, )
        :param e2: (batch, entity_dim, )
        :param r_index: scalar
        :return: 
        """
        # (batch, entity_dim) dot (entity_dim, hidden) + (batch, entity_dim) dot (entity_dim, hidden) -> (batch, hidden)
        hidden = T.dot(e1, self.W_1[r_index].transpose()) + T.dot(
            e2, self.W_2[r_index].transpose())
        # (batch, hidden) -> (batch, hidden)
        act_hidden = self.act.activate(hidden)
        # (batch, hidden) dot (hidden, ) -> (batch, )
        return T.dot(act_hidden, self.u[r_index])
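
# Illustration only (not part of the original snippet): a minimal NumPy sketch
# of the single-layer score above for one relation r,
# u_r . tanh(W1_r e1 + W2_r e2). A tanh activation is assumed and the function
# name is hypothetical.
import numpy as np

def np_single_layer_score(e1, e2, W1_r, W2_r, u_r):
    # e1, e2: (entity_dim, ), W1_r/W2_r: (hidden, entity_dim), u_r: (hidden, )
    return u_r.dot(np.tanh(W1_r.dot(e1) + W2_r.dot(e2)))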
Пример #18
0
class RNN(object):
    '''
	Basic component for Recurrent Neural Network
	'''
    def __init__(self, configs=None, verbose=True):
        '''
		Basic RNN is an unsupervised component, where the input is a sequence and the 
		output is a vector with fixed length
		'''
        if verbose: pprint('Build Recurrent Neural Network...')
        self.input = T.matrix(name='input', dtype=floatX)
        self.learn_rate = T.scalar(name='learn rate')
        # Configure activation function
        self.act = Activation(configs.activation)
        fan_in = configs.num_input
        fan_out = configs.num_hidden
        # Initialize all the variables in RNN, including:
        # 1, Feed-forward matrix, feed-forward bias, W, W_b
        # 2, Recurrent matrix, recurrent bias, U, U_b
        self.W = theano.shared(value=np.asarray(np.random.uniform(
            low=-np.sqrt(6.0 / (fan_in + fan_out)),
            high=np.sqrt(6.0 / (fan_in + fan_out)),
            size=(fan_in, fan_out)),
                                                dtype=floatX),
                               name='W',
                               borrow=True)
        self.U = theano.shared(value=np.asarray(np.random.uniform(
            low=-np.sqrt(6.0 / (fan_out + fan_out)),
            high=np.sqrt(6.0 / (fan_out + fan_out)),
            size=(fan_out, fan_out)),
                                                dtype=floatX),
                               name='U',
                               borrow=True)
        # Bias parameter for the hidden-layer encoder of RNN
        self.b = theano.shared(value=np.zeros(fan_out, dtype=floatX),
                               name='b',
                               borrow=True)
        # h[0], zero vector
        self.h0 = theano.shared(value=np.zeros(fan_out, dtype=floatX),
                                name='h0',
                                borrow=True)
        # Save all the parameters
        self.params = [self.W, self.U, self.b, self.h0]

        # recurrent function used to compress a sequence of input vectors
        # the first dimension should correspond to time
        def step(x_t, h_tm1):
            h_t = self.act.activate(T.dot(x_t, self.W) + \
                  T.dot(h_tm1, self.U) + self.b)
            return h_t

        # h is the hidden representation over a time sequence
        self.hs, _ = theano.scan(fn=step,
                                 sequences=self.input,
                                 outputs_info=[self.h0],
                                 truncate_gradient=configs.bptt)
        self.h = self.hs[-1]
        # L1, L2 regularization
        self.L1_norm = T.sum(T.abs_(self.W) + T.abs_(self.U))
        self.L2_norm = T.sum(self.W**2) + T.sum(self.U**2)
        # Compress function
        self.compress = theano.function(inputs=[self.input], outputs=self.h)

    @staticmethod
    def save(fname, model):
        '''
		Save current RNN model into fname
		@fname: String. Filename to save the model.
		@model: RNN. An instance of RNN class.
		'''
        with open(fname, 'wb') as fout:
            cPickle.dump(model, fout)

    @staticmethod
    def load(fname):
        '''
		Load an RNN model from fname
		@fname: String. Filename to load the model.
		'''
        with open(fname, 'rb') as fin:
            return cPickle.load(fin)
Пример #19
0
class NNWordBasedAttention(WordBasedAttention):
    """
    Neural Machine Translation By Jointly Learning To Align and Translate
    Dzmitry Bahdanau, KyungHyun Cho, and Yoshua Bengio
    In Proceedings of ICLR 2015
    http://arxiv.org/abs/1409.0473v3
    """
    def __init__(self,
                 word_dim,
                 seq_dim,
                 hidden_dim,
                 activation='tanh',
                 initializer=default_initializer):
        super(NNWordBasedAttention,
              self).__init__(word_dim=word_dim,
                             seq_dim=seq_dim,
                             initializer=initializer)
        # (dim, dim)
        self.hidden_dim = hidden_dim
        self.W = shared_rand_matrix((self.word_dim, self.hidden_dim),
                                    'Attention_W', initializer)
        self.U = shared_rand_matrix((self.seq_dim, self.hidden_dim),
                                    'Attention_U', initializer)
        self.v = shared_rand_matrix((self.hidden_dim, ), 'Attention_v',
                                    initializer)
        self.act = Activation(activation)
        self.params = [self.W, self.U, self.v]
        self.norm_params = [self.W, self.U]

    def score(self, word, sequence):
        """
        :param word: (word_dim, )
        :param sequence: (length, seq_dim)
        :return: score: (length, )
        """
        # (word_dim, ) dot (word_dim, hidden_dim) -> (hidden_dim, )
        hidden1 = T.dot(word, self.W)
        # (length, seq_dim) dot (seq_dim, hidden_dim) -> (length, hidden_dim)
        hidden2 = T.dot(sequence, self.U)
        # (hidden_dim, ) + (length, hidden_dim) -> (length, hidden_dim)
        hidden = hidden1[None, :] + hidden2
        # (length, hidden_dim) -> (length, hidden_dim)
        act_hidden = self.act.activate(hidden)
        # (length, hidden_dim) dot (hidden_dim, ) -> (length, )
        score = T.dot(act_hidden, self.v)
        return score

    def score_batch(self, word, sequence):
        """
        :param word: (batch, word_dim)
        :param sequence: (batch, length, seq_dim)
        :return: score: (batch, length, )
        """
        # (batch, word_dim) dot (word_dim, hidden_dim) -> (batch, hidden_dim)
        hidden1 = T.dot(word, self.W)
        # (batch, length, seq_dim) dot (seq_dim, hidden_dim) -> (batch, length, hidden_dim)
        hidden2 = T.dot(sequence, self.U)
        # (batch, length, hidden_dim) + (batch, hidden_dim) -> (batch, length, hidden_dim)
        hidden = hidden1[:, None, :] + hidden2
        # (batch, length, hidden_dim) -> (batch, length, hidden_dim)
        act_hidden = self.act.activate(hidden)
        # (batch, length, hidden_dim) dot (hidden_dim, ) -> (batch, length, )
        score = T.dot(act_hidden, self.v)
        return score
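
# Illustration only (not part of the original snippet): a minimal NumPy sketch
# of the additive (Bahdanau-style) attention score above, plus the softmax
# over positions a caller would typically apply. A tanh activation is assumed
# and the function name is hypothetical.
import numpy as np

def np_additive_attention(word, sequence, W, U, v):
    # word: (word_dim, ), sequence: (length, seq_dim)
    # W: (word_dim, hidden), U: (seq_dim, hidden), v: (hidden, )
    score = np.tanh(word.dot(W)[None, :] + sequence.dot(U)).dot(v)  # (length, )
    weights = np.exp(score - score.max())
    return weights / weights.sum()  # attention distribution over positions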
Пример #20
0
class BRNNEncoder(object):
    '''
	Bidirectional RNN for sequence encoding. 
	'''
    def __init__(self, config, verbose=True):
        if verbose: logger.debug('Building Bidirectional RNN Encoder...')
        self.input = T.matrix(name='BRNNEncoder_input')
        # Configure Activation function
        self.act = Activation(config.activation)
        # Build Bidirectional RNN
        num_input, num_hidden = config.num_input, config.num_hidden
        self.num_params = 2 * (num_input * num_hidden +
                               num_hidden * num_hidden + num_hidden)
        # Initialize model parameters
        np.random.seed(config.random_seed)
        # 1, Feed-forward matrix for forward direction: W_forward
        W_forward_val = np.random.uniform(low=-1.0,
                                          high=1.0,
                                          size=(num_input, num_hidden))
        W_forward_val = W_forward_val.astype(floatX)
        self.W_forward = theano.shared(value=W_forward_val,
                                       name='W_forward',
                                       borrow=True)
        # 1, Feed-forward matrix for backward direction: W_backward
        W_backward_val = np.random.uniform(low=-1.0,
                                           high=1.0,
                                           size=(num_input, num_hidden))
        W_backward_val = W_backward_val.astype(floatX)
        self.W_backward = theano.shared(value=W_backward_val,
                                        name='W_backward',
                                        borrow=True)
        # 2, Recurrent matrix for forward direction: U_forward
        U_forward_val = np.random.uniform(low=-1.0,
                                          high=1.0,
                                          size=(num_hidden, num_hidden))
        U_forward_val = U_forward_val.astype(floatX)
        U_forward_val, _, _ = np.linalg.svd(U_forward_val)
        self.U_forward = theano.shared(value=U_forward_val,
                                       name='U_forward',
                                       borrow=True)
        # 2, Recurrent matrix for backward direction: U_backward
        U_backward_val = np.random.uniform(low=-1.0,
                                           high=1.0,
                                           size=(num_hidden, num_hidden))
        U_backward_val = U_backward_val.astype(floatX)
        U_backward_val, _, _ = np.linalg.svd(U_backward_val)
        self.U_backward = theano.shared(value=U_backward_val,
                                        name='U_backward',
                                        borrow=True)
        # 3, Bias parameter for the hidden-layer forward direction RNN
        b_forward_val = np.zeros(num_hidden, dtype=floatX)
        self.b_forward = theano.shared(value=b_forward_val,
                                       name='b_forward',
                                       borrow=True)
        # 3, Bias parameter for the hidden-layer backward direction RNN
        b_backward_val = np.zeros(num_hidden, dtype=floatX)
        self.b_backward = theano.shared(value=b_backward_val,
                                        name='b_backward',
                                        borrow=True)
        # h[0], zero vectors, treated as constants
        self.h0_forward = theano.shared(value=np.zeros(num_hidden,
                                                       dtype=floatX),
                                        name='h0_forward',
                                        borrow=True)
        self.h0_backward = theano.shared(value=np.zeros(num_hidden,
                                                        dtype=floatX),
                                         name='h0_backward',
                                         borrow=True)
        # Stack all the parameters
        self.params = [
            self.W_forward, self.W_backward, self.U_forward, self.U_backward,
            self.b_forward, self.b_backward
        ]
        # Compute the forward and backward representation over time
        self.h_forwards, _ = theano.scan(fn=self._forward_step,
                                         sequences=self.input,
                                         outputs_info=[self.h0_forward],
                                         truncate_gradient=config.bptt)
        self.h_backwards, _ = theano.scan(fn=self._backward_step,
                                          sequences=self.input,
                                          outputs_info=[self.h0_backward],
                                          truncate_gradient=config.bptt,
                                          go_backwards=True)
        # Average compressing
        self.h_forward = T.mean(self.h_forwards, axis=0)
        self.h_backward = T.mean(self.h_backwards, axis=0)
        # Concatenate
        self.output = T.concatenate([self.h_forward, self.h_backward], axis=0)
        # L1, L2 regularization
        self.L1_norm = T.sum(
            T.abs_(self.W_forward) + T.abs_(self.W_backward) +
            T.abs_(self.U_forward) + T.abs_(self.U_backward))
        self.L2_norm = T.sum(self.W_forward ** 2) + T.sum(self.W_backward ** 2) + \
              T.sum(self.U_forward ** 2) + T.sum(self.U_backward ** 2)
        if verbose:
            logger.debug(
                'Finished constructing the structure of BRNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % num_input)
            logger.debug('Size of the hidden dimension: %d' % num_hidden)
            logger.debug('Activation function: %s' % config.activation)

    def _forward_step(self, x_t, h_tm1):
        h_t = self.act.activate(T.dot(x_t, self.W_forward) + \
              T.dot(h_tm1, self.U_forward) + \
              self.b_forward)
        return h_t

    def _backward_step(self, x_t, h_tm1):
        h_t = self.act.activate(T.dot(x_t, self.W_backward) + \
              T.dot(h_tm1, self.U_backward) + \
              self.b_backward)
        return h_t

    def encode(self, inputM):
        '''
		@inputM: Theano symbol matrix. Compress the input matrix into output vector.
		'''
        h_forwards, _ = theano.scan(fn=self._forward_step,
                                    sequences=inputM,
                                    outputs_info=[self.h0_forward])
        h_backwards, _ = theano.scan(fn=self._backward_step,
                                     sequences=inputM,
                                     outputs_info=[self.h0_backward],
                                     go_backwards=True)
        # Averaging
        h_forward = T.mean(h_forwards, axis=0)
        h_backward = T.mean(h_backwards, axis=0)
        # Concatenate
        h = T.concatenate([h_forward, h_backward], axis=0)
        return h
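
# Illustration only (not part of the original snippet): a minimal NumPy sketch
# of the mean-pooled bidirectional encoding performed by `encode` above. A
# tanh activation is assumed and the function name is hypothetical.
import numpy as np

def np_brnn_encode(x, Wf, Uf, bf, Wb, Ub, bb):
    # x: (time, num_input); Wf/Wb: (num_input, num_hidden); Uf/Ub: (num_hidden, num_hidden)
    hf, hb = np.zeros(Wf.shape[1]), np.zeros(Wb.shape[1])
    forward, backward = [], []
    for t in range(x.shape[0]):
        hf = np.tanh(x[t].dot(Wf) + hf.dot(Uf) + bf)
        forward.append(hf)
    for t in reversed(range(x.shape[0])):
        hb = np.tanh(x[t].dot(Wb) + hb.dot(Ub) + bb)
        backward.append(hb)
    # average over time, then concatenate the forward and backward summaries
    return np.concatenate([np.mean(forward, axis=0), np.mean(backward, axis=0)])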
Пример #21
0
class GrCNNEncoder(object):
    '''
    (Binary) Gated Recursive Convolutional Neural Network Encoder.
    '''
    def __init__(self, config=None, verbose=True):
        '''
        @config: GRCNNConfiger. Configer used to set the architecture of GRCNNEncoder.
        ''' 
        if verbose: logger.debug('Building Gated Recursive Convolutional Neural Network Encoder...')
        # Scale factor for initializing parameters
        self.scale = config.scale
        # Make theano symbolic tensor for input and model parameters
        self.input = T.matrix(name='GrCNN Encoder input', dtype=floatX)
        # Configure activation function
        self.act = Activation(config.activation)
        fan_in, fan_out = config.num_input, config.num_hidden
        # Initialize model parameters
        # Set seed of the random generator
        np.random.seed(config.random_seed)
        # Projection matrix U
        # Wl and Wr below are initialized as orthogonal matrices (via SVD)
        U_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_in, fan_out))
        U_val = U_val.astype(floatX)
        U_val *= self.scale
        self.U = theano.shared(value=U_val, name='U', borrow=True)
        self.hidden0 = T.dot(self.input, self.U)

        # W^l, W^r, parameters used to construct the central hidden representation
        Wl_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out))
        Wl_val = Wl_val.astype(floatX)
        Wl_val, _, _ = np.linalg.svd(Wl_val)
        # Wl_val *= self.scale
        self.Wl = theano.shared(value=Wl_val, name='W_l', borrow=True)

        Wr_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, fan_out))
        Wr_val = Wr_val.astype(floatX)
        Wr_val, _, _ = np.linalg.svd(Wr_val)
        # Wr_val *= self.scale
        self.Wr = theano.shared(value=Wr_val, name='W_r', borrow=True)
        
        self.Wb = theano.shared(value=np.zeros(fan_out, dtype=floatX), name='Wb', borrow=True)
        
        # G^l, G^r, parameters used to construct the three-way coefficients
        Gl_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, 3))
        Gl_val = Gl_val.astype(floatX)
        self.Gl = theano.shared(value=Gl_val, name='G_l', borrow=True)

        Gr_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_out, 3))
        Gr_val = Gr_val.astype(floatX)
        self.Gr = theano.shared(value=Gr_val, name='G_r', borrow=True)

        self.Gb = theano.shared(value=np.zeros(3, dtype=floatX), name='Gb', borrow=True)
        # Save all the parameters into one batch
        self.params = [self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb]
        # Compute the total number of parameters
        self.num_params = reduce(lambda x, y: x+np.prod(y.get_value().shape), self.params, 0)
        # Length of the time sequence
        self.nsteps = self.input.shape[0]
        self.pyramids, _ = theano.scan(fn=self._step_prop, 
                                    sequences=T.arange(self.nsteps-1),
                                    non_sequences=self.nsteps,
                                    outputs_info=[self.hidden0],
                                    n_steps=self.nsteps-1)
        self.output = self.pyramids[-1][0].dimshuffle('x', 0)
        # Compression -- Encoding function
        self.compress = theano.function(inputs=[self.input], outputs=self.output)
        if verbose:
            logger.debug('Finished constructing the structure of grCNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % fan_in)
            logger.debug('Size of the hidden dimension: %d' % fan_out)
            logger.debug('Activation function: %s' % config.activation)

    def _step_prop(self, iter, current_level, nsteps):
        '''
        @current_level: Input matrix at current level. The first dimension corresponds to 
        the timestamp while the second dimension corresponds to the dimension of hidden representation
        '''
        # Build shifted matrix, due to the constraints of Theano.scan, we have to keep the shape of the
        # input and output matrix
        left_current_level = current_level[:nsteps-iter-1]
        right_current_level = current_level[1:nsteps-iter]
        # Compute temporary central hidden representation, of size Txd, but we only care about the first
        # T-1 rows, i.e., we only focus on the (T-1)xd sub-matrix.
        central_current_level = self.act.activate(T.dot(left_current_level, self.Wl) + 
                                                  T.dot(right_current_level, self.Wr) + 
                                                  self.Wb)
        # Compute gating function, of size Tx3. Again, due to the internal limitation of Theano.scan, we cannot
        # reduce the size of the matrix and have to keep the same size, but actually we only want the first (T-1)x3
        # sub-matrix.
        current_gates = T.nnet.softmax(T.dot(left_current_level, self.Gl) + 
                                       T.dot(right_current_level, self.Gr) + 
                                       self.Gb)
        left_gate, central_gate, right_gate = current_gates[:, 0], current_gates[:, 1], current_gates[:, 2]
        # Reshape for broadcasting
        left_gate = left_gate.dimshuffle(0, 'x')
        central_gate = central_gate.dimshuffle(0, 'x')
        right_gate = right_gate.dimshuffle(0, 'x')
        # Build next level of hidden representation using soft combination,
        # matrix of size (T-1)xd
        next_level = left_gate * left_current_level + \
                     right_gate * right_current_level + \
                     central_gate * central_current_level
        return T.set_subtensor(current_level[:nsteps-iter-1], next_level)

    def _step_prop_reduce(self, current_level):
        '''
        @current_level: Input matrix at the current level. The first dimension corresponds to
        the time dimension while the second dimension corresponds to the dimension of the hidden representation.

        Reduced version of the level propagation. It is much more memory- and time-efficient, but it cannot
        be used inside theano.scan, because theano.scan requires the state passed between iterations to keep
        the same shape, whereas this version shrinks the matrix by one row per level.
        '''
        # Build the left- and right-shifted matrices; here the output is allowed to shrink,
        # so no padding of the state is needed.
        right_current_level = current_level[1:]
        left_current_level = current_level[:-1]
        # Compute the temporary central hidden representations for the T-1 adjacent pairs,
        # a (T-1) x d matrix.
        central_current_level = self.act.activate(T.dot(left_current_level, self.Wl) + 
                                                  T.dot(right_current_level, self.Wr) + 
                                                  self.Wb)
        # Compute the gating coefficients for the same adjacent pairs,
        # a (T-1) x 3 matrix (left, central and right gates).
        current_gates = T.nnet.softmax(T.dot(left_current_level, self.Gl) + 
                                       T.dot(right_current_level, self.Gr) + 
                                       self.Gb)
        left_gate, central_gate, right_gate = current_gates[:, 0], current_gates[:, 1], current_gates[:, 2]
        # Reshape for broadcasting
        left_gate = left_gate.dimshuffle(0, 'x')
        central_gate = central_gate.dimshuffle(0, 'x')
        right_gate = right_gate.dimshuffle(0, 'x')
        # Build next level of hidden representation using soft combination,
        # matrix of size (T-1)xd
        next_level = left_gate * left_current_level + \
                     right_gate * right_current_level + \
                     central_gate * central_current_level
        return next_level

    def encode(self, inputM):
        '''
        @inputM: Theano symbolic matrix. Compress the input matrix into an output vector
        (see the NumPy sketch after this class for an illustration of the pyramid reduction).
        '''
        hidden = T.dot(inputM, self.U)
        # Length of the time sequence
        nsteps = inputM.shape[0]
        pyramids, _ = theano.scan(fn=self._step_prop, 
                                    sequences=T.arange(nsteps-1),
                                    non_sequences=nsteps,
                                    outputs_info=[hidden],
                                    n_steps=nsteps-1)
        output = pyramids[-1][0].dimshuffle('x', 0)
        return output

    def L2_loss(self):
        '''
        Return the squared L2 norm of the weight matrices (the biases Wb and Gb are excluded).
        '''
        return T.sum(self.U ** 2) + T.sum(self.Wl ** 2) + T.sum(self.Wr ** 2) + \
               T.sum(self.Gl ** 2) + T.sum(self.Gr ** 2)
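# A minimal NumPy sketch (not part of the original class) illustrating one level of the
# pyramid reduction performed by _step_prop_reduce above: a level with T rows shrinks to
# T-1 rows, each new row being a softmax-gated mix of its left child, its right child,
# and an activated "central" combination. The sizes (T=5, d=4), the tanh activation and
# the random parameters below are illustrative assumptions, not values from the example.
import numpy as np

def pyramid_step(level, Wl, Wr, Wb, Gl, Gr, Gb):
    left, right = level[:-1], level[1:]                            # (T-1, d) each
    central = np.tanh(np.dot(left, Wl) + np.dot(right, Wr) + Wb)   # (T-1, d)
    scores = np.dot(left, Gl) + np.dot(right, Gr) + Gb             # (T-1, 3)
    gates = np.exp(scores - scores.max(axis=1, keepdims=True))
    gates /= gates.sum(axis=1, keepdims=True)                      # row-wise softmax
    g_l, g_c, g_r = gates[:, 0:1], gates[:, 1:2], gates[:, 2:3]
    return g_l * left + g_r * right + g_c * central                # (T-1, d)

T_len, d = 5, 4
rng = np.random.RandomState(0)
level = rng.randn(T_len, d)
Wl, Wr = rng.randn(d, d), rng.randn(d, d)
Gl, Gr = rng.randn(d, 3), rng.randn(d, 3)
Wb, Gb = np.zeros(d), np.zeros(3)
while level.shape[0] > 1:        # T-1 reductions collapse the sequence to a single vector
    level = pyramid_step(level, Wl, Wr, Wb, Gl, Gr, Gb)
print(level.shape)               # -> (1, 4), mirroring the shape produced by compress()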
Пример #22
0
class ExtGrCNNEncoder(object):
    '''
    An extension of the canonical GrCNN, with more than 1 gate at each local binary window.
    '''
    def __init__(self, config, verbose=True):
        '''
        @config: GrCNNConfiger. Configer used to set the architecture of ExtGrCNNEncoder.
        '''
        if verbose:
            logger.debug(
                'Building Extended Gated Recursive Convolutional Neural Network Encoder...'
            )
        # Scale factor for initializing model parameters
        self.scale = config.scale
        # Make theano symbolic tensor for input and model parameters
        self.input = T.matrix(name='ExtGrCNNEncoder input', dtype=floatX)
        # Configure activation function
        self.act = Activation(config.activation)
        fan_in, fan_out = config.num_input, config.num_hidden
        # Initialize model parameter
        np.random.seed(config.random_seed)
        # Projection matrix U
        U_val = np.random.uniform(low=-1.0, high=1.0, size=(fan_in, fan_out))
        U_val = U_val.astype(floatX)
        U_val *= self.scale
        self.U = theano.shared(value=U_val, name='U', borrow=True)
        self.hidden0 = T.dot(self.input, self.U)
        # 3rd-order tensors implementing the multi-gate GrCNN encoder, where the first dimension
        # indexes the gates
        Wl_vals = [
            np.random.uniform(low=-1.0, high=1.0,
                              size=(fan_out, fan_out)).astype(floatX)
            for _ in xrange(config.num_gates)
        ]
        Wl_vals = [np.linalg.svd(Wl_val)[0] for Wl_val in Wl_vals]
        Wl_vals = np.asarray(Wl_vals)
        self.Wl = theano.shared(value=Wl_vals, name='W_l', borrow=True)

        Wr_vals = [
            np.random.uniform(low=-1.0, high=1.0,
                              size=(fan_out, fan_out)).astype(floatX)
            for _ in xrange(config.num_gates)
        ]
        Wr_vals = [np.linalg.svd(Wr_val)[0] for Wr_val in Wr_vals]
        Wr_vals = np.asarray(Wr_vals)
        self.Wr = theano.shared(value=Wr_vals, name='W_r', borrow=True)

        self.Wb = theano.shared(value=np.zeros((config.num_gates, fan_out),
                                               dtype=floatX),
                                name='W_b',
                                borrow=True)
        # Parameters of the multi-gate selection (softmax gating) function
        Gl_vals = np.random.uniform(low=-1.0,
                                    high=1.0,
                                    size=(fan_out,
                                          config.num_gates + 2)).astype(floatX)
        self.Gl = theano.shared(value=Gl_vals, name='G_l', borrow=True)

        Gr_vals = np.random.uniform(low=-1.0,
                                    high=1.0,
                                    size=(fan_out,
                                          config.num_gates + 2)).astype(floatX)
        self.Gr = theano.shared(value=Gr_vals, name='G_r', borrow=True)

        self.Gb = theano.shared(value=np.zeros(config.num_gates + 2,
                                               dtype=floatX),
                                name='G_b',
                                borrow=True)
        # Stack all the model parameters
        self.params = [
            self.U, self.Wl, self.Wr, self.Wb, self.Gl, self.Gr, self.Gb
        ]
        self.num_params = fan_in * fan_out + 2 * config.num_gates * fan_out * fan_out + config.num_gates * fan_out + \
                          2 * (config.num_gates+2) * fan_out + config.num_gates + 2
        # Length of the time sequence
        self.nsteps = self.input.shape[0]
        # Building ExtGrCNNEncoder pyramids
        self.pyramids, _ = theano.scan(fn=self._step_prop,
                                       sequences=T.arange(self.nsteps - 1),
                                       non_sequences=self.nsteps,
                                       outputs_info=[self.hidden0],
                                       n_steps=self.nsteps - 1)
        self.output = self.pyramids[-1][0].dimshuffle('x', 0)
        # Compression -- Encoding function
        self.compress = theano.function(inputs=[self.input],
                                        outputs=self.output)
        if verbose:
            logger.debug(
                'Finished constructing the structure of ExtGrCNN Encoder: ')
            logger.debug('Size of the input dimension: %d' % fan_in)
            logger.debug('Size of the hidden dimension: %d' % fan_out)
            logger.debug('Number of gating functions: %d' % config.num_gates)
            logger.debug('Number of parameters in ExtGrCNN: %d' %
                         self.num_params)
            logger.debug('Activation function: %s' % config.activation)

    def _step_prop(self, iter, current_level, nsteps):
        '''
        @iter: Scan iteration index, i.e., the pyramid level currently being built.
        @current_level: Input matrix at the current level. The first dimension corresponds to the time dimension
        while the second dimension corresponds to the dimension of the hidden representation.
        @nsteps: Length of the original input sequence.
        '''
        # Build the left- and right-shifted matrices, of size T x d. Due to the constraints of theano.scan,
        # the state passed between iterations must keep its shape, so only the leading rows are meaningful.
        left_current_level = current_level[:nsteps - iter - 1]
        right_current_level = current_level[1:nsteps - iter]
        # Compute the temporary central multi-representation, of size TxKxd, where T is the dimension of
        # time, K is the dimension of number of gates and d is the dimension of hidden representation
        multi_centrals = self.act.activate(
            T.dot(left_current_level, self.Wl) +
            T.dot(right_current_level, self.Wr) + self.Wb)
        # Compute the gating function, of size Tx(K+2)
        multi_gates = T.nnet.softmax(
            T.dot(left_current_level, self.Gl) +
            T.dot(right_current_level, self.Gr) + self.Gb)
        # Softmax-Gating combination
        multi_gates = multi_gates.dimshuffle(0, 1, 'x')
        next_level = multi_gates[:, 1:-1, :] * multi_centrals
        next_level = T.sum(next_level, axis=1)
        next_level += multi_gates[:, 0] * left_current_level + \
                      multi_gates[:, -1] * right_current_level
        return T.set_subtensor(current_level[:nsteps - iter - 1], next_level)

    def encode(self, inputM):
        '''
        @inputM: Theano symbolic matrix. Compress the input matrix into an output vector. The first dimension
                 of inputM should correspond to the time dimension.
        '''
        hidden = T.dot(inputM, self.U)
        nsteps = inputM.shape[0]
        pyramids, _ = theano.scan(fn=self._step_prop,
                                  sequences=T.arange(nsteps - 1),
                                  non_sequences=nsteps,
                                  outputs_info=[hidden],
                                  n_steps=nsteps - 1)
        output = pyramids[-1][0].dimshuffle('x', 0)
        return output
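# A small sanity-check sketch (not from the original example) for the closed-form parameter
# count assigned to self.num_params in ExtGrCNNEncoder.__init__. The sizes fan_in=50,
# fan_out=30, num_gates=3 are illustrative assumptions; the check enumerates the parameter
# shapes declared above and compares their total size against the formula.
import numpy as np

fan_in, fan_out, num_gates = 50, 30, 3
shapes = [
    (fan_in, fan_out),                 # U
    (num_gates, fan_out, fan_out),     # Wl
    (num_gates, fan_out, fan_out),     # Wr
    (num_gates, fan_out),              # Wb
    (fan_out, num_gates + 2),          # Gl
    (fan_out, num_gates + 2),          # Gr
    (num_gates + 2,),                  # Gb
]
counted = sum(int(np.prod(s)) for s in shapes)
formula = fan_in * fan_out + 2 * num_gates * fan_out * fan_out + num_gates * fan_out + \
          2 * (num_gates + 2) * fan_out + num_gates + 2
assert counted == formula              # both evaluate to 7295 for these sizes
print(counted)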