# NOTE: these snippets assume the following module-level imports:
#   import numpy
#   import theano
#   import theano.tensor as T
#   from theano.tensor.nnet import conv2d
#   from theano.tensor.nnet.bn import batch_normalization

def __init__(self, rng, input, n_in, n_out, W=None, b=None,
             gamma=None, beta=None, activation_function=T.tanh):
    """
    Typical hidden layer of an MLP: units are fully connected and use a
    tanh activation function by default. The weight matrix W has shape
    (n_in, n_out) and the bias vector b has shape (n_out,).

    Hidden unit activation is given by: tanh(dot(input, W) + b)

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.dmatrix
    :param input: a symbolic tensor of shape (n_examples, n_in)

    :type n_in: int
    :param n_in: dimensionality of input

    :type n_out: int
    :param n_out: number of hidden units

    :param W: optional initial weight values (numpy array) or an existing shared variable
    :param b: optional initial bias values (numpy array) or an existing shared variable
    :param gamma: optional initial batch-normalization scale values
    :param beta: optional initial batch-normalization shift values

    :type activation_function: theano.Op or function
    :param activation_function: non-linearity to be applied in the hidden layer
    """
    self.input = input

    # Note: optimal initialization of weights depends on the activation
    # function used (among other things). For example, results presented in
    # [Xavier10] suggest that you should use 4 times larger initial weights
    # for sigmoid compared to tanh. We have no information for other
    # functions, so we use the same scheme as for tanh.
    W_values = W
    if W is None:
        W_values = numpy.asarray(
            rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=theano.config.floatX
        )
        if activation_function == theano.tensor.nnet.sigmoid:
            W_values *= 4
    # wrap plain numpy arrays in a shared variable; existing shared
    # variables are used as-is
    if isinstance(W_values, numpy.ndarray):
        W_values = theano.shared(value=W_values, name='W', borrow=True)
    self.W = W_values

    b_values = b
    if b is None:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
    if isinstance(b_values, numpy.ndarray):
        b_values = theano.shared(value=b_values, name='b', borrow=True)
    self.b = b_values

    gamma_val = gamma
    if gamma is None:
        gamma_val = numpy.ones((n_out,), dtype=theano.config.floatX)
    if isinstance(gamma_val, numpy.ndarray):
        gamma_val = theano.shared(value=gamma_val, name='gamma', borrow=True)
    self.gamma = gamma_val

    beta_val = beta
    if beta is None:
        beta_val = numpy.zeros((n_out,), dtype=theano.config.floatX)
    if isinstance(beta_val, numpy.ndarray):
        beta_val = theano.shared(value=beta_val, name='beta', borrow=True)
    self.beta = beta_val

    # linear output
    lin_output = T.dot(input, self.W) + self.b

    # batch-normalize with per-unit statistics over the mini-batch
    bn_output = batch_normalization(
        inputs=lin_output,
        gamma=self.gamma,
        beta=self.beta,
        mean=lin_output.mean(axis=0, keepdims=True),
        std=lin_output.std(axis=0, keepdims=True),
        mode='high_mem'
    )

    if activation_function is None:
        self.output = bn_output
    elif activation_function == T.nnet.relu:
        # clipped ReLU: cap activations at 20
        self.output = T.clip(bn_output, 0, 20)
    else:
        self.output = activation_function(bn_output)

    # parameters of the model
    self.params = [self.W, self.b, self.gamma, self.beta]
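# The weight initialization above follows the [Xavier10] uniform scheme. A
# minimal, self-contained numpy sketch of that rule (the helper name
# `xavier_uniform` and the example dimensions are illustrative, not part of
# the code above):
import numpy

def xavier_uniform(rng, n_in, n_out, for_sigmoid=False):
    # sample W uniformly from [-sqrt(6/(n_in+n_out)), +sqrt(6/(n_in+n_out))]
    bound = numpy.sqrt(6. / (n_in + n_out))
    W = rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
    if for_sigmoid:
        # [Xavier10] suggests ~4x larger initial weights for sigmoid than tanh
        W *= 4
    return W

# example: a 784 -> 500 weight matrix for a tanh hidden layer
# W0 = xavier_uniform(numpy.random.RandomState(1234), 784, 500)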
def __init__(self, rng, is_train, input_data, filter_shape, image_shape,
             ssample=(1, 1), bordermode='valid', p=0.5, alpha=0.0):
    """
    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type is_train: theano symbolic int scalar
    :param is_train: non-zero at training time (dropout applied), zero at
                     test time (activations scaled by p instead)

    :type input_data: theano.tensor.dtensor4
    :param input_data: symbolic image tensor, of shape image_shape

    :type filter_shape: tuple or list of length 4
    :param filter_shape: (number of filters, num input feature maps,
                          filter height, filter width)

    :type image_shape: tuple or list of length 4
    :param image_shape: (batch size, num input feature maps,
                         image height, image width)

    :type ssample: tuple or list of length 2
    :param ssample: subsampling (stride) factor of the convolution (#rows, #cols)

    :type bordermode: str
    :param bordermode: border mode passed to conv2d (e.g. 'valid' or 'full')

    :type p: float
    :param p: retention probability for dropout

    :type alpha: float
    :param alpha: slope for negative inputs of the (leaky) ReLU
    """
    assert image_shape[1] == filter_shape[1]

    # there are "num input feature maps * filter height * filter width"
    # inputs to each hidden unit
    fan_in = numpy.prod(filter_shape[1:])

    # each unit in the lower layer receives a gradient from:
    # "num output feature maps * filter height * filter width" / subsampling factor
    fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) //
               numpy.prod(ssample))

    # initialize weights with random weights
    W_bound = numpy.sqrt(6. / (fan_in + fan_out))
    self.W = theano.shared(
        numpy.asarray(
            rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
            dtype=theano.config.floatX
        ),
        borrow=True
    )

    # the bias is a 1D tensor -- one bias per output feature map
    b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
    self.b = theano.shared(value=b_values, borrow=True)

    # batch-normalization scale and shift, one per output feature map
    gamma_values = numpy.ones((filter_shape[0],), dtype=theano.config.floatX)
    self.gamma = theano.shared(value=gamma_values, borrow=True)

    beta_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
    self.beta = theano.shared(value=beta_values, borrow=True)

    # convolve input feature maps with filters
    conv_out = conv2d(
        input=input_data,
        filters=self.W,
        filter_shape=filter_shape,
        input_shape=image_shape,
        subsample=ssample,
        border_mode=bordermode
    )

    # add the bias term. Since the bias is a vector (1D array), we first
    # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
    # thus be broadcasted across mini-batches and feature map width & height
    lin_output = conv_out + self.b.dimshuffle('x', 0, 'x', 'x')

    # batch-normalize over the mini-batch axis
    bn_output = batch_normalization(
        inputs=lin_output,
        gamma=self.gamma.dimshuffle('x', 0, 'x', 'x'),
        beta=self.beta.dimshuffle('x', 0, 'x', 'x'),
        mean=lin_output.mean((0,), keepdims=True),
        std=lin_output.std((0,), keepdims=True),
        mode='low_mem'
    )

    activated_output = T.nnet.relu(bn_output, alpha=alpha)

    # dropout at training time; scale by the retention probability p at test time
    dropped_output = drop(activated_output, p)
    self.output = T.switch(T.neq(is_train, 0), dropped_output, p * activated_output)

    # store parameters of this layer
    self.params = [self.W, self.b, self.gamma, self.beta]

    # keep track of model input
    self.input = input_data
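# The `drop` helper called above is not defined in this section. A minimal
# sketch of what it is assumed to do (keep each activation with probability p,
# matching the `p * activated_output` scaling used at test time); the seed
# value and the helper's exact signature are assumptions:
import theano
from theano.tensor.shared_randomstreams import RandomStreams

_srng = RandomStreams(seed=12345)  # assumed seed, for illustration only

def drop(input, p=0.5):
    # sample a 0/1 mask, keeping each element with probability p,
    # and zero out the rest
    mask = _srng.binomial(n=1, p=p, size=input.shape,
                          dtype=theano.config.floatX)
    return input * mask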
def __init__(self, input, n_in, n_out, W=None, b=None, gamma=None, beta=None):
    """
    Initialize the parameters of the linear regression.

    :type input: theano.tensor.TensorType
    :param input: symbolic variable that describes the input of the
                  architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie
    """
    self.n_in = n_in
    self.n_out = n_out

    # initialize the weights W, a matrix of shape (n_in, n_out), with
    # uniform random values in the [Xavier10] range
    W_values = W
    if W is None:
        W_values = numpy.asarray(
            numpy.random.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=theano.config.floatX
        )
    self.W = theano.shared(value=W_values, name='W', borrow=True)

    b_values = b
    if b is None:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
    self.b = theano.shared(value=b_values, name='b', borrow=True)

    gamma_val = gamma
    if gamma is None:
        gamma_val = numpy.ones((n_out,), dtype=theano.config.floatX)
    self.gamma = theano.shared(value=gamma_val, name='gamma', borrow=True)

    beta_val = beta
    if beta is None:
        beta_val = numpy.zeros((n_out,), dtype=theano.config.floatX)
    self.beta = theano.shared(value=beta_val, name='beta', borrow=True)

    # keep track of model input
    self.input = input

    # linear regression output before normalization
    linear = T.dot(self.input, self.W) + self.b

    # batch-normalize with per-output statistics over the mini-batch
    bn_output = batch_normalization(
        inputs=linear,
        gamma=self.gamma,
        beta=self.beta,
        mean=linear.mean((0,), keepdims=True),
        std=linear.std((0,), keepdims=True),
        mode='high_mem'
    )

    # output of the model
    self.output = bn_output

    # parameters of the model
    self.params = [self.W, self.b, self.gamma, self.beta]

    # L1 and L2 regularization terms on the weights
    self.L1 = T.sum(abs(self.W))
    self.L2 = T.sum(self.W ** 2)
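# For reference, a self-contained numpy sketch of what the batch_normalization
# call above computes, in effect (per-output-column statistics over the
# mini-batch); the data below are arbitrary illustration values:
import numpy

x = numpy.random.randn(8, 3).astype('float32')    # a (batch, n_out) linear output
gamma = numpy.ones(3, dtype='float32')            # scale, as initialized above
beta = numpy.zeros(3, dtype='float32')            # shift, as initialized above

mean = x.mean(axis=0, keepdims=True)
std = x.std(axis=0, keepdims=True)
bn = gamma * (x - mean) / std + beta              # batch-normalized output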