def get_square_norm_gradients_scan(D_by_layer, cost, accum = 0):

    # This returns a theano variable that will be of shape (minibatch_size, ).
    # It will contain, for each training example, the associated square-norm of the total gradient.
    # If you take the element-wise square-root afterwards, you will get
    # the associated 2-norms, which is what you want for importance sampling.

    for (layer_name, D) in D_by_layer.items():

        backprop_output = tensor.grad(cost, D['output'])

        if D.has_key('weight'):
            A = D['input']
            B = backprop_output
            S, _ =  theano.scan(fn=lambda A, B: tensor.sqr(tensor.outer(A,B)).sum(),
                                        sequences=[A,B])
            accum = accum + S

        if D.has_key('bias'):

            B = backprop_output
            S, _ =  theano.scan(fn=lambda B: tensor.sqr(B).sum(),
                                        sequences=[B])
            accum = accum + S
        
    return accum
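
A hedged usage sketch for the function above, assuming the layer dictionary layout ({'input', 'output', 'weight', 'bias'}) that the code indexes; the tiny dense layer and all names below are illustrative only.

import numpy as np
import theano
import theano.tensor as tensor

x = tensor.matrix('x')
W = theano.shared(np.random.randn(5, 3).astype(theano.config.floatX), name='W')
b = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='b')
out = tensor.dot(x, W) + b
cost = tensor.sqr(out).sum()

D_by_layer = {'dense': {'input': x, 'output': out, 'weight': W, 'bias': b}}
square_norms = get_square_norm_gradients_scan(D_by_layer, cost)
f = theano.function([x], square_norms)
# f(batch) returns one squared total-gradient norm per row of `batch`;
# take an element-wise square root to get the 2-norms for importance sampling.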
Example #2
    def contraction_penalty(self, inputs):
        """
        Calculate (symbolically) the contracting autoencoder penalty term.

        Parameters
        ----------
        inputs : tensor_like or list of tensor_likes
            Theano symbolic (or list thereof) representing the input
            minibatch(es) on which the penalty is calculated. Assumed to be
            2-tensors, with the first dimension indexing training examples and
            the second indexing data dimensions.

        Returns
        -------
        jacobian : tensor_like
            1-dimensional tensor representing, for each mini-batch
            example, the penalty of the encoder transformation.

            Add this to the output of a Cost object, such as
            SquaredError, to penalize it.
        """
        act_grad = self._activation_grad(inputs)
        frob_norm = tensor.dot(tensor.sqr(act_grad), tensor.sqr(self.weights).sum(axis=0))
        contract_penalty = frob_norm.sum() / inputs.shape[0]
        return contract_penalty
Example #3
def orthogonal_penalty(W, D, epsilon=1e-6, axis=1):
    num = T.sqr(T.sum(W * D, axis=axis))                 # n = (d^T w)^2
    den = T.sum(T.sqr(W), axis=axis) * T.sum(T.sqr(D), axis=axis)  # d = ||w||_2^2 * ||d||_2^2
    cos = num / den                                      # c = n / d
    value = cos - (epsilon**2)                           # v = c - epsilon^2
    hinge = value * (value > 0)                          # h = [ v ]_+
    return T.sum(hinge)
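
A hedged check of the penalty on small shared variables; the shapes and random values below are illustrative only.

import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.random.randn(4, 3).astype(theano.config.floatX), name='W')
D = theano.shared(np.random.randn(4, 3).astype(theano.config.floatX), name='D')
penalty = orthogonal_penalty(W, D)   # scalar: summed hinge on the squared cosines
penalty_fn = theano.function([], penalty)
print penalty_fn()   # near 0 when each row of W is (nearly) orthogonal to the matching row of D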
def get_mean_square_norm_gradients_variance_method_00(D_by_layer, cost, accum = 0):

    # This returns a theano variable that will be of shape (minibatch_size, ).
    # It will contain, for each training example, the associated mean of the
    # variance wrt the gradient of that minibatch.

    for (layer_name, D) in D_by_layer.items():

        input = D['input']
        input_square_norms = tensor.sqr(D['input']).sum(axis=1)
        backprop_output = tensor.grad(cost, D['output'])
        # I don't think that theano recomputes this.
        # It should be just redundant nodes in the computational graph
        # that end up being computed only once anyways.
        grad_weight = tensor.grad(cost, D['weight'])
        grad_bias = tensor.grad(cost, D['bias'])
        backprop_output_square_norms = tensor.sqr(backprop_output).sum(axis=1)

        if D.has_key('weight'):
            A = input_square_norms * backprop_output_square_norms
            C = tensor.sqr(grad_weight).sum() # all the terms get this "middle" expression added to them
            B = (backprop_output.dot(grad_weight.T) * input).sum(axis=1)

            accum += (A - 2*B + C)

        if D.has_key('bias'):
            # this last `sum` could be a component-wise `max` if we wanted
            # to carry the maximum of the variances instead of the sum of squares
            accum = accum + tensor.sqr(backprop_output - grad_bias.reshape((1,-1))).sum(axis=1)


    return accum
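
For reference, the weight terms accumulated above expand a squared distance; a hedged sketch of the identity, in the same names as the code:

# Per example i, the gradient of the cost w.r.t. the weight matrix is the outer
# product g_i = outer(input_i, backprop_output_i), while grad_weight is the
# gradient for the whole minibatch. The accumulated terms are
#   A_i = ||input_i||^2 * ||backprop_output_i||^2 = ||g_i||_F^2
#   B_i = <g_i, grad_weight>_F
#   C   = ||grad_weight||_F^2
# so A_i - 2*B_i + C = ||g_i - grad_weight||_F^2: the squared deviation of the
# per-example gradient from the minibatch gradient (and similarly for the bias).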
Example #5
def sgd_updates_adadelta(params,cost,rho=0.95,epsilon=1e-6,norm_lim=9,word_vec_name='Words'):
    """
    adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)
    """
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = []
    for param in params:
        empty = numpy.zeros_like(param.get_value())
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name)
        gparams.append(gp)
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step =  -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            tmp=stepped_param * scale
            tmp=T.cast(tmp,'float32')
            #print param.type,tmp.type
            updates[param] = tmp
        else:
            updates[param] = stepped_param
            #print param.type,stepped_param.type
    return updates 
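
A hedged usage sketch for the update rule above, assuming theano.config.floatX == 'float32' (to match the float32 cast inside the loop) and a toy linear model; as_floatX is assumed to cast arrays to floatX, as in the source project.

import numpy
import theano
import theano.tensor as T
from collections import OrderedDict

def as_floatX(x):
    return numpy.asarray(x, dtype=theano.config.floatX)

x = T.matrix('x')
y = T.matrix('y')
W = theano.shared(as_floatX(numpy.random.randn(10, 2)), name='W')
b = theano.shared(as_floatX(numpy.zeros(2)), name='b')
cost = T.mean(T.sqr(T.dot(x, W) + b - y))

updates = sgd_updates_adadelta([W, b], cost, rho=0.95, epsilon=1e-6)
train_fn = theano.function([x, y], cost, updates=updates)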
Example #6
 def updates(self, cost, params, learning_rate = 0.1, momentum= 0.95, rescale=5.):
     grads = T.grad(cost, params)
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1e-4
     updates = []
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (
             1. - combination_coeff) * grad
         rms_grad = T.sqrt(new_square - new_avg ** 2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - learning_rate * grad / rms_grad
         update2 = momentum * momentum * memory - (
             1 + momentum) * learning_rate * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     return updates
def batchnorm(X, rescale=None, reshift=None, u=None, s=None, e=1e-8):
    """
    batchnorm with support for not using scale and shift parameters
    as well as inference values (u and s) and partial batchnorm (via a)
    will detect and use convolutional or fully connected version
    """
    g = rescale
    b = reshift
    if X.ndim == 4:
        if u is not None and s is not None:
            # use normalization params given a priori
            b_u = u.dimshuffle('x', 0, 'x', 'x')
            b_s = s.dimshuffle('x', 0, 'x', 'x')
        else:
            # compute normalization params from input
            b_u = T.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        # batch normalize
        X = (X - b_u) / T.sqrt(b_s + e)
        if g is not None and b is not None:
            # apply rescale and reshift
            X = X*T.exp(0.2*g.dimshuffle('x', 0, 'x', 'x')) + b.dimshuffle('x', 0, 'x', 'x')
    elif X.ndim == 2:
        if u is None and s is None:
            # compute normalization params from input
            u = T.mean(X, axis=0)
            s = T.mean(T.sqr(X - u), axis=0)
        # batch normalize
        X = (X - u) / T.sqrt(s + e)
        if g is not None and b is not None:
            # apply rescale and reshift
            X = X*T.exp(0.2*g) + b
    else:
        raise NotImplementedError
    return X
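
A hedged usage sketch for the fully connected (2-D) branch of the function above; the feature width of 64 and the shared rescale/reshift vectors are illustrative only.

import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')
g = theano.shared(np.zeros(64, dtype=theano.config.floatX), name='g')
b = theano.shared(np.zeros(64, dtype=theano.config.floatX), name='b')
X_bn = batchnorm(X, rescale=g, reshift=b)   # per-feature statistics computed from the batch
bn_fn = theano.function([X], X_bn)
# bn_fn(batch) expects a (batch_size, 64) array; pass u= and s= to reuse
# precomputed inference-time statistics instead of the batch statistics.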
Example #8
def get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=9, word_vec_name='W_emb'):
    """
    adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)
    """
    print "Generating adadelta updates"
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = []
    for param in params:
        exp_sqr_grads[param] = build_shared_zeros(param.shape.eval(), name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = build_shared_zeros(param.shape.eval(), name="exp_grad_%s" % param.name)
        gparams.append(gp)
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step =  -(T.sqrt(exp_su + eps) / T.sqrt(up_exp_sg + eps)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        # if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
        if max_norm and param.name != word_vec_name:
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(max_norm))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
def sgd_updates_adadelta(params, cost, rho=0.95, epsilon=1e-6,
        norm_lim=9, word_vec_name='embedding'):
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = [] 
    for param in params:
        empty = np.zeros_like(param.get_value())
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name)
        gparams.append(gp)

    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param] 
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step =  -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        
        if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0)) 
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim)) 
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
Example #10
    def _calc_regularization_cost(self):
        """Calculate the regularization cost given the weight decay parameters.

        Only parameters stored in the set self.regularize are considered. We
        need to handle this manually in this class because the weight matrices
        contain bias columns, which should not be included in the
        regularization computation. Therefore, do not add W1 and W2 to
        self.regularize.

        Returns
        -------
        theano variable
            regularization cost depending on the parameters to be regularized
            and the weight decay parameters for L1 and L2 regularization.
        """
        cost = super(SLmNce, self)._calc_regularization_cost()
        l1_cost = T.sum(T.abs_(self.W1[:, :-1]))
        l1_cost += T.sum(T.abs_(self.W2[:, :-1]))
        l2_cost = T.sum(T.sqr(self.W1[:, :-1]))
        l2_cost += T.sum(T.sqr(self.W2[:, :-1]))

        if self.l1_weight != 0:
            cost += self.l1_weight * l1_cost

        if self.l2_weight != 0:
            cost += self.l2_weight * l2_cost

        return cost
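
A standalone sketch of the same idea on a single hypothetical weight matrix whose last column holds the biases; the decay coefficients are illustrative only.

import theano.tensor as T

W1 = T.matrix('W1')
l1_cost = T.sum(T.abs_(W1[:, :-1]))             # exclude the bias column from L1
l2_cost = T.sum(T.sqr(W1[:, :-1]))              # exclude the bias column from L2
reg_cost = 0.001 * l1_cost + 0.0001 * l2_cost   # illustrative weight-decay coefficients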
Example #11
 def applyConstraint(self, param):
     if param.ndim != 4 and param.ndim != 2:
         warnings.warn("Norm constraints are normally applied to matrices"
                       +" or 4-dimensional tensors, but currently got "
                       +"%d dimensions, please make sure this is the desired"
                       +" parameter to apply norm constraints" % param.ndim)
         
     needFlip = False
     if param.ndim == 4: # a hack for conv layer filters
         prevShape = param.shape
         # conv layer filter shape is (nChannelOut, nChannelIn, r, c)
         param = param.flatten(2)
         # now it is (nout, nin), which is different from (nin, nout) 
         # from fully connected networks, so we need to flip here
         needFlip = True
     
     if needFlip:
         col_norm = T.sqrt(T.sum(T.sqr(param), axis=1, keepdims=True))
     else:
         col_norm = T.sqrt(T.sum(T.sqr(param), axis=0, keepdims=True))
         
     param /= (col_norm+1e-7)
     param *= self.norm
     
     if needFlip:
         param = param.reshape(prevShape)
                     
     return param
Example #12
    def get_updates_adadelta(grads,params,decay=0.95):
        decay = constantX(decay)
        print 'build updates with adadelta'
        for param, grad in zip(params, grads):
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(numpy.zeros(param.get_value().shape, dtype=floatX))
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(numpy.zeros(param.get_value().shape, dtype=floatX))
            if param.name is not None:
                mean_square_grad.name = 'mean_square_grad_' + param.name
                mean_square_dx.name = 'mean_square_dx_' + param.name

            # Accumulate gradient
            new_mean_squared_grad = \
                    decay * mean_square_grad +\
                    (1. - decay) * T.sqr(grad)
            # Compute update
            epsilon = constantX(1e-7)
            rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
            delta_x_t = - rms_dx_tm1 / rms_grad_t * grad

            # Accumulate updates
            new_mean_square_dx = \
                    decay * mean_square_dx + \
                    (1. - decay) * T.sqr(delta_x_t)

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t
Example #13
def AdadeltaUpdate(params,cost,stepSize=1.0,rho=0.95,epsilon=1e-6,norm_lim=9):
    updates=OrderedDict({})
    exp_sqr_grads=OrderedDict({})
    exp_sqr_update=OrderedDict({})
    g_params=[]
    for param in params:
        empty=np.zeros_like(param.get_value())
        exp_sqr_grads[param]=theano.shared(value=as_floatX(empty),name='exp_grad_%s'%param.name)
        exp_sqr_update[param]=theano.shared(value=as_floatX(empty),name='exp_grad_%s'%param.name)
        gp=T.grad(cost,param)
        g_params.append(gp)
    for param,gp in zip(params,g_params):
        exp_sg=exp_sqr_grads[param]
        exp_su=exp_sqr_update[param]
        update_exp_sg=rho*exp_sg+(1-rho)*T.sqr(gp)  # running average of the squared gradients
        updates[exp_sg]=update_exp_sg
        
        step=-(T.sqrt(exp_su+epsilon)/T.sqrt(update_exp_sg+epsilon))*gp
        stepped_param=param+step*stepSize
        
        update_exp_su=rho*exp_su+(1-rho)*T.sqr(step)
        updates[exp_su]=update_exp_su

        if param.get_value(borrow=True).ndim==2 and param.name!='wordVec':
            col_norms=T.sqrt(T.sum(T.sqr(stepped_param),axis=0))
        desired_norms=T.clip(col_norms,0,T.sqrt(norm_lim))  # cap each column norm at sqrt(norm_lim)
            scale=desired_norms/(1e-7+col_norms)
            updates[param]=stepped_param*scale
        else:
            updates[param]=stepped_param
    return updates
Example #14
    def get_regs(self, states_0_, states, M):
        """
        Additional regularization terms.

        """
        regs = 0

        if self.L1_Wrec > 0:
            W = self.params['Wrec']
            regs += self.L1_Wrec * tensor.mean(abs(W))

        if self.L2_Wrec > 0:
            W = self.params['Wrec']
            regs += self.L2_Wrec * tensor.mean(tensor.sqr(W))

        #---------------------------------------------------------------------------------
        # Firing rates
        #---------------------------------------------------------------------------------

        if self.L2_r > 0:
            baseline = 0.

            M_ = (tensor.tile(M.T, (states.shape[-1], 1, 1))).T
            states_all = tensor.concatenate(
                [states_0_.reshape((1, states_0_.shape[0], states_0_.shape[1])), states],
                axis=0
                )
            r = self.f_hidden(states_all)
            regs += self.L2_r * tensor.sum(tensor.sqr(r - baseline)*M_)/tensor.sum(M_)

        #---------------------------------------------------------------------------------

        return regs
Example #15
def build_cost_functional_L2norm_w_reg(lambda_val,h,y_sym,Thetas):
	""" 
	build_cost_functional_L2norm (with regularization) J=J_y(Theta,b) # J\equiv J_y(\Theta,b), 
	for the L2 norm, or Euclidean space norm, but now with 
	regularization

	INPUT/PARAMETERS
	================
	@type y_sym  : theano symbolic matrix, such as T.matrix() or theano shared variable
	@param y_sym : output data as a symbolic theano variable or theano shared variable
NOTE: y_sym = T.matrix(); # this could be a vector, but I can keep y to be "general" in size dimensions
	
	@type h     : theano shared variable of size dims. (K,m) (the size dims. might be (m,K) due to right action)
	@param h    : hypothesis

	@type Thetas : tuple, list, or (ordered) iterable of Theta's as theano shared variables, of length L
	@params Thetas : weights or parameters thetas for all the layers l=1,2,...L-1
	NOTE: remember, we want a list of theano MATRICES, themselves, not the class

	RETURN/OUTPUTS
	==============
	@type J_theta : theano symbolic expression (computational graph)

	"""
	J_theta = np.cast[theano.config.floatX](0.5) * T.mean(T.sqr(h-y_sym))

	# T.sqr is an element-wise operation (it takes the square of each element), so it maps the tensor to one of the same shape
	reg_term = T.mean( [ T.sum( T.sqr(Theta), acc_dtype=theano.config.floatX) for Theta in Thetas], acc_dtype=theano.config.floatX )
	reg_term = np.cast[theano.config.floatX](lambda_val/ (2.))*reg_term

	J_theta = J_theta + reg_term
	return J_theta
Example #16
 def cost(self):
   """
   :rtype: (theano.Variable | None, dict[theano.Variable,theano.Variable] | None)
   :returns: cost, known_grads
   """
   known_grads = None
   if self.loss == 'ce' or self.loss == 'priori':
     if self.attrs.get("target", "").endswith("[sparse:coo]"):
       assert isinstance(self.y, tuple)
       assert len(self.y) == 3
       from NativeOp import crossentropy_softmax_and_gradient_z_sparse
       y_mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
       ce, grad_z = crossentropy_softmax_and_gradient_z_sparse(
         self.z, self.index, self.y[0], self.y[1], self.y[2], y_mask)
       return self.norm * T.sum(ce), {self.z: grad_z}
     if self.y_data_flat.type == T.ivector().type:
       # Use crossentropy_softmax_1hot to have a more stable and more optimized gradient calculation.
       # Theano fails to use it automatically; I guess our self.i indexing is too confusing.
       #idx = self.index.flatten().dimshuffle(0,'x').repeat(self.y_m.shape[1],axis=1) # faster than line below
       #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m * idx, y_idx=self.y_data_flat * self.index.flatten())
       nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
       #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
       #nll = -T.log(T.nnet.softmax(self.y_m)[self.i,self.y_data_flat[self.i]])
       #z_c = T.exp(self.z[:,self.y])
       #nll = -T.log(z_c / T.sum(z_c,axis=2,keepdims=True))
       #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
       #nll = T.set_subtensor(nll[self.j], T.constant(0.0))
     else:
       nll = -T.dot(T.log(T.clip(self.p_y_given_x[self.i], 1.e-38, 1.e20)), self.y_data_flat[self.i].T)
     return self.norm * T.sum(nll), known_grads
   elif self.loss == 'entropy':
     h_e = T.exp(self.y_m) #(TB)
     pcx = T.clip((h_e / T.sum(h_e, axis=1, keepdims=True)).reshape((self.index.shape[0],self.index.shape[1],self.attrs['n_out'])), 1.e-6, 1.e6) # TBD
     ee = -T.sum(pcx[self.i] * T.log(pcx[self.i])) # TB
     #nll, pcxs = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y[self.i])
     nll, _ = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat) # TB
     ce = nll.reshape(self.index.shape) * self.index # TB
     y = self.y_data_flat.reshape(self.index.shape) * self.index # TB
     f = T.any(T.gt(y,0), axis=0) # B
     return T.sum(f * T.sum(ce, axis=0) + (1-f) * T.sum(ee, axis=0)), known_grads
     #return T.sum(T.switch(T.gt(T.sum(y,axis=0),0), T.sum(ce, axis=0), -T.sum(ee, axis=0))), known_grads
     #return T.switch(T.gt(T.sum(self.y_m[self.i]),0), T.sum(nll), -T.sum(pcx * T.log(pcx))), known_grads
   elif self.loss == 'priori':
     pcx = self.p_y_given_x[self.i, self.y_data_flat[self.i]]
     pcx = T.clip(pcx, 1.e-38, 1.e20)  # For pcx near zero, the gradient will likely explode.
     return -T.sum(T.log(pcx)), known_grads
   elif self.loss == 'sse':
     if self.y_data_flat.dtype.startswith('int'):
       y_f = T.cast(T.reshape(self.y_data_flat, (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim=1), 'int32')
       y_oh = T.eq(T.shape_padleft(T.arange(self.attrs['n_out']), y_f.ndim), T.shape_padright(y_f, 1))
       return T.mean(T.sqr(self.p_y_given_x[self.i] - y_oh[self.i])), known_grads
     else:
       #return T.sum(T.sum(T.sqr(self.y_m - self.y.reshape(self.y_m.shape)), axis=1)[self.i]), known_grads
       return T.sum(T.sqr(self.y_m[self.i] - self.y_data_flat.reshape(self.y_m.shape)[self.i])), known_grads
       #return T.sum(T.sum(T.sqr(self.z - (self.y.reshape((self.index.shape[0], self.index.shape[1], self.attrs['n_out']))[:self.z.shape[0]])), axis=2).flatten()[self.i]), known_grads
       #y_z = T.set_subtensor(T.zeros((self.index.shape[0],self.index.shape[1],self.attrs['n_out']), dtype='float32')[:self.z.shape[0]], self.z).flatten()
       #return T.sum(T.sqr(y_z[self.i] - self.y[self.i])), known_grads
       #return T.sum(T.sqr(self.y_m - self.y[:self.z.shape[0]*self.index.shape[1]]).flatten()[self.i]), known_grads
   else:
     assert False, "unknown loss: %s" % self.loss
Example #17
File: ext.py  Project: Beronx86/cle
    def exe(self, mainloop):
        """
        .. todo::

            WRITEME
        """
        for k, p in mainloop.updates.items():
            for key in self.keys:
                if key in str(k):
                    token = 1

                    for waiver in self.waivers:
                        if waiver in str(k):
                            token = 0

                    if token:
                        updated_param = mainloop.updates[k]

                        if self.is_vector:
                            col_norms = T.sqrt(T.sqr(updated_param).sum(axis=0))
                            desired_norms = T.clip(col_norms, 0, self.weight_norm)
                            ratio = (desired_norms / (1e-7 + col_norms))
                            mainloop.updates[k] = updated_param * ratio
                        else:
                            norm = T.sqrt(T.sqr(updated_param).sum())
                            desired_norm = T.clip(norm, 0, self.weight_norm)
                            ratio = (desired_norm / (1e-7 + norm))
                            mainloop.updates[k] = updated_param * ratio
Example #18
    def mcmc(ll, *frvs):
        full_observations = dict(observations)
        full_observations.update(dict([(rv, s) for rv, s in zip(free_RVs, frvs)]))
        
        loglik = -full_log_likelihood(full_observations)

        proposals = free_RVs_prop
        H = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + loglik

# -- this should be an inner loop
        g = []
        g.append(tensor.grad(loglik, frvs))
        
        proposals = [(p - epsilon*gg[0]/2.) for p, gg in zip(proposals, g)]

        rvsp = [(rvs + epsilon*rvp) for rvs,rvp in zip(frvs, proposals)]
        
        full_observations = dict(observations)
        full_observations.update(dict([(rv, s) for rv, s in zip(free_RVs, rvsp)]))
        new_loglik = -full_log_likelihood(full_observations)
        
        gnew = []
        gnew.append(tensor.grad(new_loglik, rvsp))
        proposals = [(p - epsilon*gn[0]/2.) for p, gn in zip(proposals, gnew)]
# --
        
        Hnew = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + new_loglik

        dH = Hnew - H
        accept = tensor.or_(dH < 0., U < tensor.exp(-dH))

        return [tensor.switch(accept, -new_loglik, ll)] + \
            [tensor.switch(accept, p, f) for p, f in zip(rvsp, frvs)], \
            {}, theano.scan_module.until(accept)
Example #19
    def free_energy(self, V):
        """
        .. todo::

            WRITEME
        """
        V_name = 'V' if V.name is None else V.name

        assert V.ndim == 2

        bias_term = T.dot(V,self.bias_vis)
        bias_term.name = 'bias_term'
        assert len(bias_term.type.broadcastable) == 1

        sq_term = 0.5 * T.sqr(V).sum(axis=1)
        sq_term.name = 'sq_term'
        assert len(sq_term.type.broadcastable) == 1

        softplus_term =  T.nnet.softplus( (self.transformer.lmul(V)+self.bias_hid) / T.sqr(self.sigma)).sum(axis=1)
        assert len(softplus_term.type.broadcastable) == 1
        softplus_term.name = 'softplus_term'

        return (
                sq_term
                - bias_term
                ) / T.sqr(self.sigma) - softplus_term
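
Written out row-wise (a hedged reading of the code above, with lmul(V) taken to be V dot W):

# F(V_i) = ( 0.5 * ||V_i||^2 - V_i . bias_vis ) / sigma^2
#          - sum_j softplus( ((V_i W + bias_hid)_j) / sigma^2 )
# i.e. the free energy of a Gaussian-visible, binary-hidden energy model,
# returned as one value per row of the (batch_size, n_vis) matrix V.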
Example #20
    def learning_updates(self):
        # This code computes updates only for the given R, so it drops the last dimension. Plus some theano magic to circumvent its graph computation.
        grads = self.grads
        for i, param in enumerate(self.params):

            mean_square_grad = theano.shared(
                np.zeros_like(param.get_value(), dtype=theano.config.floatX), name=param.name + str(self.network.R)+'_msg')

            mean_square_dx = theano.shared(
                np.zeros_like(param.get_value(), dtype=theano.config.floatX), name=param.name + str(self.network.R)+'_dx')


            # Accumulate gradient
            new_mean_squared_grad = (
                self.decay * mean_square_grad +
                (1 - self.decay) * T.sqr(grads[i])
            )

            # Compute update
            epsilon = self.lr
            rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
            delta_x_t = - (rms_dx_tm1 / rms_grad_t) * grads[i]

            # Accumulate updates
            new_mean_square_dx = (
                self.decay * mean_square_dx +
                (1 - self.decay) * T.sqr(delta_x_t)
            )

            # Apply update
            yield mean_square_grad, T.cast(new_mean_squared_grad, dtype=theano.config.floatX)
            yield mean_square_dx, T.cast(new_mean_square_dx, dtype=theano.config.floatX)
            yield param,  param + 2*T.cast(delta_x_t, dtype=theano.config.floatX)
Example #21
    def _step(self, x_tm1, u_tm1, inputs, x_prior, u_prior, *args):
        # x_prior are previous states
        # u_prior are causes from above
        outputs = self.activation(T.dot(x_tm1, self.W))
        rec_error = T.sqr(inputs - outputs).sum()
        causes = (1 + T.exp(-T.dot(u_tm1, self.V))) * .5

        if self.pool_flag:
            batch_size = inputs.shape[0]
            dim = causes.shape[1]
            imgs = T.cast(T.sqrt(dim), 'int64')
            causes_up = causes.reshape(
                (batch_size, 1, imgs, imgs)).repeat(
                    self.pool_size, axis=2).repeat(self.pool_size,
                                                   axis=3).flatten(ndim=2)
        else:
            causes_up = causes

        x = _IstaStep(rec_error, x_tm1, lambdav=self.gamma*causes_up,
                      x_prior=x_prior)

        if self.pool_flag:
            dim = T.cast(T.sqrt(x.shape[1]), 'int64')
            x_pool = x.reshape((batch_size, 1, dim, dim))
            x_pool = max_pool_2d(x_pool, ds=(self.pool_size, )*2).flatten(ndim=2)
        else:
            x_pool = x

        prev_u_cost = .01 * self.gamma * T.sqr(u_tm1-u_prior).sum()
        u_cost = causes * abs(x_pool) * self.gamma + prev_u_cost
        u = _IstaStep(u_cost.sum(), u_tm1, lambdav=self.gamma)
        causes = (1 + T.exp(-T.dot(u, self.V))) * .5
        u_cost = causes * abs(x_pool) * self.gamma

        return (x, u, u_cost, outputs)
Example #22
    def __call__(self, model, X, Y):

        batch_size = 32
        image_size = 96

        Y_hat = model.fprop(X)

        print "Warning: the size of the axe is set manually"
        Yx_hat = Y_hat[:, :image_size]
        Yy_hat = Y_hat[:, image_size:]

        Yx = Y[:, :image_size]
        Yy = Y[:, image_size:]

        epsylon = 1e-10

        costMatrix = T.matrix()
        max_x = T.argmax(Yx, axis=1)
        max_y = T.argmax(Yy, axis=1)

        costMatrix = T.sqr(
            T.log((Yx + epsylon) / (Yx[range(batch_size), max_x] + epsylon)[:, None])
            - T.log((Yx_hat + epsylon) / (Yx_hat[range(batch_size), max_x] + epsylon)[:, None])
        )
        costMatrix += T.sqr(
            T.log((Yy + epsylon) / (Yy[range(batch_size), max_y] + epsylon)[:, None])
            - T.log((Yy_hat + epsylon) / (Yy_hat[range(batch_size), max_y] + epsylon)[:, None])
        )

        costMatrix *= T.neq(T.sum(Y, axis=1), 0)[:, None]

        cost = costMatrix.sum(axis=1).mean()
        return cost
Example #23
  def get_layer_monitoring_channels(self,state_below=None,state=None,target=None):
    rval=OrderedDict()
    W,=self.transformer.get_params()
    rval['norm']=T.sqrt(T.sqr(W).sum())
    if(target is not None) and ((state_below is not None) or (state is not None)):
        if state is None:
            state=self.fprop(state_below)
        target=1.-target  #0/1 dissim/sim to 1/0 distances
        rmse=T.sqrt(T.mean(T.sqr(state-target)))
        rval['rmse']=rmse.mean()
        if self.costfn=='margin':
            thresh=self.costparam
        elif self.costfn=='cauchy':
            thresh=2./(1.+T.exp(self.costparam))
        else:
            thresh=0.5
        yhat=state<thresh
        y=target<0.5
        wrong_bit=T.cast(T.neq(y,yhat),state.dtype)
        rval['01_loss']=wrong_bit.mean()

        y=T.cast(y,state.dtype)
        yhat=T.cast(yhat,state.dtype)
        tp=(y*yhat).sum()
        fp=((1-y)*yhat).sum()
        prec=compute_precision(tp,fp)
        rec=compute_recall(y,tp)
        f1=compute_f1(prec,rec)
        rval['neg_precision']=-prec
        rval['neg_recall']=-rec
        rval['neg_f1']=-f1
    return rval
Example #24
    def get_updates(self, grads):
        grads = OrderedDict(grads)
        updates = OrderedDict()

        for param in grads.keys():
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = theano.shared(theano._asarray(
                param.get_value() * 0., dtype=theano.config.floatX), name='mean_square_grad_' + param.name, borrow=False)
            self.parameters.append(mean_square_grad)
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = theano.shared(theano._asarray(
                param.get_value() * 0., dtype=theano.config.floatX), name='mean_square_dx_' + param.name, borrow=False)
            self.parameters.append(mean_square_dx)

            # Accumulate gradient
            new_mean_squared_grad = self.decay * mean_square_grad + \
                (1 - self.decay) * T.sqr(grads[param])

            # Compute update
            rms_dx_tm1 = T.sqrt(mean_square_dx + self.epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + self.epsilon)
            delta_x_t = - rms_dx_tm1 / rms_grad_t * grads[param]

            # Accumulate updates
            new_mean_square_dx = self.decay * mean_square_dx + (1 - self.decay) * T.sqr(delta_x_t)

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

        return updates
Example #25
def entropy_exp(X, g=None, b=None, u=None, s=None, a=1., e=1e-8):
    if X.ndim == 4:
        if u is not None and s is not None:
            b_u = u.dimshuffle('x', 0, 'x', 'x')
            b_s = s.dimshuffle('x', 0, 'x', 'x')
        else:
            b_u = T.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        if a != 1:
            b_u = (1. - a)*0. + a*b_u
            b_s = (1. - a)*1. + a*b_s
        X = (X - b_u) / T.sqrt(b_s + e)
        if g is not None and b is not None:
            X = X*T.exp(g.dimshuffle('x', 0, 'x', 'x'))+b.dimshuffle('x', 0, 'x', 'x')
    elif X.ndim == 2:
        if u is None and s is None:
            u = T.mean(X, axis=0)
            s = T.mean(T.sqr(X - u), axis=0)
        if a != 1:
            u = (1. - a)*0. + a*u
            s = (1. - a)*1. + a*s
        X = (X - u) / T.sqrt(s + e)
        if g is not None and b is not None:
            X = X*T.exp(g)+b
    else:
        raise NotImplementedError
    return X
Example #26
def batchnorm(X, g=None, b=None, u=None, s=None, a=1., e=1e-8):
    """
    batchnorm with support for not using scale and shift parameters
    as well as inference values (u and s) and partial batchnorm (via a)
    will detect and use convolutional or fully connected version
    """
    if X.ndim == 4:
        if u is not None and s is not None:
            b_u = u.dimshuffle('x', 0, 'x', 'x')
            b_s = s.dimshuffle('x', 0, 'x', 'x')
        else:
            b_u = tensor.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = tensor.mean(tensor.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        if a != 1:
            b_u = (1. - a)*0. + a*b_u
            b_s = (1. - a)*1. + a*b_s
        X = (X - b_u) / tensor.sqrt(b_s + e)
        if g is not None and b is not None:
            X = X*g.dimshuffle('x', 0, 'x', 'x') + b.dimshuffle('x', 0, 'x', 'x')
    elif X.ndim == 2:
        if u is None and s is None:
            u = tensor.mean(X, axis=0)
            s = tensor.mean(tensor.sqr(X - u), axis=0)
        if a != 1:
            u = (1. - a)*0. + a*u
            s = (1. - a)*1. + a*s
        X = (X - u) / tensor.sqrt(s + e)
        if g is not None and b is not None:
            X = X*g + b
    else:
        raise NotImplementedError
    return X
def create_adam_updates(updates, params, gparams, gsums, xsums, lr, eps, beta1, beta2):
    i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    i_t = i + 1.0
    omb1_t = 1.0 - beta1**i_t
    omb2_t = 1.0 - beta2**i_t
    lr_t = lr * (T.sqrt(omb2_t) / omb1_t)
    for p, g, m, v in zip(params, gparams, gsums, xsums):
        if is_subtensor_op(p):
            origin, indexes = get_subtensor_op_inputs(p)
            m_sub = m[indexes]
            v_sub = v[indexes]
            m_t = beta1*m_sub + (1.0-beta1)*g
            v_t = beta2*v_sub + (1.0-beta2)*T.sqr(g)
            g_t = m_t / (T.sqrt(v_t) + eps)
            updates[m] = T.set_subtensor(m_sub, m_t)
            updates[v] = T.set_subtensor(v_sub, v_t)
            updates[origin] = T.inc_subtensor(p, -lr_t*g_t)
        else:
            m_t = beta1*m + (1.0-beta1)*g
            v_t = beta2*v + (1.0-beta2)*T.sqr(g)
            g_t = m_t / (T.sqrt(v_t) + eps)
            updates[m] = m_t
            updates[v] = v_t
            updates[p] = p - lr_t*g_t
    updates[i] = i_t
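
The effective step size lr_t above folds in the usual Adam bias correction; a hedged numeric check of that factor at the first step (beta1 = 0.9 and beta2 = 0.999 are illustrative defaults):

import math

t = 1.0
beta1, beta2 = 0.9, 0.999
lr_scale = math.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
print lr_scale   # ~0.3162; combined with the uncorrected m_t and v_t above,
                 # the very first update has magnitude close to lr * sign(gradient)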
Example #28
def cosine_similarity(y_true, y_pred):
    norm_y_true = T.sqrt(T.sum(T.sqr(y_true), 1, keepdims=True))
    norm_y_pred = T.sqrt(T.sum(T.sqr(y_pred), 1, keepdims=True))
    dot = T.tensordot(y_true, y_pred, axes=[1,1])
    cossim = dot / (norm_y_true * norm_y_pred)
    objective = 1-cossim
    return objective.mean(axis=-1)
Example #29
def mse(output, target, mean_over_second=True):
    """
    This is the Mean Square Error (MSE) across all dimensions, or per minibatch row (depending on mean_over_second).

    Parameters
    ----------
    output : tensor
        The symbolic tensor (or compatible) output from the network. (Comes from model).
    target : tensor
        The symbolic tensor (or compatible) target truth to compare the output against. (Comes from data).
    mean_over_second : bool
        Boolean whether or not to take the mean across all dimensions (True) or just the
        feature dimensions (False)

    Returns
    -------
    number
        The appropriate mean square error.
    """
    # The following definition came from the Conditional_nade project
    if mean_over_second:
        cost = T.mean(T.sqr(target - output))
    else:
        cost = T.mean(T.sqr(target - output).sum(axis=1))
    return cost
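
A hedged usage sketch compiling both variants; the names are illustrative only.

import theano
import theano.tensor as T

output = T.matrix('output')
target = T.matrix('target')
cost_all = mse(output, target, mean_over_second=True)    # scalar: mean over every element
cost_rows = mse(output, target, mean_over_second=False)  # scalar: mean of the per-row summed errors
mse_fn = theano.function([output, target], [cost_all, cost_rows])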
Example #30
	def initialise(self):
		if self.X.ndim == 4:
			if self.u is not None and self.s is not None:
				b_u = self.u.dimshuffle('x',0,'x','x')
				b_s = self.s.dimshuffle('x',0,'x','x')
			else:
				b_u = T.mean(self.X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
				b_s = T.mean(T.sqr(self.X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
			if self.a != 1:
				b_u = (1. - self.a)*0. + self.a*b_u
				b_s = (1. - self.a)*1. + self.a*b_s
			self.X = (self.X - b_u) / T.sqrt(b_s + self.e)
			if self.g is not None and self.b is not None:
				self.X = self.X*self.g.dimshuffle('x', 0, 'x', 'x') + self.b.dimshuffle('x', 0, 'x', 'x')
				self.params.append(self.g);self.params.append(self.b)
		elif self.X.ndim == 2:
			if self.u is None and self.s is None:
				self.u = T.mean(self.X, axis=0)
				self.s = T.mean(T.sqr(self.X - self.u), axis=0)
			if self.a != 1:
				self.u = (1. - self.a)*0. + self.a*self.u
				self.s = (1. - self.a)*1. + self.a*self.s
			self.X = (self.X - self.u) / T.sqrt(self.s + self.e)
			if self.g is not None and self.b is not None:
				self.X = self.X*self.g + self.b
				self.params.append(self.g);self.params.append(self.b)
		else:
			raise NotImplementedError
Example #31
    def __init__(self,
                 mu=0.5,
                 learning_rate=0.1,
                 n_epochs=40,
                 dataset='mnist.pkl.gz',
                 nkerns=[20, 50],
                 batch_size=500,
                 lam_contractive=0,
                 lam_l2=0.01,
                 temperature=1):
        """ Demonstrates lenet on MNIST dataset
    
        :type learning_rate: float
        :param learning_rate: learning rate used (factor for the stochastic
                              gradient)
    
        :type n_epochs: int
        :param n_epochs: maximal number of epochs to run the optimizer
    
        :type dataset: string
        :param dataset: path to the dataset used for training /testing (MNIST here)
    
        :type nkerns: list of ints
        :param nkerns: number of kernels on each layer
        """

        self.mu = mu
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.nkerns = nkerns
        self.batch_size = batch_size
        self.train_batch_size = batch_size
        self.datasets = load_data(dataset)
        self.train_set_x, self.train_set_y = self.datasets[0]
        self.valid_set_x, self.valid_set_y = self.datasets[1]
        self.test_set_x, self.test_set_y = self.datasets[2]

        # compute number of minibatchs for train, valid, test
        self.n_train_batches = self.train_set_x.get_value(borrow=True).shape[0]
        self.n_train_batches //= batch_size
        self.n_valid_batches = self.valid_set_x.get_value(borrow=True).shape[0]
        self.n_valid_batches //= batch_size
        self.n_test_batches = self.test_set_x.get_value(borrow=True).shape[0]
        self.n_test_batches //= batch_size

        # allocate symbolic variables for the data
        self.index = T.lscalar()  # index to a minibatch

        # start-snippet-1
        x = T.matrix('x')
        y = T.ivector('y')

        # BUILD ACTUAL MODEL
        print('... building the model')

        # Reshape matrix of rasterized image of shape(batch_size, 28* 28)  to 4D tensor
        layer0_input = x.reshape((self.train_batch_size, 1, 28, 28))

        # Construct the first convolutional pooling layer:
        # Filtering reduces the image size to(28-5+1, 28-5+1) = (24, 24)
        # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
        # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
        self.rng = numpy.random.RandomState(23455)
        self.layer0 = LeNetConvPoolLayer(self.rng,
                                         input=layer0_input,
                                         image_shape=(self.train_batch_size, 1,
                                                      28, 28),
                                         filter_shape=(nkerns[0], 1, 5, 5),
                                         poolsize=(2, 2))

        layer1_input = self.layer0.output
        layer1_input_flatten = self.layer0.output.flatten(2)
        # Construct the second convolutional pooling layer
        # Filtering reduces the image size to (12-5+1, 12-5+1) = (8,8)
        # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
        # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
        self.layer1 = LeNetConvPoolLayer(self.rng, input=layer1_input, image_shape=(self.train_batch_size, nkerns[0], 12, 12), \
                                    filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2))

        # The FC layer. It operates on 2D matrices of shape (batch_size, num_channels * num_pixels),
        # i.e. the flattened conv output, a matrix of shape (batch_size, nkerns[1] * 4 * 4).
        # (The number of hidden units, 500, happens to equal the minibatch size here, but the two are unrelated.)
        layer2_input = self.layer1.output.flatten(2)
        self.layer2 = FCLayer(self.rng,
                              input=layer2_input,
                              n_in=nkerns[1] * 4 * 4,
                              n_out=500,
                              activation=T.tanh)

        # classify the values of the fully-connected sigmoidal layer
        layer3_input = self.layer2.output
        self.layer3 = FCSoftMaxLayer(input=layer3_input,
                                     n_in=500,
                                     n_out=10,
                                     rng=self.rng,
                                     temperature=temperature)

        self.params = self.layer3.params + self.layer2.params + self.layer1.params + self.layer0.params
        L2params = [self.layer3.W, self.layer2.W, self.layer1.W, self.layer0.W]
        self.params_shape = self.layer3.params_shape + self.layer2.params_shape + self.layer1.params_shape + self.layer0.params_shape
        velocities = self.layer3.velocity + self.layer2.velocity + self.layer1.velocity + self.layer0.velocity

        paramssum = T.sum(T.sqr(L2params[0]))
        for i in range(1, len(L2params)):
            paramssum += T.sum(T.sqr(L2params[i]))

        y_score_given_x = self.layer3.p_y_given_x

        #layer3 to x contractive
        layer3_Jnorm1, _ = theano.scan(
            lambda ind, yi, y_score_given_x, x:
            (theano.gradient.jacobian(y_score_given_x[ind, yi], x))[ind, :],
            sequences=[T.arange(self.train_batch_size), y],
            non_sequences=[y_score_given_x, x])
        cost = self.layer3.negative_log_likelihood(y)

        testnorm = T.sum(
            (theano.gradient.jacobian(y_score_given_x[0],
                                      layer3_input)[:, 0, :])**2)**0.5
        testgrads = T.grad(T.sum(testnorm), self.params)

        grads = T.grad(cost, self.params)

        # momentum update
        updates = [
            (param_i, param_i - learning_rate * grad_i + mu * v_i)
            for param_i, grad_i, v_i in zip(self.params, grads, velocities)
        ]

        updates += [(v_i, mu * v_i - learning_rate * grad_i)
                    for grad_i, v_i in zip(grads, velocities)]

        # create a function to compute the mistakes that are made by the model
        self.validate_p = theano.function(
            [self.index], [testnorm] + testgrads,
            givens={
                x:
                self.valid_set_x[self.index *
                                 self.train_batch_size:(self.index + 1) *
                                 self.train_batch_size]
            })

        self.test_model = theano.function(
            [self.index],
            self.layer3.errors(y),
            givens={
                x:
                self.test_set_x[self.index *
                                self.train_batch_size:(self.index + 1) *
                                self.train_batch_size],
                y:
                self.test_set_y[self.index *
                                self.train_batch_size:(self.index + 1) *
                                self.train_batch_size]
            })

        self.validate_model = theano.function(
            [self.index],
            self.layer3.errors(y),
            givens={
                x:
                self.valid_set_x[self.index *
                                 self.train_batch_size:(self.index + 1) *
                                 self.train_batch_size],
                y:
                self.valid_set_y[self.index *
                                 self.train_batch_size:(self.index + 1) *
                                 self.train_batch_size]
            })

        self.train_model = theano.function(
            [self.index],
            cost,
            updates=updates,
            givens={
                x:
                self.train_set_x[self.index *
                                 self.train_batch_size:(self.index + 1) *
                                 self.train_batch_size],
                y:
                self.train_set_y[self.index *
                                 self.train_batch_size:(self.index + 1) *
                                 self.train_batch_size]
            })

        self.test_confidencefunc = theano.function(
            [self.index],
            self.layer3.confidence_mean(y),
            givens={
                x:
                self.test_set_x[self.index *
                                self.train_batch_size:(self.index + 1) *
                                self.train_batch_size],
                y:
                self.test_set_y[self.index *
                                self.train_batch_size:(self.index + 1) *
                                self.train_batch_size]
            })
    def __init__(self,
                 numpy_rng=numpy.random.RandomState(2**30),
                 theano_rng=None,
                 n_ins=601,
                 n_outs=259,
                 l1_reg=None,
                 l2_reg=None,
                 hidden_layers_sizes=[512, 512, 512, 512, 512, 512, 512],
                 n_speakers_accent=2,
                 hidden_activation='tanh',
                 output_activation='linear'):

        print "DNN MULTI-SPEAKER INITIALISATION"

        self.sigmoid_layers = []
        self.params = []
        self.delta_params = []
        self.n_layers = len(hidden_layers_sizes)

        self.n_ins = n_ins
        self.n_outs = n_outs

        self.output_activation = output_activation

        self.l1_reg = l1_reg
        self.l2_reg = l2_reg

        self.final_layer_accent = []
        self.error_cost = []

        #finetune_cost = []
        #self.finetune_costs_accent = []

        self.errors_accent = []

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy.random.randint(2**30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')
        self.y = T.matrix('y')

        for i in xrange(self.n_layers):

            if i == 0:

                input_size = n_ins
            else:

                input_size = hidden_layers_sizes[i - 1]

            if i == 0:

                layer_input = self.x
            else:

                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.tanh)

            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)
            self.delta_params.extend(sigmoid_layer.delta_params)

        ####Final Layer for speaker

        if self.output_activation == 'linear':
            self.final_layer_accent = LinearLayer(
                rng=numpy_rng,
                input=self.sigmoid_layers[-1].output,
                n_in=hidden_layers_sizes[-1],
                n_out=n_outs)

        elif self.output_activation == 'sigmoid':
            self.final_layer_accent = SigmoidLayer(
                rng=numpy_rng,
                input=self.sigmoid_layers[-1].output,
                n_in=hidden_layers_sizes[-1],
                n_out=n_outs,
                activation=T.nnet.sigmoid)
        else:
            print(
                "This output activation function: %s is not supported right now!"
                % (self.output_activation))
            sys.exit(1)

        self.params.extend(self.final_layer_accent.params)
        self.delta_params.extend(self.final_layer_accent.delta_params)

        ##MSE FOR EACH SPEAKER
        self.error_cost = T.mean(
            T.sum((self.final_layer_accent.output - self.y) *
                  (self.final_layer_accent.output - self.y),
                  axis=1))

        ###L1-norm
        if self.l1_reg is not None:
            for i in xrange(self.n_layers):
                W = self.params[i * 2]
                self.error_cost += self.l1_reg * (abs(W).sum())

        ###L2-norm
        if self.l2_reg is not None:
            for i in xrange(self.n_layers):
                W = self.params[i * 2]
                self.error_cost += self.l2_reg * T.sqr(W).sum()
    def _setup_functions(self, trX):
        l1_e = (10, trX.shape[1], 5, 5)
        print("l1_e", l1_e)
        l1_d = (l1_e[1], l1_e[0], l1_e[2], l1_e[3])
        print("l1_d", l1_d)
        l2_e = (20, l1_e[0], 5, 5)
        print("l2_e", l2_e)
        l2_d = (l2_e[1], l2_e[0], l2_e[2], l2_e[3])
        print("l2_d", l2_d)
        # 2 layers means downsample by 2 ** 2 -> 4, with input size 28x28 -> 7x7
        # assume square
        self.downpool_sz_h = trX.shape[-2] / 4
        self.downpool_sz_w = trX.shape[-1] / 4
        # self.downpool_sz_h = int(np.ceil(trX.shape[-2] / 4.))
        # self.downpool_sz_w = int(np.ceil(trX.shape[-1] / 4.))
        l3_e = (l2_e[0] * self.downpool_sz_h * self.downpool_sz_w,
                self.n_hidden)
        print("l3_e", l3_e)
        l3_d = (l3_e[1], l3_e[0])
        print("l4_d", l3_d)
        sys.stdout.flush()

        if not hasattr(self, "params"):
            print('generating weights')
            sys.stdout.flush()
            we = uniform(l1_e)
            w2e = uniform(l2_e)
            w3e = uniform(l3_e)
            b3e = shared0s(self.n_hidden)
            wmu = uniform((self.n_hidden, self.n_code))
            bmu = shared0s(self.n_code)
            wsigma = uniform((self.n_hidden, self.n_code))
            bsigma = shared0s(self.n_code)

            wd = uniform((self.n_code, self.n_hidden))
            bd = shared0s((self.n_hidden))
            w2d = uniform(l3_d)
            b2d = shared0s((l3_d[1]))
            w3d = uniform(l2_d)
            wo = uniform(l1_d)
            self.enc_params = [we, w2e, w3e, b3e, wmu, bmu, wsigma, bsigma]
            self.dec_params = [wd, bd, w2d, b2d, w3d, wo]
            self.params = self.enc_params + self.dec_params

        print('theano code')
        sys.stdout.flush()

        X = T.tensor4()
        e = T.matrix()
        Z_in = T.matrix()
        Z_in_1 = T.matrix()
        Z_in_2 = T.matrix()

        # encode_mu, encode_sigm = self._conv_gaussian_enc(X, *self.enc_params) #EHA
        # h2 = self._get_h2(X, *self.enc_params)
        code_mu, code_log_sigma, Z, y = self._model(X, e)

        # out_h = self._get_deconv_dec(Z_in, *self.dec_params)
        y_out = self._deconv_dec(Z_in, *self.dec_params)
        y_out_1 = self._deconv_dec(Z_in_1, *self.dec_params)
        y_out_2 = self._deconv_dec(Z_in_2, *self.dec_params)

        #rec_cost = T.sum(T.abs_(X - y))
        rec_cost = T.sum(T.sqr(X - y)) # / T.cast(X.shape[0], 'float32')
        prior_cost = log_prior(code_mu, code_log_sigma)

        cost = rec_cost - prior_cost

        print('getting updates')
        sys.stdout.flush()

        updates = Adam(self.params, cost)

        print('compiling')
        sys.stdout.flush()
        # self._encode = theano.function([X], (encode_mu, encode_sigm)) #EHA
        # self._hidden2 = theano.function([X], h2)
        # self._get_out_h = theano.function([Z_in], out_h)
        self._fit_function = theano.function([X, e], cost, updates=updates)
        self._reconstruct = theano.function([X, e], y)
        self._x_given_z = theano.function([Z_in], y_out)
        self._z_given_x = theano.function([X], (code_mu, code_log_sigma))
        self._2x_given_2z = theano.function([Z_in_1, Z_in_2], (y_out_1, y_out_2))
Example #34
def mlp_synthetic(X_train,
                  X_test,
                  y_train,
                  y_test,
                  precision,
                  vy,
                  hWidths,
                  mini_batchsize=10,
                  epochs=1000,
                  display=False):
    input_size = X_train.shape[1]
    output_size = y_train.shape[1]
    X = T.fmatrix(name='X')
    Y = T.fmatrix(name='Y')
    rng = numpy.random.RandomState(123)
    dim = find_dim_theta(hWidths, input_size, output_size)

    input_size = X_train.shape[1]

    initial_params = theano.shared(
        floatX(rng.randn(1, dim).astype(theano.config.floatX)))
    params = initial_params
    op = model(X, params, hWidths, input_size, output_size)

    cost = T.sum(T.sqr(op - Y)) * (vy * 0.5) + T.sum(
        T.sqr(params)) * (precision * 0.5)
    updates = sgd(cost, params, lr=0.000001)
    # updates=Adam(cost,params)
    train = theano.function(inputs=[X, Y],
                            outputs=cost,
                            updates=updates,
                            allow_input_downcast=True,
                            name='train')
    predict = theano.function(inputs=[X],
                              outputs=op,
                              allow_input_downcast=True)
    fcost = theano.function(inputs=[op, Y],
                            outputs=cost,
                            allow_input_downcast=True)

    test_costs = []
    train_costs = []

    for i in range(epochs):
        for start, end in zip(
                range(0, len(X_train), mini_batchsize),
                range(mini_batchsize, len(X_train), mini_batchsize)):
            yd = (floatX(y_train[start:end])).reshape(mini_batchsize, 1)
            cost_v = train(X_train[start:end], yd)

        # Done this cost prediction needs to change
        # fin_cost_test = fcost(predict(X_test), floatX(y_test).reshape(len(y_test), 1))
        # fin_cost_train = fcost(predict(X_train), floatX(y_train).reshape(len(y_train), 1))
        fin_cost_test = MSE(predict(X_test), y_test)
        fin_cost_train = MSE(predict(X_train), y_train)
        test_costs.append(fin_cost_test)
        train_costs.append(fin_cost_train)
        # print i, fin_cost_test, fin_cost_train

    final_params = params.get_value()
    # print final_params
    print type(final_params)
    print final_params.shape
    # print 'final b_o values'
    # print b_o.get_value()

    # fin_cost_test = fcost(predict(X_test), floatX(y_test).reshape(len(y_test), 1))
    # fin_cost_train = fcost(predict(X_train), floatX(y_train).reshape(len(y_train), 1))
    fin_cost_test = MSE(predict(X_test), y_test)
    fin_cost_train = MSE(predict(X_train), y_train)
    print 'vy: {}, prec: {}, Train: {}, Test: {}'.format(
        vy, precision, fin_cost_train, fin_cost_test)

    # Calculate the MSE of a simple mean-prediction baseline
    test_mean = np.mean(y_test)
    train_mean = np.mean(y_train)

    mean_p_test = np.ones(y_test.size) * test_mean
    mean_p_train = np.ones(y_train.size) * train_mean

    # test_cost=fcost(floatX(mean_p_test).reshape(len(y_test), 1), floatX(y_test).reshape(len(y_test), 1))
    # train_cost=fcost(floatX(mean_p_train).reshape(len(y_train), 1), floatX(y_train).reshape(len(y_train), 1))
    mean_pred_test_cost = MSE(mean_p_test, y_test)
    mean_pred_train_cost = MSE(mean_p_train, y_train)

    tArray = np.ones(epochs) * mean_pred_test_cost
    if (display):
        print 'MSE for mean prediction, Train:{} ,Test:{}'.format(
            mean_pred_train_cost, mean_pred_test_cost)

        plt.plot(range(epochs), test_costs, label='Test')
        plt.plot(range(epochs), train_costs, label='Train')
        # plt.plot(range(epochs), tArray, label='Reference',color='black',linewidth=1.6)
        plt.xlabel('Epochs')
        plt.ylabel('Error')
        plt.legend()
        # plt.title('TrainCost:{}, TestCost: {}, Ref: {}'.format(fin_cost_train, fin_cost_test, mean_pred_test_cost))

    return fin_cost_train, fin_cost_test, final_params
Exemplo n.º 35
0
 def log_likelihood_samplesImean_sigma(self, samples, mean, sigma):
     return -log2pi*T.cast(samples.shape[1], floatX) / 2 -                \
            T.sum(T.sqr((samples-mean)/sigma) + 2*T.log(sigma), axis=1) / 2
Exemplo n.º 36
0
 def log_likelihood_samples(self, samples):
     '''Given samples as rows of a matrix, returns their log-likelihood under the zero mean unit covariance Gaussian as a vector'''
     return -log2pi * T.cast(samples.shape[1], floatX) / 2 - T.sum(
         T.sqr(samples), axis=1) / 2
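
The docstring above describes the closed form log N(x; 0, I) = -d/2 * log(2*pi) - ||x||^2 / 2. Below is a minimal NumPy sketch of that same formula, checked against scipy.stats; the helper name and the use of SciPy are illustrative, not part of the original snippet.

import numpy as np
from scipy.stats import multivariate_normal

log2pi = np.log(2.0 * np.pi)

def log_likelihood_samples_np(samples):
    # Row-wise log-density under a zero-mean, unit-covariance Gaussian.
    d = samples.shape[1]
    return -log2pi * d / 2.0 - np.sum(np.square(samples), axis=1) / 2.0

x = np.random.randn(5, 3)
ref = multivariate_normal(mean=np.zeros(3), cov=np.eye(3)).logpdf(x)
assert np.allclose(log_likelihood_samples_np(x), ref)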
Exemplo n.º 37
0
    def __init__(self, config_path, dnn):
        """
        Initialize the class given either a filename or a model.
        Usually this method will load a model from disk and store it internally,
        but a model can also be provided directly instead (useful when training).
        """
        config_module = imp.load_source('config', config_path)
        self.cfg = config_module.cfg
        self.weights_fname = str(config_path)[:-3] + '.npz'
        self.model = config_module.get_model(dnn=dnn)

        # Load weights
        print('(inside init of IAN)')
        print('Loading weights')
        params = list(set(lasagne.layers.get_all_params(self.model['l_out'],trainable=True)+\
                 lasagne.layers.get_all_params(self.model['l_discrim'],trainable=True)+\
                 [x for x in lasagne.layers.get_all_params(self.model['l_out'])+\
                 lasagne.layers.get_all_params(self.model['l_discrim'])\
                 if x.name[-4:]=='mean' or x.name[-7:]=='inv_std']))
        print('params = {}'.format(params))
        GANcheckpoints.load_weights(self.weights_fname, params)

        # Shuffle weights if using IAF with MADE
        if 'l_IAF_mu' in self.model:
            print('Shuffling MADE masks')
            self.model['l_IAF_mu'].reset("Once")
            self.model['l_IAF_ls'].reset("Once")

        print('Compiling Theano Functions')
        # Input Tensor
        self.X = T.TensorType('float32', [False] * 4)('X')

        # Latent Vector
        self.Z = T.TensorType('float32', [False] * 2)('Z')

        # X_hat(Z)
        self.X_hat = lasagne.layers.get_output(self.model['l_out'],
                                               {self.model['l_Z']: self.Z},
                                               deterministic=True)
        print('self.X_hat = {}'.format(self.X_hat))
        self.X_hat_fn = theano.function([self.Z], self.X_hat)

        # Z_hat(X)
        self.Z_hat = lasagne.layers.get_output(self.model['l_Z'],
                                               {self.model['l_in']: self.X},
                                               deterministic=True)
        print('self.Z_hat = {}'.format(self.Z_hat))
        self.Z_hat_fn = theano.function([self.X], self.Z_hat)

        # Imgrad Functions
        r1, r2 = T.scalar('r1', dtype='int32'), T.scalar('r2', dtype='int32')
        c1, c2 = T.scalar('c', dtype='int32'), T.scalar('c2', dtype='int32')
        RGB = T.tensor4('RGB', dtype='float32')

        # Image Gradient Function, evaluates the change in latents which would lighten the image in the local area
        self.calculate_lighten_gradient = theano.function(
            [c1, r1, c2, r2, self.Z],
            T.grad(T.mean(self.X_hat[0, :, r1:r2, c1:c2]), self.Z))

        # Image Color Gradient Function, evaluates the change in latents which would push the image towards the local desired RGB value
        # Consider changing this to only take in a smaller RGB array, rather than a full-sized, indexed RGB array.
        # Also consider using the L1 loss instead of L2
        self.calculate_RGB_gradient = theano.function(
            [c1, r1, c2, r2, RGB, self.Z],
            T.grad(
                T.mean((T.sqr(-self.X_hat[0, :, r1:r2, c1:c2] +
                              RGB[0, :, r1:r2, c1:c2]))),
                self.Z))  # may need a T.mean
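
A hedged usage sketch of the functions compiled above; the config path, the latent dimensionality (100) and the patch coordinates are illustrative assumptions rather than values taken from the source, and a real config file is required for this to run.

import numpy as np

editor = IAN('path/to/config.py', dnn=True)   # hypothetical config path
z = np.zeros((1, 100), dtype=np.float32)      # assumed 100-dimensional latent
img = editor.X_hat_fn(z)                      # decode latents into an image batch
z_back = editor.Z_hat_fn(img)                 # re-encode the decoded image
# Latent step that brightens the decoded image patch at rows 8:24, cols 8:24:
dz = editor.calculate_lighten_gradient(8, 8, 24, 24, z)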
Exemplo n.º 38
0
def normal_lcdf(mu, sigma, x):
    """Compute the log of the cumulative density function of the normal."""
    z = (x - mu) / sigma
    return tt.switch(tt.lt(z, -1.0),
                     tt.log(tt.erfcx(-z / tt.sqrt(2.)) / 2.) - tt.sqr(z) / 2.,
                     tt.log1p(-tt.erfc(z / tt.sqrt(2.)) / 2.))
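
A small numerical sanity check of the expression above, assuming `tt` is `theano.tensor` as in the snippet and that SciPy is available for the reference values; the test points are arbitrary.

import numpy as np
import theano
import theano.tensor as tt
from scipy.stats import norm

x = tt.dvector('x')
lcdf_fn = theano.function([x], normal_lcdf(0., 1., x))

xs = np.array([-8.0, -1.5, 0.0, 1.5, 8.0])
assert np.allclose(lcdf_fn(xs), norm.logcdf(xs))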
Exemplo n.º 39
0
def main():
    # Parameters
    task = 'cifar10'
    name = '0'

    begin_save = 0
    input_nc = 3
    loss_type = ['trickLogD', 'minimax', 'ls']
    nloss = 3
    shuffle_ = True
    batchSize = 32
    fineSize = 32
    flip = True

    ncandi = 1  # # of surviving children
    kD = 3  # # of discrim updates for each gen update
    kG = 1  # # of gen updates per iteration
    ntf = 256
    b1 = 0.5  # momentum term of adam
    nz = 100  # # of dim for Z
    ngf = 128  # # of gen filters in first conv layer
    ndf = 128  # # of discrim filters in first conv layer
    niter = 100  # # of iter at starting learning rate
    lr = 0.0002  # initial learning rate for adam G
    lrd = 0.0002  # initial learning rate for adam D
    beta = 0.002  # hyperparameter of fitness function
    GP_norm = False  # whether to apply a gradient penalty on the discriminator
    LAMBDA = 2.  # hyperparameter of GP term

    save_freq = 1000
    show_freq = 1000

    # Check if cifar data exists
    if not os.path.exists("./cifar-10-batches-py"):
        print(
            "CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'."
        )
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']

    ################## MODEL D #######################
    print("Building model and compiling functions...")
    # Prepare Theano variables for inputs and targets
    real_imgs = T.tensor4('real_imgs')
    fake_imgs = T.tensor4('fake_imgs')
    # Create neural network model
    discriminator = models_uncond.build_discriminator_32(ndf=ndf)
    # Create expression for passing real data through the discriminator
    real_out = lasagne.layers.get_output(discriminator, real_imgs)
    # Create expression for passing fake data through the discriminator
    fake_out = lasagne.layers.get_output(discriminator, fake_imgs)
    # Create loss expressions
    discriminator_loss = (
        lasagne.objectives.binary_crossentropy(real_out, 1) +
        lasagne.objectives.binary_crossentropy(fake_out, 0)).mean()

    # Gradient penalty
    if GP_norm is True:
        alpha = t_rng.uniform((batchSize, 1, 1, 1), low=0., high=1.)
        differences = fake_imgs - real_imgs
        interpolates = real_imgs + (alpha * differences)
        gradients = theano.grad(lasagne.layers.get_output(
            discriminator, interpolates).sum(),
                                wrt=interpolates)
        slopes = T.sqrt(T.sum(T.sqr(gradients), axis=(1, 2, 3)))
        gradient_penalty = T.mean((slopes - 1.)**2)

        D_loss = discriminator_loss + LAMBDA * gradient_penalty
        b1_d = 0.
    else:
        D_loss = discriminator_loss
        b1_d = b1

    # Create update expressions for training
    discriminator_params = lasagne.layers.get_all_params(discriminator,
                                                         trainable=True)
    lrtd = theano.shared(lasagne.utils.floatX(lrd))
    updates_d = lasagne.updates.adam(D_loss,
                                     discriminator_params,
                                     learning_rate=lrtd,
                                     beta1=b1_d)
    lrt = theano.shared(lasagne.utils.floatX(lr))

    # Diversity fitness
    Fd = theano.gradient.grad(discriminator_loss, discriminator_params)
    Fd_score = beta * T.log(sum(T.sum(T.sqr(x)) for x in Fd))

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_d = theano.function([real_imgs, fake_imgs],
                              discriminator_loss,
                              updates=updates_d)

    # Compile another function generating some data
    disft_fn = theano.function([real_imgs, fake_imgs],
                               [(real_out).mean(),
                                (fake_out).mean(), Fd_score])

    # Launch the training loop.
    print("Starting training...")
    desc = task + '_' + name
    print desc

    if not os.path.isdir('logs'):
        os.mkdir(os.path.join('logs'))
    f_log = open('logs/%s.ndjson' % desc, 'wb')
    if not os.path.isdir('samples'):
        os.mkdir(os.path.join('samples/'))
    if not os.path.isdir('samples/' + desc):
        os.mkdir(os.path.join('samples/', desc))
    if not os.path.isdir('models'):
        os.mkdir(os.path.join('models/'))
    if not os.path.isdir('models/' + desc):
        os.mkdir(os.path.join('models/', desc))

    gen_new_params = []
    n_updates = 0

    # We iterate over epochs:
    for epoch in range(niter):
        if shuffle_ is True:
            X_train = shuffle(X_train)
        for xmb in iter_data(X_train, size=batchSize * kD):
            # For measuring the fitness score
            sample_xmb = floatX(X_train[np_rng.randint(0, 50000, ncandi *
                                                       ntf), :, :, :])

            # initialize the G candidate cluster
            if epoch + n_updates == 0:
                for can_i in range(0, ncandi):
                    train_g_, gen_fn_, generator_ = create_G(
                        loss_type=loss_type[can_i % nloss],
                        discriminator=discriminator,
                        lr=lr,
                        b1=b1,
                        ngf=ngf)

                    for _ in range(0, kG):
                        zmb = floatX(
                            np_rng.uniform(-1., 1., size=(batchSize, nz)))
                        cost = train_g_(zmb)

                    sample_zmb = floatX(np_rng.uniform(-1., 1.,
                                                       size=(ntf, nz)))
                    gen_imgs = gen_fn_(sample_zmb)

                    gen_new_params.append(
                        lasagne.layers.get_all_param_values(generator_))

                    if can_i == 0:
                        g_imgs_old = gen_imgs
                        fmb = gen_imgs[0:batchSize / ncandi * kD, :, :, :]
                    else:
                        g_imgs_old = np.append(g_imgs_old, gen_imgs, axis=0)
                        fmb = np.append(fmb,
                                        gen_imgs[0:batchSize / ncandi *
                                                 kD, :, :, :],
                                        axis=0)

                ######## MODEL G ########
                noise = T.matrix('noise')
                generator = models_uncond.build_generator_32(noise, ngf=ngf)
                Tgimgs = lasagne.layers.get_output(generator)
                Tfake_out = lasagne.layers.get_output(discriminator, Tgimgs)

                g_loss_logD = lasagne.objectives.binary_crossentropy(
                    Tfake_out, 1).mean()
                g_loss_minimax = -lasagne.objectives.binary_crossentropy(
                    Tfake_out, 0).mean()
                g_loss_ls = T.mean(T.sqr((Tfake_out - 1)))

                g_params = lasagne.layers.get_all_params(generator,
                                                         trainable=True)

                up_g_logD = lasagne.updates.adam(g_loss_logD,
                                                 g_params,
                                                 learning_rate=lrt,
                                                 beta1=b1)
                up_g_minimax = lasagne.updates.adam(g_loss_minimax,
                                                    g_params,
                                                    learning_rate=lrt,
                                                    beta1=b1)
                up_g_ls = lasagne.updates.adam(g_loss_ls,
                                               g_params,
                                               learning_rate=lrt,
                                               beta1=b1)

                train_g_logD = theano.function([noise],
                                               g_loss_logD,
                                               updates=up_g_logD)
                train_g_minimax = theano.function([noise],
                                                  g_loss_minimax,
                                                  updates=up_g_minimax)
                train_g_ls = theano.function([noise],
                                             g_loss_ls,
                                             updates=up_g_ls)

                gen_fn = theano.function([noise],
                                         lasagne.layers.get_output(
                                             generator, deterministic=True))
            else:
                gen_old_params = gen_new_params
                for can_i in range(0, ncandi):
                    for type_i in range(0, nloss):
                        lasagne.layers.set_all_param_values(
                            generator, gen_old_params[can_i])
                        if loss_type[type_i] == 'trickLogD':
                            for _ in range(0, kG):
                                zmb = floatX(
                                    np_rng.uniform(-1.,
                                                   1.,
                                                   size=(batchSize, nz)))
                                cost = train_g_logD(zmb)
                        elif loss_type[type_i] == 'minimax':
                            for _ in range(0, kG):
                                zmb = floatX(
                                    np_rng.uniform(-1.,
                                                   1.,
                                                   size=(batchSize, nz)))
                                cost = train_g_minimax(zmb)
                        elif loss_type[type_i] == 'ls':
                            for _ in range(0, kG):
                                zmb = floatX(
                                    np_rng.uniform(-1.,
                                                   1.,
                                                   size=(batchSize, nz)))
                                cost = train_g_ls(zmb)

                        sample_zmb = floatX(
                            np_rng.uniform(-1., 1., size=(ntf, nz)))
                        gen_imgs = gen_fn(sample_zmb)
                        _, fr_score, fd_score = disft_fn(
                            sample_xmb[0:ntf], gen_imgs)
                        fit = fr_score - fd_score

                        if can_i * nloss + type_i < ncandi:
                            idx = can_i * nloss + type_i
                            gen_new_params[
                                idx] = lasagne.layers.get_all_param_values(
                                    generator)
                            fitness[idx] = fit
                            fake_rate[idx] = fr_score
                            g_imgs_old[idx * ntf:(idx + 1) *
                                       ntf, :, :, :] = gen_imgs
                            fmb[idx*batchSize/ncandi*kD:(idx+1)*batchSize/ncandi*kD,:,:,:] = \
                                gen_imgs[0:batchSize/ncandi*kD,:,:,:]
                        else:
                            fit_com = fitness - fit
                            if min(fit_com) < 0:
                                ids_replace = np.where(fit_com == min(fit_com))
                                idr = ids_replace[0][0]
                                fitness[idr] = fit
                                fake_rate[idr] = fr_score

                                gen_new_params[
                                    idr] = lasagne.layers.get_all_param_values(
                                        generator)

                                g_imgs_old[idr * ntf:(idr + 1) *
                                           ntf, :, :, :] = gen_imgs
                                fmb[idr*batchSize/ncandi*kD:(idr+1)*batchSize/ncandi*kD,:,:,:] = \
                                    gen_imgs[0:batchSize/ncandi*kD,:,:,:]

                print fake_rate, fitness
                f_log.write(
                    str(fake_rate) + ' ' + str(fd_score) + ' ' + str(fitness) +
                    '\n')

            # train D
            for xreal, xfake in iter_data(xmb, shuffle(fmb), size=batchSize):
                cost = train_d(xreal, xfake)

            for i in range(0, ncandi):
                xfake = g_imgs_old[i * ntf:(i + 1) * ntf, :, :, :]
                xreal = sample_xmb[i * ntf:(i + 1) * ntf, :, :, :]
                tr, fr, fd = disft_fn(xreal, xfake)
                if i == 0:
                    fake_rate = np.array([fr])
                    fitness = np.array([0.])
                    real_rate = np.array([tr])
                    FDL = np.array([fd])
                else:
                    fake_rate = np.append(fake_rate, fr)
                    fitness = np.append(fitness, [0.])
                    real_rate = np.append(real_rate, tr)
                    FDL = np.append(FDL, fd)
            print fake_rate, FDL
            print(n_updates, epoch, real_rate.mean())
            f_log.write(
                str(fake_rate) + ' ' + str(FDL) + '\n' + str(epoch) + ' ' +
                str(n_updates) + ' ' + str(real_rate.mean()) + '\n')
            f_log.flush()

            if n_updates % show_freq == 0:
                blank_image = Image.new("RGB",
                                        (fineSize * 8 + 9, fineSize * 8 + 9))
                for i in range(8):
                    for ii in range(8):
                        img = g_imgs_old[i * 8 + ii, :, :, :]
                        img = ImgRescale(img,
                                         center=True,
                                         scale=True,
                                         convert_back=True)
                        blank_image.paste(
                            Image.fromarray(img),
                            (ii * fineSize + ii + 1, i * fineSize + i + 1))
                blank_image.save('samples/%s/%s_%d.png' %
                                 (desc, desc, n_updates / save_freq))

            if n_updates % save_freq == 0 and n_updates > begin_save - 1:
                # Optionally, you could now dump the network weights to a file like this:
                np.savez(
                    'models/%s/gen_%d.npz' % (desc, n_updates / save_freq),
                    *lasagne.layers.get_all_param_values(generator))
                np.savez(
                    'models/%s/dis_%d.npz' % (desc, n_updates / save_freq),
                    *lasagne.layers.get_all_param_values(discriminator))

            n_updates += 1
Exemplo n.º 40
0
def normal_lccdf(mu, sigma, x):
    z = (x - mu) / sigma
    return tt.switch(tt.gt(z, 1.0),
                     tt.log(tt.erfcx(z / tt.sqrt(2.)) / 2.) - tt.sqr(z) / 2.,
                     tt.log1p(-tt.erfc(-z / tt.sqrt(2.)) / 2.))
Exemplo n.º 41
0
 def fn(images):
     return T.sum(T.sqr(images2neibs(images, (2, 2), mode='valid')),
                  axis=[0, 1])
Exemplo n.º 42
0
def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7):
    """Max weight norm constraints and gradient clipping

    This takes a TensorVariable and rescales it so that incoming weight
    norms are below a specified constraint value. Vectors violating the
    constraint are rescaled so that they are within the allowed range.

    Parameters
    ----------
    tensor_var : TensorVariable
        Theano expression for update, gradient, or other quantity.
    max_norm : scalar
        This value sets the maximum allowed value of any norm in
        `tensor_var`.
    norm_axes : sequence (list or tuple)
        The axes over which to compute the norm.  This overrides the
        default norm axes defined for the number of dimensions
        in `tensor_var`. When this is not specified and `tensor_var` is a
        matrix (2D), this is set to `(0,)`. If `tensor_var` is a 3D, 4D or
        5D tensor, it is set to a tuple listing all axes but axis 0. The
        former default is useful for working with dense layers, the latter
        is useful for 1D, 2D and 3D convolutional layers.
        (Optional)
    epsilon : scalar, optional
        Value used to prevent numerical instability when dividing by
        very small or zero norms.

    Returns
    -------
    TensorVariable
        Input `tensor_var` with rescaling applied to weight vectors
        that violate the specified constraints.

    Examples
    --------
    >>> param = theano.shared(
    ...     np.random.randn(100, 200).astype(theano.config.floatX))
    >>> update = param + 100
    >>> update = norm_constraint(update, 10)
    >>> func = theano.function([], [], updates=[(param, update)])
    >>> # Apply constrained update
    >>> _ = func()
    >>> from lasagne.utils import compute_norms
    >>> norms = compute_norms(param.get_value())
    >>> np.isclose(np.max(norms), 10)
    True

    Notes
    -----
    When `norm_axes` is not specified, the axes over which the norm is
    computed depend on the dimensionality of the input variable. If it is
    2D, it is assumed to come from a dense layer, and the norm is computed
    over axis 0. If it is 3D, 4D or 5D, it is assumed to come from a
    convolutional layer and the norm is computed over all trailing axes
    beyond axis 0. For other uses, you should explicitly specify the axes
    over which to compute the norm using `norm_axes`.
    """
    ndim = tensor_var.ndim

    if norm_axes is not None:
        sum_over = tuple(norm_axes)
    elif ndim == 2:  # DenseLayer
        sum_over = (0, )
    elif ndim in [3, 4, 5]:  # Conv{1,2,3}DLayer
        sum_over = tuple(range(1, ndim))
    else:
        raise ValueError("Unsupported tensor dimensionality {}."
                         "Must specify `norm_axes`".format(ndim))

    dtype = np.dtype(theano.config.floatX).type
    norms = T.sqrt(T.sum(T.sqr(tensor_var), axis=sum_over, keepdims=True))
    target_norms = T.clip(norms, 0, dtype(max_norm))
    constrained_output = \
        (tensor_var * (target_norms / (dtype(epsilon) + norms)))

    return constrained_output
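
The Notes above say that for a 4D (convolutional) tensor the norm is computed over all trailing axes, i.e. one norm per output filter. A small sketch of that case follows; the kernel shape and the max_norm value are illustrative assumptions.

import numpy as np
import theano

kernel = theano.shared(
    np.random.randn(32, 3, 5, 5).astype(theano.config.floatX) * 10)
constrained = norm_constraint(kernel, 3.0)
clipped = theano.function([], constrained)()

filter_norms = np.sqrt((clipped ** 2).sum(axis=(1, 2, 3)))
print(filter_norms.max() <= 3.0 + 1e-4)  # True: every filter norm is clipped to 3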
Exemplo n.º 43
0
def log_likelihood(tgt, mu, ls):
    return T.sum(-(np.float32(0.5 * np.log(2 * np.pi)) + ls)
            - 0.5 * T.sqr(tgt - mu) / T.exp(2 * ls))
Exemplo n.º 44
0
    def __init__(self,
                 data,
                 U,
                 img_h=160,
                 img_w=300,
                 hidden_size=100,
                 batch_size=50,
                 lr=0.001,
                 lr_decay=0.95,
                 sqr_norm_lim=9,
                 fine_tune_W=True,
                 fine_tune_M=False,
                 optimizer='adam',
                 filter_sizes=[3, 4, 5],
                 num_filters=100,
                 conv_attn=False,
                 encoder='rnn',
                 elemwise_sum=True,
                 corr_penalty=0.0,
                 xcov_penalty=0.0,
                 n_recurrent_layers=1,
                 is_bidirectional=False):
        self.data = data
        self.img_h = img_h
        self.batch_size = batch_size
        self.fine_tune_W = fine_tune_W
        self.fine_tune_M = fine_tune_M
        self.lr = lr
        self.lr_decay = lr_decay
        self.optimizer = optimizer
        self.sqr_norm_lim = sqr_norm_lim
        self.conv_attn = conv_attn

        index = T.iscalar()
        c = T.imatrix('c')
        r = T.imatrix('r')
        y = T.ivector('y')
        c_mask = T.fmatrix('c_mask')
        r_mask = T.fmatrix('r_mask')
        c_seqlen = T.ivector('c_seqlen')
        r_seqlen = T.ivector('r_seqlen')
        embeddings = theano.shared(U, name='embeddings', borrow=True)
        zero_vec_tensor = T.fvector()
        self.zero_vec = np.zeros(img_w, dtype=theano.config.floatX)
        self.set_zero = theano.function([zero_vec_tensor],
                                        updates=[(embeddings,
                                                  T.set_subtensor(
                                                      embeddings[0, :],
                                                      zero_vec_tensor))])
        if encoder.find('cnn') > -1 and (
                encoder.find('rnn') > -1
                or encoder.find('lstm') > -1) and not elemwise_sum:
            self.M = theano.shared(np.eye(2 * hidden_size).astype(
                theano.config.floatX),
                                   borrow=True)
        else:
            self.M = theano.shared(np.eye(hidden_size).astype(
                theano.config.floatX),
                                   borrow=True)

        c_input = embeddings[c.flatten()].reshape(
            (c.shape[0], c.shape[1], embeddings.shape[1]))
        r_input = embeddings[r.flatten()].reshape(
            (r.shape[0], r.shape[1], embeddings.shape[1]))

        l_in = lasagne.layers.InputLayer(shape=(batch_size, img_h, img_w))

        if encoder.find('cnn') > -1:
            l_conv_in = lasagne.layers.ReshapeLayer(l_in,
                                                    shape=(batch_size, 1,
                                                           img_h, img_w))
            conv_layers = []
            for filter_size in filter_sizes:
                conv_layer = lasagne.layers.Conv2DLayer(
                    l_conv_in,
                    num_filters=num_filters,
                    filter_size=(filter_size, img_w),
                    stride=(1, 1),
                    nonlinearity=lasagne.nonlinearities.rectify,
                    border_mode='valid')
                pool_layer = lasagne.layers.MaxPool2DLayer(
                    conv_layer, pool_size=(img_h - filter_size + 1, 1))
                conv_layers.append(pool_layer)

            l_conv = lasagne.layers.ConcatLayer(conv_layers)
            l_conv = lasagne.layers.DenseLayer(
                l_conv,
                num_units=hidden_size,
                nonlinearity=lasagne.nonlinearities.tanh)

        if is_bidirectional:
            if encoder.find('lstm') > -1:
                prev_fwd, prev_bck = l_in, l_in
                for _ in xrange(n_recurrent_layers):
                    l_fwd = lasagne.layers.LSTMLayer(prev_fwd,
                                                     hidden_size,
                                                     backwards=False,
                                                     learn_init=True,
                                                     peepholes=True)

                    l_bck = lasagne.layers.LSTMLayer(prev_bck,
                                                     hidden_size,
                                                     backwards=True,
                                                     learn_init=True,
                                                     peepholes=True)
                    prev_fwd, prev_bck = l_fwd, l_bck
            else:
                prev_fwd, prev_bck = l_in, l_in
                for _ in xrange(n_recurrent_layers):
                    l_fwd = lasagne.layers.RecurrentLayer(
                        prev_fwd,
                        hidden_size,
                        nonlinearity=lasagne.nonlinearities.tanh,
                        W_hid_to_hid=lasagne.init.Orthogonal(),
                        W_in_to_hid=lasagne.init.Orthogonal(),
                        backwards=False,
                        learn_init=True)

                    l_bck = lasagne.layers.RecurrentLayer(
                        prev_bck,
                        hidden_size,
                        nonlinearity=lasagne.nonlinearities.tanh,
                        W_hid_to_hid=lasagne.init.Orthogonal(),
                        W_in_to_hid=lasagne.init.Orthogonal(),
                        backwards=True,
                        learn_init=True)
                    prev_fwd, prev_bck = l_fwd, l_bck

            l_recurrent = lasagne.layers.ConcatLayer([l_fwd, l_bck])
        else:
            prev_fwd = l_in
            if encoder.find('lstm') > -1:
                for _ in xrange(n_recurrent_layers):
                    l_recurrent = lasagne.layers.LSTMLayer(prev_fwd,
                                                           hidden_size,
                                                           backwards=False,
                                                           learn_init=True,
                                                           peepholes=True)
                    prev_fwd = l_recurrent
            else:
                for _ in xrange(n_recurrent_layers):
                    l_recurrent = lasagne.layers.RecurrentLayer(
                        prev_fwd,
                        hidden_size,
                        nonlinearity=lasagne.nonlinearities.tanh,
                        W_hid_to_hid=lasagne.init.Orthogonal(),
                        W_in_to_hid=lasagne.init.Orthogonal(),
                        backwards=False,
                        learn_init=True)
                    prev_fwd = l_recurrent

        recurrent_size = hidden_size * 2 if is_bidirectional else hidden_size
        if conv_attn:
            l_rconv_in = lasagne.layers.InputLayer(shape=(batch_size, img_h,
                                                          recurrent_size))
            l_rconv_in = lasagne.layers.ReshapeLayer(l_rconv_in,
                                                     shape=(batch_size, 1,
                                                            img_h,
                                                            recurrent_size))
            conv_layers = []
            for filter_size in filter_sizes:
                conv_layer = lasagne.layers.Conv2DLayer(
                    l_rconv_in,
                    num_filters=num_filters,
                    filter_size=(filter_size, recurrent_size),
                    stride=(1, 1),
                    nonlinearity=lasagne.nonlinearities.rectify,
                    border_mode='valid')
                pool_layer = lasagne.layers.MaxPool2DLayer(
                    conv_layer, pool_size=(img_h - filter_size + 1, 1))
                conv_layers.append(pool_layer)

            l_hidden1 = lasagne.layers.ConcatLayer(conv_layers)
            l_hidden2 = lasagne.layers.DenseLayer(
                l_hidden1,
                num_units=hidden_size,
                nonlinearity=lasagne.nonlinearities.tanh)
            l_out = l_hidden2
        else:
            l_out = l_recurrent

        if conv_attn:
            e_context = l_recurrent.get_output(c_input,
                                               mask=c_mask,
                                               deterministic=False)
            e_response = l_recurrent.get_output(r_input,
                                                mask=r_mask,
                                                deterministic=False)

            def step_fn(row_t, mask_t):
                return row_t * mask_t.reshape((-1, 1))

            if is_bidirectional:
                e_context, _ = theano.scan(step_fn,
                                           outputs_info=None,
                                           sequences=[
                                               e_context,
                                               T.concatenate([c_mask, c_mask],
                                                             axis=1)
                                           ])
                e_response, _ = theano.scan(step_fn,
                                            outputs_info=None,
                                            sequences=[
                                                e_response,
                                                T.concatenate([r_mask, r_mask],
                                                              axis=1)
                                            ])
            else:
                e_context, _ = theano.scan(step_fn,
                                           outputs_info=None,
                                           sequences=[e_context, c_mask])
                e_response, _ = theano.scan(step_fn,
                                            outputs_info=None,
                                            sequences=[e_response, r_mask])

            e_context = l_out.get_output(e_context,
                                         mask=c_mask,
                                         deterministic=False)
            e_response = l_out.get_output(e_response,
                                          mask=r_mask,
                                          deterministic=False)
        else:
            e_context = l_out.get_output(
                c_input, mask=c_mask,
                deterministic=False)[T.arange(batch_size), c_seqlen].reshape(
                    (c.shape[0], hidden_size))
            e_response = l_out.get_output(
                r_input, mask=r_mask,
                deterministic=False)[T.arange(batch_size), r_seqlen].reshape(
                    (r.shape[0], hidden_size))

        if encoder.find('cnn') > -1:
            e_conv_context = l_conv.get_output(c_input, deterministic=False)
            e_conv_response = l_conv.get_output(r_input, deterministic=False)
            if encoder.find('rnn') > -1 or encoder.find('lstm') > -1:
                if elemwise_sum:
                    e_context = e_context + e_conv_context
                    e_response = e_response + e_conv_response
                else:
                    e_context = T.concatenate([e_context, e_conv_context],
                                              axis=1)
                    e_response = T.concatenate([e_response, e_conv_response],
                                               axis=1)

                # penalize correlation
                if abs(corr_penalty) > 0:
                    cor = []
                    for i in range(hidden_size if elemwise_sum else 2 *
                                   hidden_size):
                        y1, y2 = e_context, e_response
                        x1 = y1[:, i] - (np.ones(batch_size) *
                                         (T.sum(y1[:, i]) / batch_size))
                        x2 = y2[:, i] - (np.ones(batch_size) *
                                         (T.sum(y2[:, i]) / batch_size))
                        nr = T.sum(x1 * x2) / (T.sqrt(T.sum(x1 * x1)) *
                                               T.sqrt(T.sum(x2 * x2)))
                        cor.append(-nr)
                if abs(xcov_penalty) > 0:
                    e_context_mean = T.mean(e_context, axis=0, keepdims=True)
                    e_response_mean = T.mean(e_response, axis=0, keepdims=True)
                    e_context_centered = e_context - e_context_mean  # (n, i)
                    e_response_centered = e_response - e_response_mean  # (n, j)

                    outer_prod = (e_context_centered.dimshuffle(0, 1, 'x') *
                                  e_response_centered.dimshuffle(0, 'x', 1)
                                  )  # (n, i, j)
                    xcov = T.sum(T.sqr(T.mean(outer_prod, axis=0)))
            else:
                e_context = e_conv_context
                e_response = e_conv_response

        dp = T.batched_dot(e_context, T.dot(e_response, self.M.T))
        #dp = pp('dp')(dp)
        o = T.nnet.sigmoid(dp)
        o = T.clip(o, 1e-7, 1.0 - 1e-7)

        self.shared_data = {}
        for key in ['c', 'r']:
            self.shared_data[key] = theano.shared(
                np.zeros((batch_size, img_h), dtype=np.int32))
        for key in ['c_mask', 'r_mask']:
            self.shared_data[key] = theano.shared(
                np.zeros((batch_size, img_h), dtype=theano.config.floatX))
        for key in ['y', 'c_seqlen', 'r_seqlen']:
            self.shared_data[key] = theano.shared(
                np.zeros((batch_size, ), dtype=np.int32))

        self.probas = T.concatenate([(1 - o).reshape(
            (-1, 1)), o.reshape((-1, 1))],
                                    axis=1)
        self.pred = T.argmax(self.probas, axis=1)
        self.errors = T.sum(T.neq(self.pred, y))
        self.cost = T.nnet.binary_crossentropy(o, y).mean()
        if encoder.find('cnn') > -1 and (encoder.find('rnn') > -1
                                         or encoder.find('lstm') > -1):
            if abs(corr_penalty) > 0:
                self.cost += corr_penalty * T.sum(cor)
            if abs(xcov_penalty) > 0:
                self.cost += xcov_penalty * xcov
        self.l_out = l_out
        self.l_recurrent = l_recurrent
        self.embeddings = embeddings
        self.c = c
        self.r = r
        self.y = y
        self.c_seqlen = c_seqlen
        self.r_seqlen = r_seqlen
        self.c_mask = c_mask
        self.r_mask = r_mask

        self.update_params()
Exemplo n.º 45
0
def diag_normal_nll(z, z_mu, z_log_sigma):
    nll = 0.5 * T.sum(z_log_sigma, axis=1) + \
          T.sum(T.sqr((z - z_mu) / (1e-6 + T.exp(z_log_sigma))), axis=1) / 2.
    return nll
Exemplo n.º 46
0
    def __init__(self, numpy_rng, theano_rng=None, cfg = None, testing = False, input = None):

        self.cfg = cfg
        self.params = []
        self.delta_params   = []
        self.n_ins = cfg.n_ins; self.n_outs = cfg.n_outs
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg
        self.do_maxout = cfg.do_maxout; self.pool_size = cfg.pool_size
        self.max_col_norm = cfg.max_col_norm

        self.layers = []
        self.conv_layers = []
        self.lstm_layers = []
        self.fc_layers = []

        # 1. conv 
        self.conv_layer_configs = cfg.conv_layer_configs
        self.conv_activation = cfg.conv_activation
        self.conv_layers_number = len(self.conv_layer_configs)
        self.use_fast = cfg.use_fast
        # 2. lstm
        self.lstm_layers_sizes = cfg.lstm_layers_sizes
        self.lstm_layers_number = len(self.lstm_layers_sizes)
        # 3. dnn
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)
        self.activation = cfg.activation

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        if input is None:
            self.x = T.matrix('x')
        else:
            self.x = input
        self.y = T.matrix('y') 
       
        #######################
        # build conv layers   #
        #######################
        print '1. start to build conv layer: '+ str(self.conv_layers_number)
        for i in xrange(self.conv_layers_number):
            if i == 0:
                input = self.x
            else:
                input = self.conv_layers[-1].output
            config = self.conv_layer_configs[i]
            conv_layer = ConvLayer(numpy_rng=numpy_rng, input=input,
                                   input_shape = config['input_shape'],
                                   filter_shape = config['filter_shape'],
                                   poolsize = config['poolsize'],
                                   activation = self.conv_activation,
                                   flatten = config['flatten'],
                                   use_fast = self.use_fast, testing = testing)
            print '\tbuild conv layer: ' +str(config['input_shape'])
            self.layers.append(conv_layer)
            self.conv_layers.append(conv_layer)
            self.params.extend(conv_layer.params)
            self.delta_params.extend(conv_layer.delta_params)

        self.conv_output_dim = config['output_shape'][1] * config['output_shape'][2] * config['output_shape'][3]
        print '\t cnn out: '+ str(self.conv_output_dim)
        cfg.n_ins = config['output_shape'][1] * config['output_shape'][2] * config['output_shape'][3]
        print '1. finish conv layer: '+ str(self.layers[-1].n_out)

        #######################
        # build lstm layers   #
        #######################
        print '2. start to build lstm layer: '+ str(self.lstm_layers_number)
        for i in xrange(self.lstm_layers_number):
            if i == 0:
                input_size = self.conv_output_dim 
                input = self.layers[-1].output
            else:
                input_size = self.lstm_layers_sizes[i - 1]
                input = self.layers[-1].output
            print 'build lstm layer: ' + str(input_size)
            lstm_layer = LSTMLayer(rng=numpy_rng, input=input, n_in=input_size, n_out=self.lstm_layers_sizes[i])
            print '\tbuild lstm layer: ' + str(input_size) +' x '+ str(lstm_layer.n_out)
            self.layers.append(lstm_layer)
            self.lstm_layers.append(lstm_layer)
            self.params.extend(lstm_layer.params)
            self.delta_params.extend(lstm_layer.delta_params)
        print '2. finish lstm layer: '+ str(self.layers[-1].n_out)

        #######################
        # build dnnv layers   #
        #######################
        print '3. start to build dnnv layer: '+ str(self.hidden_layers_number)
        for i in xrange(self.hidden_layers_number):
            if i == 0:
                input_size = self.layers[-1].n_out
            else:
                input_size = self.hidden_layers_sizes[i - 1]
            input = self.layers[-1].output
            fc_layer = HiddenLayer(rng=numpy_rng, input=input, n_in=input_size, n_out=self.hidden_layers_sizes[i])
            print '\tbuild dnnv layer: ' + str(input_size) +' x '+ str(fc_layer.n_out)
            self.layers.append(fc_layer)
            self.fc_layers.append(fc_layer)
            self.params.extend(fc_layer.params)
            self.delta_params.extend(fc_layer.delta_params)
        print '3. finish dnnv layer: '+ str(self.layers[-1].n_out)

        #######################
        # build log layers   #
        #######################
        print '4. start to build log layer: 1'
        input_size = self.layers[-1].n_out
        input = self.layers[-1].output
        logLayer = OutputLayer(input=input, n_in=input_size, n_out=self.n_outs)
        print '\tbuild final layer: ' + str(input_size) +' x '+ str(self.n_outs)
        self.layers.append(logLayer)
        self.params.extend(logLayer.params)
        self.delta_params.extend(logLayer.delta_params)
        print '4. finish log layer: '+ str(self.layers[-1].n_out)
        print 'Total layers: '+ str(len(self.layers))

        sys.stdout.flush()

        self.finetune_cost = self.layers[-1].l2(self.y)
        self.errors = self.layers[-1].errors(self.y)

        if self.l2_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()
Exemplo n.º 47
0
 def get_local_cost(self):
     er = T.sqr(self.S - T.dot(self.X, self.W)).sum()
     l1 = T.sqrt(T.sqr(self.X) + 1e-6).sum()
     top_down = self.get_top_down_flow()
     return er + .1 * l1 + top_down
Exemplo n.º 48
0
    def train_batch(self, dataset, batch_size):
        """
        .. todo::

            WRITEME
        """
        #TODO-- this results in compilation happening every time learn is
        # called; we should cache the compilation results, including those
        # inside cg
        X = dataset.get_design_matrix()
        m = X.shape[0]
        assert X.shape[1] == self.nvis

        gamma = N.zeros((batch_size, self.nhid))
        cur_gamma = T.vector(name='cur_gamma')
        cur_v = T.vector(name='cur_v')
        recons = T.dot(cur_gamma, self.W)
        recons.name = 'recons'

        recons_diffs = cur_v - recons
        recons_diffs.name = 'recons_diffs'

        recons_diff_sq = T.sqr(recons_diffs)
        recons_diff_sq.name = 'recons_diff'

        recons_error = T.sum(recons_diff_sq)
        recons_error.name = 'recons_error'

        dict_dists = T.sum(T.sqr(self.W - cur_v), axis=1)
        dict_dists.name = 'dict_dists'

        abs_gamma = abs(cur_gamma)
        abs_gamma.name = 'abs_gamma'

        weighted_dists = T.dot(abs_gamma, dict_dists)
        weighted_dists.name = 'weighted_dists'

        penalty = self.coeff * weighted_dists
        penalty.name = 'penalty'

        #prevent directions of absolute flatness in the hessian
        #W_sq = T.sqr(self.W)
        #W_sq.name = 'W_sq'
        #debug =  T.sum(W_sq)
        debug = 1e-10 * T.sum(dict_dists)
        debug.name = 'debug'

        #J = debug
        J = recons_error + penalty + debug
        J.name = 'J'

        Jf = function([cur_v, cur_gamma], J)

        start = self.rng.randint(m - batch_size + 1)
        batch_X = X[start:start + batch_size, :]

        #TODO-- optimize gamma
        logger.info('optimizing gamma')
        for i in xrange(batch_size):
            #print str(i+1)+'/'+str(batch_size)
            gamma[i, :] = self.optimize_gamma(batch_X[i, :])

        logger.info('max min')
        logger.info(N.abs(gamma).min(axis=0).max())
        logger.info('min max')
        logger.info(N.abs(gamma).max(axis=0).max())

        #Optimize W
        logger.info('optimizing W')
        logger.warning("not tested since switching to Razvan's all-theano "
                       "implementation of linear cg")
        cg.linear_cg(J, [self.W], max_iters=3)

        err = 0.

        for i in xrange(batch_size):
            err += Jf(batch_X[i, :], gamma[i, :])
        assert not N.isnan(err)
        assert not N.isinf(err)
        logger.info('err: {0}'.format(err))
        return True
Exemplo n.º 49
0
    def get_layer_monitoring_channels(self, state_below=None,
                                      state=None, targets=None):

        #sc = abs(self.Xout).sum() #Get last local_error get_local_error()
        #le = self.local_reconstruction_error 
        W, = self.transformer.get_params()

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        row_norms_min = row_norms.min()
        row_norms_min.__doc__ = ("The smallest norm of any row of the "
                                 "weight matrix W. This is a measure of the "
                                 "least influence any visible unit has.")
        '''
        rval = OrderedDict([('row_norms_min',  row_norms_min),
                            ('row_norms_mean', row_norms.mean()),
                            ('row_norms_max',  row_norms.max()),
                            ('col_norms_min',  col_norms.min()),
                            ('col_norms_mean', col_norms.mean()),
                            ('col_norms_max',  col_norms.max())])#,
                            #('sparse_code_l1_norm', sc.mean())])
        '''
        rval = OrderedDict()

        
        if False:
            #(state is not None) or (state_below is not None):
            if state is None:
                state = self.fprop(state_below)

            P = state
            #if self.pool_size == 1:
            vars_and_prefixes = [(P, '')]
            #else:
            #    vars_and_prefixes = [(P, 'p_')]

            for var, prefix in vars_and_prefixes:
                v_max = var.max(axis=0)
                v_min = var.min(axis=0)
                v_mean = var.mean(axis=0)
                v_range = v_max - v_min

                # max_x.mean_u is "the mean over *u*nits of the max over
                # e*x*amples". The x and u are included in the name because
                # otherwise it's hard to remember which axis is which when
                # reading the monitor. I use inner.outer rather than
                # outer_of_inner or something like that because I want
                # mean_x.* to appear next to each other in the alphabetical
                # list, as these are commonly plotted together.
                for key, val in [('max_x.max_u', v_max.max()),
                                 ('max_x.mean_u', v_max.mean()),
                                 ('max_x.min_u', v_max.min()),
                                 ('min_x.max_u', v_min.max()),
                                 ('min_x.mean_u', v_min.mean()),
                                 ('min_x.min_u', v_min.min()),
                                 ('range_x.max_u', v_range.max()),
                                 ('range_x.mean_u', v_range.mean()),
                                 ('range_x.min_u', v_range.min()),
                                 ('mean_x.max_u', v_mean.max()),
                                 ('mean_x.mean_u', v_mean.mean()),
                                 ('mean_x.min_u', v_mean.min())]:
                    rval[prefix+key] = val
       
       
        return rval    
Exemplo n.º 50
0
    def create_updates(self, params, verbose=1):
        """
        Creates all the update rules and internal state (velocities and
        accumulators) that trainers can then iterate over. The gradients in
        self.gradients must correspond one-to-one with params.

        Args:
            params: Supply the learnable, active parameters of a network.
            verbose: Just as always
        """

        # accumulate velocities for momentum
        if verbose >= 3:
            print "... creating internal parameters for all the optimizations"
        velocities = []
        for param in params:
            velocity = theano.shared(
                numpy.zeros(param.get_value(borrow=True).shape,
                            dtype=theano.config.floatX))
            velocities.append(velocity)

        # these are used for second order optimizers.
        accumulator_1 = []
        accumulator_2 = []
        for param in params:
            eps = numpy.zeros_like(param.get_value(borrow=True),
                                   dtype=theano.config.floatX)
            accumulator_1.append(theano.shared(eps, borrow=True))
            accumulator_2.append(theano.shared(eps, borrow=True))

        # these are used for adam.
        timestep = theano.shared(numpy.asarray(0., dtype=theano.config.floatX))
        delta_t = timestep + 1
        b1 = 0.9  # for ADAM
        b2 = 0.999  # for ADAM
        a = T.sqrt(1 - b2**delta_t) / (1 - b1**delta_t)  # for ADAM

        # to avoid division by zero
        fudge_factor = 1e-7
        if verbose >= 3:
            print "... Building backprop network."

        # This is copied straight from my old toolbox: Samosa. I hope this is working correctly.
        # There might be a better way to have written these... different methods for different
        # optimizers perhaps ?
        if verbose >= 3:
            print "... Applying " + self.optimizer_type
            print "... Applying " + self.momentum_type
        self.updates = OrderedDict()
        for velocity, gradient, acc_1, acc_2, param in zip(
                velocities, self.gradients, accumulator_1, accumulator_2,
                params):
            if self.optimizer_type == 'adagrad':
                """ Adagrad implemented from paper:
                John Duchi, Elad Hazan, and Yoram Singer. 2011. Adaptive subgradient methods
                for online learning and stochastic optimization. JMLR
                """
                current_acc_1 = acc_1 + T.sqr(gradient)  # Accumulates Gradient
                self.updates[
                    acc_1] = current_acc_1  # updates accumulation at timestamp

            elif self.optimizer_type == 'rmsprop':
                """ Tieleman, T. and Hinton, G. (2012):
                Neural Networks for Machine Learning, Lecture 6.5 - rmsprop.
                Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20)"""
                rms_rho = 0.9
                current_acc_1 = rms_rho * acc_1 + (1 -
                                                   rms_rho) * T.sqr(gradient)
                self.updates[acc_1] = current_acc_1

            elif self.optimizer_type == 'sgd':
                current_acc_1 = 1.

            elif self.optimizer_type == 'adam':
                """ Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." 
                     arXiv preprint arXiv:1412.6980 (2014)."""
                if not self.momentum_type == '_adam':
                    if verbose >= 3 and not self.momentum_type == 'false':
                        print "... ADAM doesn't need explicit momentum. Momentum is removed."
                    self.momentum_type = '_adam'

                current_acc_2 = b1 * acc_2 + (1 - b1) * gradient
                current_acc_1 = b2 * acc_1 + (1 - b2) * T.sqr(gradient)
                self.updates[acc_2] = current_acc_2
                self.updates[acc_1] = current_acc_1

            if self.momentum_type == '_adam':
                self.updates[velocity] = a * current_acc_2 / (
                    T.sqrt(current_acc_1) + fudge_factor)

            elif self.momentum_type == 'false':  # no momentum
                self.updates[velocity] = -(self.learning_rate / T.sqrt(
                    current_acc_1 + fudge_factor)) * gradient
            elif self.momentum_type == 'polyak':  # if polyak momentum
                """ Momentum implemented from paper:  
                Polyak, Boris Teodorovich. "Some methods of speeding up the convergence of 
                iteration methods."  USSR Computational Mathematics and Mathematical 
                Physics 4.5 (1964): 1-17.
    
                Adapted from Sutskever, Ilya, Hinton et al. "On the importance of initialization
                and momentum in deep learning.", Proceedings of the 30th international 
                conference on machine learning (ICML-13). 2013. equation (1) and equation (2)"""

                self.updates[velocity] = self.momentum * velocity - (1.- self.momentum) * \
                                 ( self.learning_rate / T.sqrt(current_acc_1 + fudge_factor)) \
                                                                                    * gradient

            elif self.momentum_type == 'nesterov':  # Nesterov accelerated gradient
                """Nesterov, Yurii. "A method of solving a convex programming problem with 
                convergence rate O (1/k2)." Soviet Mathematics Doklady. Vol. 27. No. 2. 1983.
                Adapted from 
                https://blogs.princeton.edu/imabandit/2013/04/01/acceleratedgradientdescent/ 
    
                Instead of using past params we use the current params as described in this link
                https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617,"""

                self.updates[velocity] = self.momentum * velocity - (1.-self.momentum) * \
                                ( self.learning_rate / T.sqrt(current_acc_1 + fudge_factor)) \
                                                                                    * gradient
                self.updates[param] = self.momentum * self.updates[velocity]

            else:
                if verbose >= 3:
                    print "... Unrecognized mometum type, switching to no momentum."
                self.momentum_type = 'false'
                self.updates[velocity] = -(self.learning_rate / T.sqrt(
                    current_acc_1 + fudge_factor)) * gradient
            stepped_param = param + self.updates[velocity]
            if self.momentum_type == 'nesterov':
                stepped_param = stepped_param + self.updates[param]

            column_norm = True  # I don't fully understand whether this is
            # still needed after BN is implemented. It has been around since
            # my first ever implementation of samosa, and I haven't tested it.
            if param.get_value(borrow=True).ndim == 2 and column_norm is True:
                """ constrain the norms of the COLUMNs of the weight, according to
                https://github.com/BVLC/caffe/issues/109 """
                col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
                desired_norms = T.clip(col_norms, 0, T.sqrt(15))
                scale = desired_norms / (fudge_factor + col_norms)
                self.updates[param] = stepped_param * scale
            else:
                self.updates[param] = stepped_param

        if self.optimizer_type == 'adam':
            self.updates[timestep] = delta_t
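
For reference, the standard Adam step from the paper cited above can be sketched in plain NumPy as below. In the class above, current_acc_2 plays the role of the first-moment estimate and current_acc_1 the second-moment estimate, and a is presumably the bias-corrected step size derived from timestep elsewhere in the class; the names in this sketch (adam_step, m, v) are illustrative only.

import numpy as np

def adam_step(param, grad, m, v, t, lr=1e-3, b1=0.9, b2=0.999, eps=1e-8):
    # m and v are the running first and second moment estimates; t >= 1.
    m = b1 * m + (1 - b1) * grad          # cf. current_acc_2 above
    v = b2 * v + (1 - b2) * grad ** 2     # cf. current_acc_1 above
    m_hat = m / (1 - b1 ** t)             # bias correction
    v_hat = v / (1 - b2 ** t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v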
Exemplo n.º 51
0
def GaussianNLL(y, mu, sig):
    """ Negative log-likelihood of y under a diagonal Gaussian with mean mu
    and standard deviation sig, summed over the first axis. """

    nll = 0.5 * T.sum(
        T.sqr(y - mu) / sig**2 + 2 * T.log(sig) + T.log(2 * numpy.pi), axis=0)
    return nll
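
A quick numeric sanity check of the expression above, assuming scipy is available; the input values are hypothetical.

import numpy as np
from scipy.stats import norm

y = np.array([0.5, -1.0]); mu = np.zeros(2); sig = np.array([1.0, 2.0])
nll_manual = 0.5 * np.sum(
    (y - mu) ** 2 / sig ** 2 + 2 * np.log(sig) + np.log(2 * np.pi))
nll_scipy = -np.sum(norm.logpdf(y, loc=mu, scale=sig))
assert np.isclose(nll_manual, nll_scipy)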
Exemplo n.º 52
0
    def get_local_cost(self):
        er = T.sqr(self.S - self.transformer.lmul(self.X)).sum()
        l1 = T.sqrt(T.sqr(self.X) + 1e-6).sum()
        top_down = self.get_top_down_flow()
        return er + .1 * l1 + top_down
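
The l1 term above uses sqrt(x^2 + eps) as a smoothed absolute value: it approaches |x| as eps goes to zero but remains differentiable at x = 0. A small NumPy illustration with hypothetical values:

import numpy as np

x = np.linspace(-1.0, 1.0, 5)
smooth_abs = np.sqrt(x ** 2 + 1e-6)
# the gap to the true absolute value is bounded by sqrt(eps)
print(np.max(np.abs(smooth_abs - np.abs(x))))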
Exemplo n.º 53
0
    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

    cnn = binary_net.DenseLayer(cnn,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=10)

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if binary:

        # W updates
        W = lasagne.layers.get_all_params(cnn, binary=True)
        W_grads = binary_net.compute_grads(loss, cnn)
        updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                       params=W,
                                       learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, cnn)

        # other parameters updates
        params = lasagne.layers.get_all_params(cnn,
                                               trainable=True,
                                               binary=False)
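
The squared hinge loss used above assumes targets encoded as -1/+1 rather than 0/1. A minimal NumPy illustration with hypothetical outputs and labels:

import numpy as np

target = np.array([1., -1., 1.])     # labels encoded as +/-1
output = np.array([2.0, -0.5, 0.3])  # raw network outputs
loss = np.mean(np.square(np.maximum(0., 1. - target * output)))
# the loss is zero only when every example is on the correct side with margin >= 1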
Exemplo n.º 54
0
def inner_fn(t, stm1, postm1, vtm1):

    # Use hidden state to generate action state
    aht = T.dot(Wa_aht_st, T.reshape(stm1, (n_s, n_proc))) + ba_aht
    #aht2 = T.dot(Wa_aht2_aht, T.reshape(aht,(n_s,n_proc))) + ba_aht2
    #aht3 = T.dot(Wa_aht3_aht2, T.reshape(aht2,(n_s,n_proc))) + ba_aht3
    atm1_mu = T.dot(Wa_atmu_aht, T.reshape(aht, (n_s, n_proc))) + ba_atmu
    atm1_sig = T.nnet.softplus(
        T.dot(Wa_atsig_aht, T.reshape(aht, (n_s, n_proc))) +
        ba_atsig) + sig_min_action

    # Sample Action
    atm1 = atm1_mu + theano_rng.normal((n_oa, n_proc)) * atm1_sig

    # Update Environment
    action_force = T.tanh(atm1)
    force = T.switch(
        T.lt(postm1, 0.0), -2 * postm1 - 1, -T.pow(1 + 5 * T.sqr(postm1), -0.5)
        - T.sqr(postm1) * T.pow(1 + 5 * T.sqr(postm1), -1.5) -
        T.pow(postm1, 4) / 16.0) - 0.25 * vtm1
    vt = vtm1 + 0.05 * force + 0.03 * action_force
    post = postm1 + vt

    # Generate Sensory Inputs:

    # 1.) Observation of Last Action
    oat = atm1

    # 2.) Noisy Observation of Current Position
    ot = post + theano_rng.normal((n_o, n_proc)) * 0.01

    # 3.) Nonlinear Transformed Sensory Channel
    oht = T.exp(-T.sqr(post - 1.0) / 2.0 / 0.3 / 0.3)

    # Infer hidden state from last hidden state and current observations, using variational density
    hst = T.nnet.relu(
        T.dot(Wq_hst_stm1, T.reshape(stm1, (n_s, n_proc))) +
        T.dot(Wq_hst_ot, T.reshape(ot, (n_o, n_proc))) +
        T.dot(Wq_hst_oht, T.reshape(oht, (n_oh, n_proc))) +
        T.dot(Wq_hst_oat, T.reshape(oat, (n_oa, n_proc))) + bq_hst)
    hst2 = T.nnet.relu(
        T.dot(Wq_hst2_hst, T.reshape(hst, (n_s, n_proc))) + bq_hst2)

    stmu = T.tanh(
        T.dot(Wq_stmu_hst2, T.reshape(hst2, (n_s, n_proc))) + bq_stmu)
    stsig = T.nnet.softplus(
        T.dot(Wq_stsig_hst2, T.reshape(hst2, (n_s, n_proc))) +
        bq_stsig) + sig_min_states

    # Explicitly encode position as homeostatic state variable
    # Rescale representation to fit within linear response of the tanh-nonlinearity
    stmu = T.set_subtensor(stmu[0, :], 0.1 * ot[0, :]).reshape((n_s, n_proc))
    stsig = T.set_subtensor(stsig[0, :], 0.005).reshape((n_s, n_proc))

    # Sample from variational density
    st = stmu + theano_rng.normal((n_s, n_proc)) * stsig

    # Calculate parameters of likelihood distributions from sampled state
    ost = T.nnet.relu(T.dot(Wl_ost_st, T.reshape(st, (n_s, n_proc))) + bl_ost)
    ost2 = T.nnet.relu(
        T.dot(Wl_ost2_ost, T.reshape(ost, (n_s, n_proc))) + bl_ost2)
    ost3 = T.nnet.relu(
        T.dot(Wl_ost3_ost2, T.reshape(ost2, (n_s, n_proc))) + bl_ost3)

    otmu = T.dot(Wl_otmu_st, T.reshape(ost3, (n_s, n_proc))) + bl_otmu
    otsig = T.nnet.softplus(
        T.dot(Wl_otsig_st, T.reshape(ost3, (n_s, n_proc))) +
        bl_otsig) + sig_min_obs

    ohtmu = T.dot(Wl_ohtmu_st, T.reshape(ost3, (n_s, n_proc))) + bl_ohtmu
    ohtsig = T.nnet.softplus(
        T.dot(Wl_ohtsig_st, T.reshape(ost3, (n_s, n_proc))) +
        bl_ohtsig) + sig_min_obs

    oatmu = T.dot(Wl_oatmu_st, T.reshape(ost3, (n_s, n_proc))) + bl_oatmu
    oatsig = T.nnet.softplus(
        T.dot(Wl_oatsig_st, T.reshape(ost3, (n_s, n_proc))) +
        bl_oatsig) + sig_min_obs

    # Calculate negative log-likelihood of observations
    p_ot = GaussianNLL(ot, otmu, otsig)
    p_oht = GaussianNLL(oht, ohtmu, ohtsig)
    p_oat = GaussianNLL(oat, oatmu, oatsig)

    # Calculate prior expectation on hidden state from previous state
    prior_stmu = T.tanh(
        T.dot(Wl_stmu_stm1, T.reshape(stm1, (n_s, n_proc))) + bl_stmu)
    prior_stsig = T.nnet.softplus(
        T.dot(Wl_stsig_stm1, T.reshape(stm1, (n_s, n_proc))) +
        bl_stsig) + sig_min_states

    # Explicitly encode expectations on homeostatic state variable
    prior_stmu = ifelse(T.lt(t, 20), prior_stmu,
                        T.set_subtensor(prior_stmu[0, :], 0.1))
    prior_stsig = ifelse(T.lt(t, 20), prior_stsig,
                         T.set_subtensor(prior_stsig[0, :], 0.005))

    # Calculate KL divergence between variational density and prior density
    # using explicit formula for diagonal gaussians
    KL_st = KLGaussianGaussian(stmu, stsig, prior_stmu, prior_stsig)

    # Put free energy functional together
    FEt = KL_st + p_ot + p_oht + p_oat

    return st, post, vt, oat, ot, oht, FEt, KL_st, stmu, stsig, force, p_ot, p_oht, p_oat
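
KLGaussianGaussian is used above but not defined in this excerpt; it presumably implements the closed-form KL divergence between two diagonal Gaussians, summed over the state dimension like GaussianNLL. A minimal sketch under that assumption (the name KLGaussianGaussian_sketch is illustrative):

import theano.tensor as T

def KLGaussianGaussian_sketch(mu1, sig1, mu2, sig2):
    # KL( N(mu1, sig1^2) || N(mu2, sig2^2) ) for diagonal Gaussians,
    # summed over the first axis.
    return 0.5 * T.sum(
        2 * T.log(sig2) - 2 * T.log(sig1)
        + (T.sqr(sig1) + T.sqr(mu1 - mu2)) / T.sqr(sig2) - 1, axis=0)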
Exemplo n.º 55
0
    def __init__(self, numpy_rng, theano_rng=None,
                 cfg = None,  # the network configuration
                 dnn_shared = None, shared_layers=[], input = None):

        self.layers = []
        self.params = []
        self.delta_params = []
        self.rnn_layerX = 2
        print "Use DRN 2"

        self.cfg = cfg
        self.n_ins = cfg.n_ins; self.n_outs = cfg.n_outs
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)
        self.activation = cfg.activation

        self.do_maxout = cfg.do_maxout; self.pool_size = cfg.pool_size

        self.max_col_norm = cfg.max_col_norm
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg

        self.non_updated_layers = cfg.non_updated_layers

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        if input is None:
            self.x = T.matrix('x')
        else:
            self.x = input 
        self.y = T.matrix('y')

        for i in xrange(self.hidden_layers_number):
            # construct the hidden layer
            if i == 0:
                input_size = self.n_ins
                layer_input = self.x
            else:
                input_size = self.hidden_layers_sizes[i - 1]
                layer_input = self.layers[-1].output

            W = None; b = None
            if i in shared_layers:
                W = dnn_shared.layers[i].W; b = dnn_shared.layers[i].b
			
            if i == self.rnn_layerX:
                hidden_layer = RnnLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i],
                                        W = W, b = b,
                                        activation=self.activation)
            else:
                if self.do_maxout == True:
                    hidden_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                        W = W, b = b,
                                        activation = (lambda x: 1.0*x),
                                        do_maxout = True, pool_size = self.pool_size)
                else:
                    hidden_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i],
                                        W = W, b = b,
                                        activation=self.activation)
            # add the layer to our list of layers
            self.layers.append(hidden_layer)
            # if the layer index is included in self.non_updated_layers, parameters of this layer will not be updated
            if (i not in self.non_updated_layers):
                self.params.extend(hidden_layer.params)
                self.delta_params.extend(hidden_layer.delta_params)
        # We now need to add a logistic layer on top of the MLP
        self.logLayer = OutputLayer(
                         input=self.layers[-1].output,
                         n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs)

        if self.n_outs > 0:
            self.layers.append(self.logLayer)
            self.params.extend(self.logLayer.params)
            self.delta_params.extend(self.logLayer.delta_params)
       
        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.l2(self.y)
        self.errors = self.logLayer.errors(self.y)

        if self.l1_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l1_reg * (abs(W).sum())

        if self.l2_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()
Exemplo n.º 56
0
def RNN():
    # First, we build the network for the first sentence, starting with an input layer.
    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features).
    # The batch size is given as None because we are still experimenting with the
    # meaning and proper usage of that parameter.
    # The sequence length corresponds to time steps, but it is variable and depends
    # on the input length, so it is also given as None.
    # The number of features is 300 because each word is a vector of 300 dimensions.

    W = lasagne.init.HeUniform()
    l_in_1 = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH, N_FEATURES))
    l_mask_1 = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH))
    l_forward_1 = lasagne.layers.RecurrentLayer(
        l_in_1,
        N_HIDDEN,
        mask_input=l_mask_1,
        grad_clipping=GRAD_CLIP,
        W_in_to_hid=W,
        W_hid_to_hid=lasagne.init.HeUniform(),
        nonlinearity=lasagne.nonlinearities.tanh)
    l_forward_2_1 = lasagne.layers.RecurrentLayer(
        l_forward_1,
        N_HIDDEN,
        mask_input=l_mask_1,
        grad_clipping=GRAD_CLIP,
        W_in_to_hid=W,
        W_hid_to_hid=lasagne.init.HeUniform(),
        nonlinearity=lasagne.nonlinearities.tanh)
    #     l_forward_3_1 = lasagne.layers.RecurrentLayer(
    #         l_forward_2_1, N_HIDDEN, mask_input=l_mask_1, grad_clipping=GRAD_CLIP,
    #         W_in_to_hid=W,
    #         W_hid_to_hid=lasagne.init.HeUniform(),
    #         nonlinearity=lasagne.nonlinearities.tanh)
    l_out_1 = lasagne.layers.SliceLayer(l_forward_2_1, -1, 1)
    #l_out_1 = lasagne.layers.DenseLayer(l_forward_1, num_units=n_output)

    l_in_2 = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH, N_FEATURES))
    l_mask_2 = lasagne.layers.InputLayer(shape=(None, MAX_LENGTH))
    l_forward_2 = lasagne.layers.RecurrentLayer(
        l_in_2,
        N_HIDDEN,
        mask_input=l_mask_2,
        grad_clipping=GRAD_CLIP,
        W_in_to_hid=lasagne.init.HeUniform(),
        W_hid_to_hid=lasagne.init.HeUniform(),
        nonlinearity=lasagne.nonlinearities.tanh)
    l_forward_2_2 = lasagne.layers.RecurrentLayer(
        l_forward_2,
        N_HIDDEN,
        mask_input=l_mask_2,
        grad_clipping=GRAD_CLIP,
        W_in_to_hid=lasagne.init.HeUniform(),
        W_hid_to_hid=lasagne.init.HeUniform(),
        nonlinearity=lasagne.nonlinearities.tanh)
    #     l_forward_3_2 = lasagne.layers.RecurrentLayer(
    #         l_forward_2_2, N_HIDDEN, mask_input=l_mask_2, grad_clipping=GRAD_CLIP,
    #         W_in_to_hid=lasagne.init.HeUniform(),
    #         W_hid_to_hid=lasagne.init.HeUniform(),
    #         nonlinearity=lasagne.nonlinearities.tanh)
    l_out_2 = lasagne.layers.SliceLayer(l_forward_2_2, -1, 1)
    #l_out_2 = lasagne.layers.DenseLayer(l_forward_2, num_units=n_output)

    # target cosine similarity of the pair of sentences
    target_values = T.vector('target_output')
    network_output_1 = lasagne.layers.get_output(l_out_1)
    #network_output_1 = lasagne.layers.get_output(l_out_1)
    network_output_2 = lasagne.layers.get_output(l_out_2)
    #network_output_2 = lasagne.layers.get_output(l_out_2)
    mod_y_1 = T.sqrt(T.sum(T.sqr(network_output_1), 1))
    mod_y_2 = T.sqrt(T.sum(T.sqr(network_output_2), 1))
    cosine_simi = T.sum(network_output_1 * network_output_2,
                        axis=1) / (mod_y_1 * mod_y_2)
    cost = T.mean((cosine_simi - target_values)**2)
    # cosine_sim = T.sum(network_output_1*network_output_2,axis = 1)
    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(
        l_out_1) + lasagne.layers.get_all_params(l_out_2)
    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)
    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([
        l_in_1.input_var, l_in_2.input_var, target_values, l_mask_1.input_var,
        l_mask_2.input_var
    ],
                            cost,
                            updates=updates,
                            on_unused_input='warn')
    #     compute_cost = theano.function(
    #         [l_in_1.input_var, l_in_2.input_var, target_values, l_mask_1.input_var,
    #                               l_mask_2.input_var], cost, on_unused_input='warn')
    #
    test_cosine = theano.function([
        l_in_1.input_var, l_in_2.input_var, target_values, l_mask_1.input_var,
        l_mask_2.input_var
    ],
                                  cosine_simi,
                                  on_unused_input='warn')

    train_sentence_1, train_sentence_2, cosineSimtrain, mask_train_1, mask_train_2 \
           ,test_sentence_undergoer_1,  test_sentence_undergoer_2 \
           ,cosineSimUndergoer, mask_undergoer_test_1, mask_undergoer_test_2 \
           ,test_sentence_trigger_1,  test_sentence_trigger_2 \
           ,cosineSimTrigger, mask_trigger_test_1, mask_trigger_test_2\
           ,test_sentence_enabler_1,  test_sentence_enabler_2, \
           cosineSimEnabler, mask_enabler_test_1, mask_enabler_test_2\
           ,test_sentence_result_1,  test_sentence_result_2, \
           cosineSimResult, mask_result_test_1, mask_result_test_2,\
            test_df = gen_csvdata()

    print("Training ...")
    try:
        for epoch in range(num_epochs):
            cost_val = train(train_sentence_1, train_sentence_2,
                             cosineSimtrain, mask_train_1, mask_train_2)
            #cost_val = compute_cost(train_sentence_1, train_sentence_2, cosineSimtrain, mask_train_1,mask_train_2 )
            print("Epoch {} validation cost = {}".format(epoch, cost_val))
            if epoch % 100 == 0:
                cosine_undergoersim = test_cosine(test_sentence_undergoer_1,  test_sentence_undergoer_2,\
                              cosineSimUndergoer, mask_undergoer_test_1, mask_undergoer_test_2)
                cosine_enablersim = test_cosine(test_sentence_enabler_1,  test_sentence_enabler_2,\
                              cosineSimEnabler, mask_enabler_test_1, mask_enabler_test_2)
                cosine_triggersim = test_cosine(test_sentence_trigger_1,  test_sentence_trigger_2,\
                              cosineSimTrigger, mask_trigger_test_1, mask_trigger_test_2)
                cosine_resultsim = test_cosine(test_sentence_result_1,  test_sentence_result_2,\
                              cosineSimResult, mask_result_test_1, mask_result_test_2)
                test_df["newUndergoerScore"] = cosine_undergoersim
                test_df["newEnablerScore"] = cosine_enablersim
                test_df["newTriggerScore"] = cosine_triggersim
                test_df["newResultScore"] = cosine_resultsim
                test_df["avgOurScore"] = test_df.apply(averageFinalScore,
                                                       axis=1)
                directory = "newresult/prediction/deep2RNNphys_wordvec200/" + str(
                    epoch)
                if not os.path.exists(directory):
                    os.makedirs(directory)
                test_df.to_csv(directory + "/cosineSimilarity.csv")

    except KeyboardInterrupt:
        pass
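
The cost above is the mean squared difference between the cosine similarity of the two sentence encodings and the target score; note that the denominator mod_y_1 * mod_y_2 carries no epsilon, so an all-zero encoding would divide by zero. A small NumPy illustration with hypothetical encodings and targets:

import numpy as np

out1 = np.array([[1.0, 0.0], [1.0, 1.0]])
out2 = np.array([[1.0, 0.0], [1.0, 0.0]])
targets = np.array([1.0, 0.8])
cos = np.sum(out1 * out2, axis=1) / (
    np.linalg.norm(out1, axis=1) * np.linalg.norm(out2, axis=1))
cost = np.mean((cos - targets) ** 2)   # per-pair cosine vs. target, then MSE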
Exemplo n.º 57
0
    def squared_errors(self,y):
        """ Returns the mean of squared errors of the linear regression on this data. """
        #return  (T.mean(T.sqr(self.y_pred - y),axis=0))

        # return T.mean(T.sqr(self.y_pred - y),axis=1)
        return T.mean(T.sqr(self.y_pred - y))
Exemplo n.º 58
0
    def init_updates(self):
        gparams = T.grad(self.cost, self.params, disconnected_inputs='warn')

        # Remove NaNs
        if self.nan_protection:
            gparams = [
                T.switch(T.or_(T.isnan(g), T.isinf(g)), 0., g) for g in gparams
            ]

        # gradient clipping (grad_norm)
        if self.grad_norm_clip is not None:
            # global gradient norm: square root of the total sum of squared entries
            grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
            scale = self.grad_norm_clip / T.maximum(self.grad_norm_clip,
                                                    grad_norm)
            gparams = [g * scale for g in gparams]

        # Gradient clipping (hard)
        if self.grad_clip is not None:
            gparams = [T.minimum(g, self.grad_clip) for g in gparams]
            gparams = [T.maximum(g, -1. * self.grad_clip) for g in gparams]

        lr = defaultdict(lambda *args: self.lr)
        try:
            lr.update(dict(self.notifier.notify(Notifier.LEARNING_RATES)))
        except Exception:
            pass

        mult = defaultdict(lambda *args: np.cast[fx](1))
        try:
            mult.update(dict(self.notifier.notify(Notifier.PARAM_MULT)))
        except Exception:
            pass

        mom = defaultdict(lambda *args: self.mom)
        try:
            mom.update(dict(self.notifier.notify(Notifier.MOMENTUM)))
        except Exception:
            pass

        # Parameter updates
        updates_param = [
            (param, param * mult[param.name] - \
             lr[param.name] * ((1 - mom[param.name]) * gparam_cur + mom[param.name] * gparam_last))
            for param, gparam_cur, gparam_last in zip(self.params, gparams, self.gparams)
        ]

        # gradient updates for momentum
        updates_gparam = [
            (gparam_last, gparam_cur)
            for gparam_last, gparam_cur in zip(self.gparams, gparams)
        ]

        updates = updates_param + updates_gparam

        # Callback to an external function. E.g. there are non-integrable nodes
        # which should be added after the gradient calculation.
        if self.notifier is not None:
            if len(self.notifier.callbacks[Notifier.GRADIENT_CALCULATED]) > 0:
                grads_new = self.notifier.notify(Notifier.GRADIENT_CALCULATED,
                                                 updates)
                if grads_new is not None and len(grads_new) > 0:
                    updates = np.vstack(grads_new)

        # ensure that the broadcastpattern before and after the update is identical
        updates = [(k, T.patternbroadcast(v, k.broadcastable))
                   for k, v in updates]

        validate = self.validate if self.validate is not None else self.cost

        return theano.function(inputs=self.variables.values(),
                               outputs=validate,
                               updates=updates,
                               allow_input_downcast=True,
                               on_unused_input='warn')
Exemplo n.º 59
0
    def fn(images):
        return T.sum(T.sqr(images2neibs(images, (2, 2),
                                        mode='ignore_borders')),
                     axis=[0, 1])
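
A hedged usage sketch of images2neibs as applied above: it takes a 4-D (batch, channels, rows, cols) tensor and returns a 2-D matrix whose rows are the flattened patches, so the expression sums the squared entries over all 2x2 patches. Names and values below are illustrative.

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet.neighbours import images2neibs

images = T.tensor4('images')
patch_sq_sum = T.sum(T.sqr(images2neibs(images, (2, 2), mode='ignore_borders')),
                     axis=[0, 1])
f = theano.function([images], patch_sq_sum)
x = np.arange(16, dtype=theano.config.floatX).reshape((1, 1, 4, 4))
print(f(x))   # sum of squares over all non-overlapping 2x2 patches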
Exemplo n.º 60
0
    def reg_mse(self, y):
        """ Returns the mean of squared errors of the linear regression with l1 and l2 regularization on this data. """
        # Note: the L1 and L2 terms below are computed on the residuals
        # (y_pred - y), not on the model weights.
        L1 = T.sum(abs(self.y_pred - y))
        L2_sqr = T.sum((self.y_pred - y)**2)
        return T.mean(T.sqr(self.y_pred - y)) + 0.01*L1 + 0.01*L2_sqr