Exemplo n.º 1
0
class ActivationSoftmax(Activation):
    '''Activation function softmax(A)'''
    def __init__(self):
        self._tmp_denom = TempMatrix()

    def name(self):         return "softmax"
    def ideal_domain(self): return [0.0,1.0]
    def ideal_range(self):  return [0.0,1.0]
    def actual_range(self): return [0.0,1.0]
    def ideal_loss(self):   return 'nll'

    def __call__(self,A,out=None,dout=None):
        # First pre-allocate enough memory to accumulate denominator of each sample
        denom = self._tmp_denom.get_capacity(A.shape[0],1)

        # Then compute softmax
        if out == None:
            expA = exp(A)
            sum(expA,axis=1,out=denom)
            return (1./denom) * expA
        exp(A,out=out)
        sum(out,axis=1,out=denom)
        reciprocal(denom,out=denom)
        multiply(out,denom,out=out)
        if dout != None:
            pass # for Softmax+NLL, 'df' is not used to compute Cost.dloss, so don't bother setting the dout in that case
Exemplo n.º 2
0
class ActivationSoftmax(Activation):
    '''Activation function softmax(A)'''
    def __init__(self):
        self._tmp_denom = TempMatrix()

    def name(self):
        return "softmax"

    def ideal_domain(self):
        return [0.0, 1.0]

    def ideal_range(self):
        return [0.0, 1.0]

    def actual_range(self):
        return [0.0, 1.0]

    def ideal_loss(self):
        return 'nll'

    def __call__(self, A, out, dout):
        # First pre-allocate enough memory to accumulate denominator of each sample
        maxval = denom = self._tmp_denom.get_capacity(A.shape[0], 1)

        # Then compute logsum softmax (subtract off maximum value)
        max(A, axis=1, out=maxval)
        subtract(A, maxval, out=out)
        exp(out, out=out)
        sum(out, axis=1, out=denom)
        reciprocal(denom, out=denom)
        multiply(out, denom, out=out)
Exemplo n.º 3
0
class ActivationSoftmax(Activation):
    '''Activation function softmax(A)'''
    def __init__(self):
        self._tmp_denom = TempMatrix()

    def name(self):         return "softmax"
    def ideal_domain(self): return [0.0,1.0]
    def ideal_range(self):  return [0.0,1.0]
    def actual_range(self): return [0.0,1.0]
    def ideal_loss(self):   return 'nll'

    def __call__(self,A,out,dout):
        # First pre-allocate enough memory to accumulate denominator of each sample
        maxval = denom = self._tmp_denom.get_capacity(A.shape[0],1)

        # Then compute logsum softmax (subtract off maximum value)
        max(A,axis=1,out=maxval)
        subtract(A,maxval,out=out)
        exp(out,out=out)
        sum(out,axis=1,out=denom)
        reciprocal(denom,out=denom)
        multiply(out,denom,out=out)
Exemplo n.º 4
0
class Model(object):
    '''
    A trainable model. parameters,
    inputs X, and targets Y.

    Let Z = M(X) be the final outputs, and let H be the hidden activities incurred by X.
    A cost is defined as the sum of three additive components:
       - loss(Z,Y) depends on the final outputs and targets
       - regularizer(H) depends only on the hidden activations
       - penalty(M) depends ony on the model and its own parameters
    '''
    def __init__(self, loss=None):
        if isinstance(loss, str):
            self._loss_type = loss
            self._loss_fn = None
            self._loss_delta_fn = None
            if loss == 'mse':
                self._loss_fn = self._loss_mse
                self._loss_delta_fn = self._loss_delta_mse
            elif loss == 'nll':
                self._loss_fn = self._loss_nll
                self._loss_delta_fn = self._loss_delta_nll
            elif loss:
                raise ValueError("unrecognized loss '%s' requested" %
                                 loss_type)
        else:
            self._loss_type = 'callback'
            self._loss_fn = loss[0]
            self._loss_delta_fn = loss[1]

        self._tmp_E = TempMatrix()  # memory for computing loss
        self._tmp_e = TempMatrix()  # memory for computing loss

    def cost(self, data):
        '''
        Computes the cost = loss(output(X),Y) + regularizer(hidden(X)) + penalty(model)
        '''
        X, Y = data
        H = self.eval(X, want_hidden=True)
        l = self.loss(H[-1], Y)
        r = self.regularizer(H)
        p = self.penalty()
        c = l + r + p
        return c, l, r, p

    def apply_constraints(self):
        pass

    ############### LOSS ###############

    def loss(self, Z, Y):
        '''Loss of outputs Z with respect to targets Y.'''
        return float(self._loss_fn(Z, Y))

    def _loss_delta(self, Z, Y, df, out=None):
        '''Computes the gradient error signal to backpropagate up the network.
           Here df is the derivative f'(A) of the output activation f(A).'''
        return self._loss_delta_fn(Z, Y, df, out)

    def _loss_mse(self, Z, Y):
        """Mean squared error (mse) of ouputs Z with respect to targets Y."""
        E = self._tmp_E.get_capacity(*Z.shape)
        e = self._tmp_e.get_capacity(Z.shape[0], 1)
        subtract(Z, Y, out=E)
        square(E, out=E)
        sum(E, axis=1, out=e)
        return 0.5 * as_numpy(mean(e))  # = mean(sum(square(Z-Y),axis=1))

    def _loss_delta_mse(self, Z, Y, df, out=None):
        """Computes the MSE gradient error signal to backpropagate up the network.
           Here df is the derivative f'(A) of the output activation f(A)."""
        self._loss_delta_nll(Z, Y, df, out)
        imul(out, df)  # = 1/m * (Z-Y) * df    for mse
        return out

    def _loss_nll(self, Z, Y):
        '''Negative log-likelihood of outputs Z with respect to targets Y.'''
        E = self._tmp_E.get_capacity(*Z.shape)
        e = self._tmp_e.get_capacity(Z.shape[0], 1)
        multiply(Z, Y, out=E)
        sum(E, axis=1, out=e)
        log(e, out=e)
        return -as_numpy(mean(e))  # = -mean(log(sum(Z*Y,axis=1)))

    def _loss_delta_nll(self, Z, Y, df, out=None):
        """Computes the NLL gradient error signal to backpropagate up the network."""
        subtract(Z, Y, out=out)
        imul(out, 1. / Z.shape[0])  # = 1/m * (Z-Y)         for nll
        return out

    ############### REGULARIZER ###############

    def regularizer(self, H):
        return 0.0

    ############### PENALTY ###############

    def penalty(self):
        return 0.0

    ################################# UTILITY FUNCTIONS #########################

    def relative_error(self, A, B, abs_eps):
        absA = np.abs(A)
        absB = np.abs(B)
        I = np.logical_not(
            np.logical_or(A == B, np.logical_or(absA < abs_eps,
                                                absB < abs_eps)))
        E = np.zeros(A.shape, dtype=A.dtype)
        E[I] = np.abs(A[I] - B[I]) / min(absA[I] + absB[I])
        return E

    def gradcheck(self, data):
        # Only use a tiny subset of the data, both for speed and to avoid
        # averaging gradient of many inputs.
        data_subset = data[:min(4, data.size)]

        # Swap a new weights object into the model, so that we can purturb the model's weights from outside
        weights0 = self.weights
        self.weights = weights1 = weights0.copy()

        # Compute gradient by forward-difference, looping over each individual weight
        neps = 1e-7
        ngrad = self.make_weights()
        for k in range(len(weights1)):
            w1 = weights1[k]
            wg = ngrad[k]
            for i in range(len(w1)):
                # temporarily perturb parameter w[i] by 'neps' and evaluate the new loss
                temp = w1[i]
                w1[i] -= neps
                c0, l0, r0, p0 = self.cost(data_subset)
                w1[i] = temp
                w1[i] += neps
                c1, l1, r1, p1 = self.cost(data_subset)
                w1[i] = temp
                wg[i] = (c1 - c0) / (2 * neps)

        self.weights = weights0  # restore the original weights object for the model

        # Compute backprop's gradient (bgrad), keeping loss/regularizer/penalty separate
        bgrad = self.grad(data_subset)

        A = ngrad.ravel()
        B = bgrad.ravel()
        aerr = np.abs(A - B)
        rerr = self.relative_error(A, B, 1e-20)
        print 'absolute_error in gradient: min =', np.min(
            aerr), 'max =', np.max(aerr)
        print 'relative_error in gradient: min =', np.min(
            rerr), 'max =', np.max(rerr)
Exemplo n.º 5
0
class Model(object):
    '''
    A trainable model. parameters,
    inputs X, and targets Y.

    Let Z = M(X) be the final outputs, and let H be the hidden activities incurred by X.
    A cost is defined as the sum of three additive components:
       - loss(Z,Y) depends on the final outputs and targets
       - regularizer(H) depends only on the hidden activations
       - penalty(M) depends ony on the model and its own parameters
    '''
    def __init__(self,loss=None):
        if isinstance(loss,str):
            self._loss_type = loss
            self._loss_fn = None
            self._loss_delta_fn = None
            if   loss == 'mse': self._loss_fn = self._loss_mse; self._loss_delta_fn = self._loss_delta_mse
            elif loss == 'nll': self._loss_fn = self._loss_nll; self._loss_delta_fn = self._loss_delta_nll
            elif loss: raise ValueError("unrecognized loss '%s' requested" % loss_type)
        else:
            self._loss_type = 'callback'
            self._loss_fn = loss[0]
            self._loss_delta_fn = loss[1]

        self._tmp_E = TempMatrix()  # memory for computing loss
        self._tmp_e = TempMatrix()  # memory for computing loss

    def cost(self,data):
        '''
        Computes the cost = loss(output(X),Y) + regularizer(hidden(X)) + penalty(model)
        '''
        X,Y = data
        H = self.eval(X,want_hidden=True)
        l = self.loss(H[-1],Y)
        r = self.regularizer(H)
        p = self.penalty()
        c = l+r+p
        return c,l,r,p

    def apply_constraints(self):
        pass

    ############### LOSS ###############

    def loss(self,Z,Y):
        '''Loss of outputs Z with respect to targets Y.'''
        return float(self._loss_fn(Z,Y))

    def _loss_delta(self,Z,Y,df,out=None):
        '''Computes the gradient error signal to backpropagate up the network.
           Here df is the derivative f'(A) of the output activation f(A).'''
        return self._loss_delta_fn(Z,Y,df,out)

    def _loss_mse(self,Z,Y):
        """Mean squared error (mse) of ouputs Z with respect to targets Y."""
        E = self._tmp_E.get_capacity(*Z.shape)
        e = self._tmp_e.get_capacity(Z.shape[0],1)
        subtract(Z,Y,out=E)
        square(E,out=E)
        sum(E,axis=1,out=e)
        return 0.5*as_numpy(mean(e))               # = mean(sum(square(Z-Y),axis=1))

    def _loss_delta_mse(self,Z,Y,df,out=None):
        """Computes the MSE gradient error signal to backpropagate up the network.
           Here df is the derivative f'(A) of the output activation f(A)."""
        self._loss_delta_nll(Z,Y,df,out)
        imul(out,df)                       # = 1/m * (Z-Y) * df    for mse
        return out

    def _loss_nll(self,Z,Y):
        '''Negative log-likelihood of outputs Z with respect to targets Y.'''
        E = self._tmp_E.get_capacity(*Z.shape)
        e = self._tmp_e.get_capacity(Z.shape[0],1)
        multiply(Z,Y,out=E)
        sum(E,axis=1,out=e)
        log(e,out=e)
        return -as_numpy(mean(e))              # = -mean(log(sum(Z*Y,axis=1)))

    def _loss_delta_nll(self,Z,Y,df,out=None):
        """Computes the NLL gradient error signal to backpropagate up the network."""
        subtract(Z,Y,out=out)
        imul(out,1./Z.shape[0])            # = 1/m * (Z-Y)         for nll
        return out


    ############### REGULARIZER ###############

    def regularizer(self,H):
        return 0.0

    ############### PENALTY ###############

    def penalty(self):
        return 0.0

    ################################# UTILITY FUNCTIONS #########################

    def relative_error(self,A,B,abs_eps):
        absA = np.abs(A)
        absB = np.abs(B)
        I = np.logical_not(np.logical_or(A==B,np.logical_or(absA < abs_eps, absB < abs_eps)))
        E = np.zeros(A.shape,dtype=A.dtype)
        E[I] = np.abs(A[I]-B[I]) / min(absA[I] + absB[I])
        return E

    def gradcheck(self,data):
        # Only use a tiny subset of the data, both for speed and to avoid 
        # averaging gradient of many inputs.
        data_subset = data[:min(4,data.size)]

        # Swap a new weights object into the model, so that we can purturb the model's weights from outside
        weights0 = self.weights
        self.weights = weights1 = weights0.copy()

        # Compute gradient by forward-difference, looping over each individual weight
        neps  = 1e-7
        ngrad = self.make_weights()
        for k in range(len(weights1)):
            w1 = weights1[k]; wg = ngrad[k]
            for i in range(len(w1)):
                # temporarily perturb parameter w[i] by 'neps' and evaluate the new loss
                temp = w1[i]; 
                w1[i] -= neps; c0,l0,r0,p0 = self.cost(data_subset); w1[i] = temp
                w1[i] += neps; c1,l1,r1,p1 = self.cost(data_subset); w1[i] = temp
                wg[i] = (c1-c0)/(2*neps)

        self.weights = weights0  # restore the original weights object for the model

        # Compute backprop's gradient (bgrad), keeping loss/regularizer/penalty separate
        bgrad = self.grad(data_subset)

        A = ngrad.ravel()
        B = bgrad.ravel()
        aerr = np.abs(A-B)
        rerr = self.relative_error(A,B,1e-20)
        print 'absolute_error in gradient: min =', np.min(aerr), 'max =', np.max(aerr)
        print 'relative_error in gradient: min =', np.min(rerr), 'max =', np.max(rerr)