Example #1
    def backward(self, y_pred, y_train, cache):
        X, h1_cache, h2_cache, score_cache, nl_cache1, nl_cache2, u1, u2, bn1_cache, bn2_cache = cache

        # Output layer
        grad_y = self.dloss_funs[self.loss](y_pred, y_train)

        # Third layer
        dh2, dW3, db3 = l.fc_backward(grad_y, score_cache)
        dW3 += reg.dl2_reg(self.model['W3'], self.lam)
        dh2 = self.backward_nonlin(dh2, nl_cache2)
        dh2 = l.dropout_backward(dh2, u2)
        dh2, dgamma2, dbeta2 = l.bn_backward(dh2, bn2_cache)

        # Second layer
        dh1, dW2, db2 = l.fc_backward(dh2, h2_cache)
        dW2 += reg.dl2_reg(self.model['W2'], self.lam)
        dh1 = self.backward_nonlin(dh1, nl_cache1)
        dh1 = l.dropout_backward(dh1, u1)
        dh1, dgamma1, dbeta1 = l.bn_backward(dh1, bn1_cache)

        # First layer
        _, dW1, db1 = l.fc_backward(dh1, h1_cache)
        dW1 += reg.dl2_reg(self.model['W1'], self.lam)

        grad = dict(
            W1=dW1, W2=dW2, W3=dW3, b1=db1, b2=db2, b3=db3, gamma1=dgamma1,
            gamma2=dgamma2, beta1=dbeta1, beta2=dbeta2
        )

        return grad
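
Backward implementations like the ones in these examples are usually verified with a numerical gradient check. Below is a minimal, self-contained sketch in plain NumPy; the toy fc-plus-softmax loss, the shapes, and the helper names are illustrative assumptions, not part of the code above.

import numpy as np

def numerical_grad(f, x, eps=1e-5):
    # Central-difference estimate of the gradient of a scalar function f
    # with respect to the array x (perturbed in place and restored).
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + eps
        f_plus = f()
        x[idx] = orig - eps
        f_minus = f()
        x[idx] = orig
        grad[idx] = (f_plus - f_minus) / (2. * eps)
        it.iternext()
    return grad

# Toy setup: softmax cross-entropy on a single fully connected layer.
np.random.seed(0)
X = np.random.randn(5, 3)
W = np.random.randn(3, 4)
y = np.array([0, 2, 1, 3, 0])

def softmax(scores):
    e = np.exp(scores - scores.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def loss():
    p = softmax(X @ W)
    return -np.log(p[np.arange(len(y)), y]).mean()

# Analytic gradient of the same loss with respect to W.
p = softmax(X @ W)
dscores = p.copy()
dscores[np.arange(len(y)), y] -= 1.
dscores /= len(y)
dW = X.T @ dscores

# The two estimates should agree to within roughly 1e-8.
print(np.max(np.abs(dW - numerical_grad(loss, W))))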
Example #2
    def backward(self, y_pred, y_train, dh_next, cache):
        X, X_prime, h_old, hz, hz_cache, hz_sigm_cache, hr, hr_cache, hr_sigm_cache, hh, hh_cache, hh_tanh_cache, h, y_cache = cache

        # Cross-entropy/softmax gradient at the output
        dy = loss_fun.dcross_entropy(y_pred, y_train)

        # Hidden-to-output layer, plus the gradient flowing back from the next time step
        dh, dWy, dby = l.fc_backward(dy, y_cache)
        dh += dh_next

        # h = (1 - hz) * h_old + hz * hh, so split dh over candidate, old state, and update gate
        dhh = hz * dh
        dh_old1 = (1. - hz) * dh
        dhz = hh * dh - h_old * dh

        # Candidate hidden state: back through the tanh, then its fully connected layer
        dhh = l.tanh_backward(dhh, hh_tanh_cache)
        dX_prime, dWh, dbh = l.fc_backward(dhh, hh_cache)

        # The first H columns of X_prime hold the reset-gated previous state hr * h_old
        dh_prime = dX_prime[:, :self.H]
        dh_old2 = hr * dh_prime

        # Reset gate
        dhr = h_old * dh_prime
        dhr = l.sigmoid_backward(dhr, hr_sigm_cache)
        dXr, dWr, dbr = l.fc_backward(dhr, hr_cache)

        # Update gate
        dhz = l.sigmoid_backward(dhz, hz_sigm_cache)
        dXz, dWz, dbz = l.fc_backward(dhz, hz_cache)

        # Both gates see X = [h_old, x]; accumulate the h_old part
        dX = dXr + dXz
        dh_old3 = dX[:, :self.H]

        # Total gradient carried to the previous time step
        dh_next = dh_old1 + dh_old2 + dh_old3

        grad = dict(Wz=dWz, Wr=dWr, Wh=dWh, Wy=dWy, bz=dbz, br=dbr, bh=dbh, by=dby)

        return grad, dh_next
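
The gradient expressions above imply a GRU step of roughly the following form. This is a minimal NumPy sketch under assumed shapes and names, not the surrounding project's forward code.

import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

H, D = 4, 3                                   # assumed hidden and input sizes
x_t = np.random.randn(1, D)
h_old = np.zeros((1, H))

# Each gate is a fully connected layer over a concatenation of state and input
Wz, Wr, Wh = (np.random.randn(H + D, H) * 0.01 for _ in range(3))
bz, br, bh = np.zeros((1, H)), np.zeros((1, H)), np.zeros((1, H))

X = np.column_stack((h_old, x_t))             # what the z and r gates see
hz = sigmoid(X @ Wz + bz)                     # update gate
hr = sigmoid(X @ Wr + br)                     # reset gate

X_prime = np.column_stack((hr * h_old, x_t))  # reset applied to the recurrent part only
hh = np.tanh(X_prime @ Wh + bh)               # candidate hidden state

h = (1. - hz) * h_old + hz * hh               # interpolate old state and candidate

With this forward form, the three dh_old terms in the backward pass are the three paths the previous hidden state takes into h: directly through (1 - hz), through the candidate via hr * h_old, and through the gate inputs, which is also why the [:, :self.H] slices pick out the h_old columns.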
Example #3
    def backward(self, y_pred, y_train, cache):
        X, h1_cache, h3_cache, score_cache, hpool_cache, hpool, nl_cache1, nl_cache3 = cache

        # Output layer
        grad_y = self.dloss_funs[self.loss](y_pred, y_train)

        # FC-7
        dh3, dW3, db3 = l.fc_backward(grad_y, score_cache)
        dh3 = self.backward_nonlin(dh3, nl_cache3)

        dh2, dW2, db2 = l.fc_backward(dh3, h3_cache)
        dh2 = dh2.ravel().reshape(hpool.shape)

        # Pool-1
        dpool = l.maxpool_backward(dh2, hpool_cache)

        # Conv-1
        dh1 = self.backward_nonlin(dpool, nl_cache1)
        dX, dW1, db1 = l.conv_backward(dh1, h1_cache)

        grad = dict(
            W1=dW1, W2=dW2, W3=dW3, b1=db1, b2=db2, b3=db3
        )

        return grad
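
The only shape bookkeeping in this backward pass is the flatten/unflatten around the fully connected layers, i.e. the dh2.ravel().reshape(hpool.shape) line. A tiny sketch of that step in isolation; the batch size and feature-map shape are made-up values for illustration.

import numpy as np

# Assumed pooled activations: batch of 8, 16 channels, 7x7 spatial
hpool = np.random.randn(8, 16, 7, 7)
h_flat = hpool.reshape(hpool.shape[0], -1)   # (8, 784) fed into the fc layer

# The gradient returned by fc_backward has the flattened shape...
dh2 = np.random.randn(*h_flat.shape)

# ...and must be restored to the pooled shape before maxpool_backward
dh2 = dh2.ravel().reshape(hpool.shape)
assert dh2.shape == hpool.shape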
Example #4
    def backward(self, y_pred, y_train, cache):
        (X, h1_cache, h2_cache, h4_cache, h5_cache, score_cache,
         hpool1_cache, hpool1, hpool2_cache, hpool2,
         nl_cache1, nl_cache2, nl_cache4, nl_cache5,
         bn4_cache, bn5_cache) = cache

        # Output layer
        grad_y = self.dloss_funs[self.loss](y_pred, y_train)
        dh5, dW6, db6 = l.fc_backward(grad_y, score_cache)

        # FC-2
        dh5 = self.backward_nonlin(dh5, nl_cache5)
        dh5, dgamma5, dbeta5 = l.bn_backward(dh5, bn5_cache)
        dh4, dW5, db5 = l.fc_backward(dh5, h5_cache)

        # FC-1
        dh4 = self.backward_nonlin(dh4, nl_cache4)
        dh4, dgamma4, dbeta4 = l.bn_backward(dh4, bn4_cache)
        dhpool2_, dW4, db4 = l.fc_backward(dh4, h4_cache)

        # Reshape the flat fc gradient back to the pooled feature-map shape
        dhpool2 = dhpool2_.ravel().reshape(hpool2.shape)

        # Pool-2
        dpool2 = l.maxpool_backward(dhpool2, hpool2_cache)

        # Conv-2
        dh2 = self.backward_nonlin(dpool2, nl_cache2)
        dh1, dW2, db2 = l.conv_backward(dh2, h2_cache)

        # Pool-1
        dpool1 = l.maxpool_backward(dh1, hpool1_cache)

        # Conv-1
        dh1 = self.backward_nonlin(dpool1, nl_cache1)
        dX, dW1, db1 = l.conv_backward(dh1, h1_cache)

        grad = dict(W1=dW1, W2=dW2, W4=dW4, W5=dW5, W6=dW6,
                    b1=db1, b2=db2, b4=db4, b5=db5, b6=db6,
                    gamma4=dgamma4, gamma5=dgamma5,
                    beta4=dbeta4, beta5=dbeta5)

        return grad
Example #5
    def backward(self, y_pred, y_train, cache):
        # Full cache layout when the (disabled) dropout/batchnorm path is used:
        # X, h1_cache, h3_cache, score_cache, hpool_cache, hpool, nl_cache1, nl_cache3, u1, u2, u3, bn1_cache, pool_cache, bn3_cache = cache
        X, h1_cache, h3_cache, score_cache, hpool_cache, hpool, nl_cache1, nl_cache3 = cache

        # Output layer
        grad_y = self.dloss_funs[self.loss](y_pred, y_train)

        # FC-7
        dh3, dW3, db3 = l.fc_backward(grad_y, score_cache)
        # dW3 += reg.dl2_reg(self.model['W3'], self.lam)
        dh3 = self.backward_nonlin(dh3, nl_cache3)
        # dh3 = l.dropout_backward(dh3, u3)
        # dh3, dgamma3, dbeta3 = l.bn_backward(dh3, bn3_cache)

        dh2, dW2, db2 = l.fc_backward(dh3, h3_cache)
        # dh2 = l.dropout_backward(dh2, u2)
        dh2 = dh2.ravel().reshape(hpool.shape)

        # Pool-1
        # dpool, dgamma2, dbeta2 = l.conv_bn_backward(dh2, pool_cache)
        dpool = l.maxpool_backward(dh2, hpool_cache)

        # Conv-1
        dh1 = self.backward_nonlin(dpool, nl_cache1)
        # dh1 = l.dropout_backward(dh1, u1)
        # dh1, dgamma1, dbeta1 = l.conv_bn_backward(dh1, bn1_cache)
        dX, dW1, db1 = l.conv_backward(dh1, h1_cache)
        # dW1 += reg.dl2_reg(self.model['W1'], self.lam)

        # With batchnorm enabled, grad would also carry the gamma/beta gradients.
        grad = dict(
            W1=dW1, W2=dW2, W3=dW3, b1=db1, b2=db2, b3=db3
        )

        return grad
Example #6
    def backward(self, y_pred, y_train, d_next, cache):
        X, hf, hi, ho, hc, hf_cache, hf_sigm_cache, hi_cache, hi_sigm_cache, ho_cache, ho_sigm_cache, hc_cache, hc_tanh_cache, c_old, c, c_tanh_cache, y_cache = cache
        dh_next, dc_next = d_next

        # Cross-entropy/softmax gradient at the output
        dy = loss_fun.dcross_entropy(y_pred, y_train)

        # Hidden-to-output layer, plus the gradient flowing back from the next time step
        dh, dWy, dby = l.fc_backward(dy, y_cache)
        dh += dh_next

        # Output gate: h = ho * c, where the cached c is the tanh-squashed cell state
        dho = c * dh
        dho = l.sigmoid_backward(dho, ho_sigm_cache)

        # Cell state: back through the tanh, then add the gradient from the next time step
        dc = ho * dh
        dc = l.tanh_backward(dc, c_tanh_cache)
        dc = dc + dc_next

        # Forget gate: c = hf * c_old + hi * hc before the tanh
        dhf = c_old * dc
        dhf = l.sigmoid_backward(dhf, hf_sigm_cache)

        # Input gate
        dhi = hc * dc
        dhi = l.sigmoid_backward(dhi, hi_sigm_cache)

        # Candidate cell state
        dhc = hi * dc
        dhc = l.tanh_backward(dhc, hc_tanh_cache)

        # Each gate's fully connected layer sees X = [h_old, x]
        dXo, dWo, dbo = l.fc_backward(dho, ho_cache)
        dXc, dWc, dbc = l.fc_backward(dhc, hc_cache)
        dXi, dWi, dbi = l.fc_backward(dhi, hi_cache)
        dXf, dWf, dbf = l.fc_backward(dhf, hf_cache)

        # Gradients carried to the previous time step
        dX = dXo + dXc + dXi + dXf
        dh_next = dX[:, :self.H]
        dc_next = hf * dc

        grad = dict(Wf=dWf,
                    Wi=dWi,
                    Wc=dWc,
                    Wo=dWo,
                    Wy=dWy,
                    bf=dbf,
                    bi=dbi,
                    bc=dbc,
                    bo=dbo,
                    by=dby)

        return grad, (dh_next, dc_next)
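
As with the GRU example, these gradients mirror a single LSTM step of roughly the following shape. The sketch below is a NumPy illustration under assumed sizes, not the project's forward pass; note that the cached c is already the tanh-squashed cell state, which is why dho = c * dh above.

import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

H, D = 4, 3                                   # assumed hidden and input sizes
x_t = np.random.randn(1, D)
h_old, c_old = np.zeros((1, H)), np.zeros((1, H))

Wf, Wi, Wc, Wo = (np.random.randn(H + D, H) * 0.01 for _ in range(4))
bf, bi, bc, bo = (np.zeros((1, H)) for _ in range(4))

X = np.column_stack((h_old, x_t))             # all four gates see [h_old, x_t]
hf = sigmoid(X @ Wf + bf)                     # forget gate
hi = sigmoid(X @ Wi + bi)                     # input gate
ho = sigmoid(X @ Wo + bo)                     # output gate
hc = np.tanh(X @ Wc + bc)                     # candidate cell state

c = np.tanh(hf * c_old + hi * hc)             # squashed cell state (the cached c)
h = ho * c                                    # new hidden state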
Example #7
    def backward(self, y_pred, y_train, cache, iter):
        num_layers = self.num_layers
        # Output layer
        grad_y = self.dloss_funs[self.loss](y_pred, y_train)

        # Final classification layer (Wf)
        dh, dW, db = l.fc_backward(grad_y, cache['score_cache'])
        grad = dict()
        grad['Wf'] = dW + reg.dl2_reg(self.model['Wf'], self.lam)
        grad['bf'] = db

        dprevH = 0
        for i in range(num_layers, 0, -1):
            if self.leapfrog:
                dh, dprevH, dW, db = l.leap_backward(
                    dh, dprevH, cache['h_cache' + str(i)],
                    cache['nl_cache' + str(i)], i == num_layers, self.hypo)
            else:
                dh, dW, db = l.fcrelu_backward(
                    dh,
                    cache['h_cache' + str(i)],
                    cache['nl_cache' + str(i)],
                    antisymmetric=self.antisymmetric,
                    hypo=self.hypo)
            if not self.antisymmetric and not self.leapfrog:
                dW += reg.dl2_reg(self.model['W' + str(i)], self.lam)
            grad['W' + str(i)] = dW
            grad['b' + str(i)] = db
        if self.doDropout:
            dh = l.dropout_backward(dh, cache['u1'])
        dh, dW, db = l.fcrelu_backward(dh,
                                       cache['h_caches'],
                                       cache['nl_caches'],
                                       antisymmetric=self.antisymmetric,
                                       hypo=self.hypo)
        grad['Ws'] = dW + reg.dl2_reg(self.model['Ws'], self.lam)
        grad['bs'] = db
        #dh, dW, db = l.conv_backward(dh, cache['c_cache'])
        #grad['Wc'] = dW
        #grad['bs'] = db
        if self.freezeLastLayer or self.freezeClassificationLayer:
            grad['Wf'] = 0
            grad['bf'] = 0
        if self.weights_fixed:
            grad['Wf'] = 0
            grad['bf'] = 0
            grad['Ws'] = 0
            grad['bs'] = 0

        return grad
Example #8
    def backward(self, y_pred, y_train, d_next, cache):
        X, hf, hi, ho, hc, hf_cache, hf_sigm_cache, hi_cache, hi_sigm_cache, ho_cache, ho_sigm_cache, hc_cache, hc_tanh_cache, c_old, c, c_tanh_cache, y_cache = cache
        dh_next, dc_next = d_next

        dy = loss_fun.dcross_entropy(y_pred, y_train)

        dh, dWy, dby = l.fc_backward(dy, y_cache)
        dh += dh_next

        dho = c * dh
        dho = l.sigmoid_backward(dho, ho_sigm_cache)

        dc = ho * dh
        dc = l.tanh_backward(dc, c_tanh_cache)
        dc = dc + dc_next

        dhf = c_old * dc
        dhf = l.sigmoid_backward(dhf, hf_sigm_cache)

        dhi = hc * dc
        dhi = l.sigmoid_backward(dhi, hi_sigm_cache)

        dhc = hi * dc
        dhc = l.tanh_backward(dhc, hc_tanh_cache)

        dXo, dWo, dbo = l.fc_backward(dho, ho_cache)
        dXc, dWc, dbc = l.fc_backward(dhc, hc_cache)
        dXi, dWi, dbi = l.fc_backward(dhi, hi_cache)
        dXf, dWf, dbf = l.fc_backward(dhf, hf_cache)

        dX = dXo + dXc + dXi + dXf
        dh_next = dX[:, :self.H]
        dc_next = hf * dc

        grad = dict(Wf=dWf, Wi=dWi, Wc=dWc, Wo=dWo, Wy=dWy, bf=dbf, bi=dbi, bc=dbc, bo=dbo, by=dby)

        return grad, (dh_next, dc_next)
Example #9
    def backward(self, y_pred, y_train, dh_next, cache):
        X, Whh, h, hprev, y, h_cache, y_cache = cache

        # Softmax gradient
        dy = loss_fun.dcross_entropy(y_pred, y_train)

        # Hidden to output gradient
        dh, dWhy, dby = l.fc_backward(dy, y_cache)
        dh += dh_next
        dby = dby.reshape((1, -1))

        # tanh
        dh = l.tanh_backward(dh, h_cache)

        # Hidden gradient
        dbh = dh
        dWhh = hprev.T @ dh
        dWxh = X.T @ dh
        dh_next = dh @ Whh.T

        grad = dict(Wxh=dWxh, Whh=dWhh, Why=dWhy, bh=dbh, by=dby)

        return grad, dh_next
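
For reference, the vanilla RNN step these gradients invert is a single affine map followed by tanh, plus a fully connected output layer. A minimal NumPy sketch with assumed sizes:

import numpy as np

H, D, V = 4, 3, 5                         # assumed hidden, input, and output sizes
X = np.random.randn(1, D)
hprev = np.zeros((1, H))

Wxh = np.random.randn(D, H) * 0.01
Whh = np.random.randn(H, H) * 0.01
Why = np.random.randn(H, V) * 0.01
bh, by = np.zeros((1, H)), np.zeros((1, V))

h = np.tanh(X @ Wxh + hprev @ Whh + bh)   # hidden state update
y = h @ Why + by                          # scores fed to the softmax loss

From h = np.tanh(X @ Wxh + hprev @ Whh + bh), the products hprev.T @ dh, X.T @ dh, and dh @ Whh.T in the backward code are exactly the gradients with respect to Whh, Wxh, and hprev.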