Example #1
File: forward.py Project: idfah/cebl
    def gradient(self, x, g, returnError=True):
        x = np.asarray(x)
        g = np.asarray(g)

        if self.flattenOut:
            g = g.ravel()

        # packed views of the hidden and visible gradient matrices
        views = util.packedViews(self.layerDims, dtype=self.dtype)
        pg = views[0]
        hgs = views[1:-1]
        vg = views[-1]

        # forward pass
        z1 = util.bias(x)
        z1s = [z1]
        zPrimes = []
        for hw, phi in zip(self.hws, self.transFunc):
            h = z1.dot(hw)

            z1 = util.bias(phi(h))
            z1s.append(z1)

            zPrime = phi(h, 1)
            zPrimes.append(zPrime)

        y = z1.dot(self.vw)

        if self.flattenOut:
            y = y.ravel()

        # error components
        e = util.colmat(y - g)
        delta = np.sign(e) / e.size

        # visible layer gradient
        vg[...] = z1.T.dot(delta)
        vg += self.penaltyGradient(-1)

        # backward pass for hidden layers
        w = self.vw
        for l in range(self.nHLayers - 1, -1, -1):
            delta = delta.dot(w[:-1, :].T) * zPrimes[l]
            hgs[l][...] = z1s[l].T.dot(delta)
            hgs[l] += self.penaltyGradient(l)
            w = self.hws[l]

        if returnError:
            error = np.mean(np.abs(e)) + self.penaltyError()
            return error, pg
        else:
            return pg
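
The packed gradient can be checked against a finite-difference estimate. The sketch below is a minimal check, not part of the library; it assumes a constructed network object net exposing the pw packed-weight view from the constructor in Example #10 together with the gradient method above, and it is only practical for small networks. Because the mean absolute error is not differentiable where e is exactly zero, small disagreements near those kinks are expected.

import numpy as np

def check_gradient(net, x, g, eps=1e-6):
    # analytic gradient of the packed weights
    _, pg = net.gradient(x, g, returnError=True)

    # central-difference estimate, perturbing the packed weight view in place
    numeric = np.empty_like(pg)
    for i in range(net.pw.size):
        orig = net.pw.flat[i]
        net.pw.flat[i] = orig + eps
        errPlus, _ = net.gradient(x, g, returnError=True)
        net.pw.flat[i] = orig - eps
        errMinus, _ = net.gradient(x, g, returnError=True)
        net.pw.flat[i] = orig
        numeric[i] = (errPlus - errMinus) / (2.0 * eps)

    # largest absolute disagreement between analytic and numeric gradients
    return np.max(np.abs(numeric - pg))
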
Example #2
    def __init__(
            self,
            x,
            g,
            nHidden=10,
            transient=0,
            phi=transfer.tanh,
            #iwInitFunc=pinit.lecun, rwInitFunc=pinit.lecun,
            hwInitFunc=pinit.esp,
            vwInitFunc=pinit.lecun,
            optimFunc=optim.scg,
            **kwargs):
        x = util.segmat(x)
        g = util.segmat(g)  # flattenOut? XXX - idfah

        self.dtype = np.result_type(x.dtype, g.dtype)

        Regression.__init__(self, x.shape[2], g.shape[2])
        optim.Optable.__init__(self)

        self.nHidden = nHidden
        self.transient = transient
        self.phi = phi

        self.pw, self.hw, self.vw = \
            util.packedViews(((self.nIn+self.nHidden+1, self.nHidden),
                              (self.nHidden+1, self.nOut)),
                             dtype=self.dtype)

        self.iw = self.hw[:(self.nIn + 1)]
        self.rw = self.hw[(self.nIn + 1):]

        # initialize weights
        #self.iw[...] = iwInitFunc(self.iw.shape).astype(self.dtype, copy=False)
        #self.rw[...] = rwInitFunc(self.rw.shape).astype(self.dtype, copy=False)
        self.hw[...] = hwInitFunc(self.hw.shape).astype(self.dtype, copy=False)
        self.vw[...] = vwInitFunc(self.vw.shape).astype(self.dtype, copy=False)

        # train the network
        if optimFunc is not None:
            self.train(x, g, optimFunc, **kwargs)
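
A hypothetical construction sketch for the recurrent regressor above. RecurrentRegressionNet is only a placeholder for whatever class defines this __init__, and the segmented data shape (nSeg, nObs, nDim) is inferred from util.segmat and the use of x.shape[2]; extra keyword arguments are forwarded to optimFunc by self.train.

import numpy as np

# 4 segments of 200 time steps: 3 input channels, 1 target channel
x = np.random.randn(4, 200, 3).astype(np.float32)
g = np.random.randn(4, 200, 1).astype(np.float32)

# placeholder class name; the first `transient` steps are excluded from the error
net = RecurrentRegressionNet(x, g, nHidden=10, transient=10)
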
Example #3
    def gradient(self, x, g, unrollSteps=10, returnError=True):
        x = util.segmat(x)
        g = util.segmat(g)

        # packed views of the hidden and visible gradient matrices
        pg, hg, vg = util.packedViews((self.hw.shape, self.vw.shape),
                                      dtype=self.dtype)

        x1 = util.bias(x)

        nSeg = x1.shape[0]
        nObs = x1.shape[1]
        nIn1 = x1.shape[2]

        h = np.empty((nSeg, nObs, self.nHidden), dtype=self.dtype)
        r = np.empty((nSeg, nObs, self.nHidden), dtype=self.dtype)
        x1c = np.empty((nSeg, nObs, nIn1 + self.nHidden), dtype=self.dtype)
        context = np.zeros((nSeg, self.nHidden), dtype=self.dtype)

        for t in range(nObs):
            x1c[:, t, :nIn1] = x1[:, t]
            x1c[:, t, nIn1:] = context

            h[:, t] = x1c[:, t].dot(self.hw)
            r[:, t] = self.phi(h[:, t])
            context[...] = r[:, t]

        r1 = util.bias(r)
        y = r1.dot(self.vw)
        rPrime = self.phi(h, 1)

        # error components, ditch transient
        e = (y - g)[:, self.transient:]
        delta = np.zeros(g.shape, dtype=self.dtype)
        delta[:, self.transient:] = 2.0 * e / e.size

        # visible layer gradient
        r1f = r1.reshape((-1, r1.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        vg[...] = r1f.T.dot(deltaf)

        vwDelta = delta.dot(self.vw[:-1].T)

        gamma = np.zeros((nSeg, unrollSteps, self.nHidden), dtype=self.dtype)
        delta = np.zeros((nSeg, nObs, self.nHidden), dtype=self.dtype)

        # backward pass for hidden layer, unrolled through time
        for t in range(nObs - 1, 0, -1):
            rPrimet = rPrime[:, t][:, None, :]

            beta = gamma[:, :-1]
            beta = beta.dot(self.rw.T)

            gamma[:, 0] = vwDelta[:, t]
            gamma[:, 1:] = beta
            gamma *= rPrimet

            delta[:, t] = gamma.sum(axis=1)

        x1cf = x1c.reshape((-1, x1c.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        hg[...] = x1cf.T.dot(deltaf)

        if returnError:
            return np.mean(e**2), pg
        else:
            return pg
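
The gamma buffer above holds, for each time step, the visible-layer delta injected at that step together with contributions propagated back from up to unrollSteps-1 later steps through the recurrent weights, so unrollSteps bounds how far credit is carried back in time (truncated backpropagation through time). A hypothetical call, assuming a fitted net as sketched after Example #2:

# shallow truncation: cheaper, but the gradient ignores long-range dependencies
errShort, pgShort = net.gradient(x, g, unrollSteps=5)

# deeper truncation: closer to full backpropagation through time, more costly
errLong, pgLong = net.gradient(x, g, unrollSteps=50)
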
Example #4
File: convreg.py Project: idfah/cebl
    def __init__(self,
                 x,
                 g,
                 convs=((8, 16), (16, 8)),
                 nHidden=None,
                 transFunc=transfer.lecun,
                 weightInitFunc=pinit.lecun,
                 penalty=None,
                 elastic=1.0,
                 optimFunc=optim.scg,
                 **kwargs):
        x = util.segmat(x)
        g = util.segmat(g)
        self.dtype = np.result_type(x.dtype, g.dtype)

        Regression.__init__(self, x.shape[2], g.shape[2])
        optim.Optable.__init__(self)

        self.nConvHiddens, self.convWidths = zip(*convs)
        self.nConvLayers = len(convs)
        self.nHidden = nHidden

        self.layerDims = [
            (self.nIn * self.convWidths[0] + 1, self.nConvHiddens[0]),
        ]
        for l in range(1, self.nConvLayers):
            ni = self.nConvHiddens[l - 1] * self.convWidths[l] + 1
            no = self.nConvHiddens[l]
            self.layerDims.append((ni, no))

        if self.nHidden is None:
            self.layerDims.append((self.nConvHiddens[-1] + 1, self.nOut))
        else:
            self.layerDims.append((self.nConvHiddens[-1] + 1, self.nHidden))
            self.layerDims.append((self.nHidden + 1, self.nOut))

        self.transFunc = transFunc if util.isiterable(transFunc) \
                else (transFunc,) * (len(self.layerDims)-1)
        assert len(self.transFunc) == (len(self.layerDims) - 1)

        views = util.packedViews(self.layerDims, dtype=self.dtype)
        self.pw = views[0]

        if self.nHidden is None:
            self.cws = views[1:-1]
            self.hw = None
            self.vw = views[-1]
        else:
            self.cws = views[1:-2]
            self.hw = views[-2]
            self.vw = views[-1]

        if not util.isiterable(weightInitFunc):
            weightInitFunc = (weightInitFunc, ) * (self.nConvLayers + 2)
        assert len(weightInitFunc) == (len(self.cws) + 2)

        self.penalty = penalty
        if self.penalty is not None:
            if not util.isiterable(self.penalty):
                self.penalty = (self.penalty, ) * (self.nConvLayers + 2)
        assert (self.penalty is None) or (len(self.penalty)
                                          == (len(self.cws) + 2))

        self.elastic = elastic if util.isiterable(elastic) \
                else (elastic,) * (self.nConvLayers+2)
        assert (len(self.elastic) == (len(self.cws) + 2))

        # initialize weights
        for cw, wif in zip(self.cws, weightInitFunc):
            cw[...] = wif(cw.shape).astype(self.dtype, copy=False)

        if self.nHidden is not None:
            self.hw[...] = weightInitFunc[-2](self.hw.shape).astype(self.dtype,
                                                                    copy=False)

        self.vw[...] = weightInitFunc[-1](self.vw.shape).astype(self.dtype,
                                                                copy=False)

        # train the network
        if optimFunc is not None:
            self.train(x, g, optimFunc, **kwargs)
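
For illustration, the default convs=((8, 16), (16, 8)) unpacks via zip to nConvHiddens=(8, 16) and convWidths=(16, 8). With a hypothetical nIn=3, nOut=2 and nHidden=None, the layer dimensions built above work out to:

# (nIn * convWidths[0] + 1, nConvHiddens[0])              -> (3*16 + 1, 8)  = (49, 8)
# (nConvHiddens[0] * convWidths[1] + 1, nConvHiddens[1])  -> (8*8 + 1, 16)  = (65, 16)
# (nConvHiddens[-1] + 1, nOut)                            -> (16 + 1, 2)    = (17, 2)
layerDims = [(49, 8), (65, 16), (17, 2)]
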
Example #5
File: convreg.py Project: idfah/cebl
    def gradient(self, x, g, returnError=True):
        x = util.segmat(x)
        g = util.colmat(g)

        # packed views of the hidden and visible gradient matrices
        views = util.packedViews(self.layerDims, dtype=self.dtype)
        pg = views[0]

        if self.nHidden is None:
            cgs = views[1:-1]
            hg = None
            vg = views[-1]
        else:
            cgs = views[1:-2]
            hg = views[-2]
            vg = views[-1]

        # forward pass
        c = x
        c1s = []
        cPrimes = []
        for l, cw in enumerate(self.cws):
            width = self.convWidths[l]
            phi = self.transFunc[l]

            c = util.timeEmbed(c, lags=width - 1, axis=1)

            c1 = util.bias(c)
            c1s.append(c1)

            h = util.segdot(c1, cw)
            cPrime = phi(h, 1)
            cPrimes.append(cPrime)

            c = phi(h)

        c1 = util.bias(c)

        # evaluate hidden and visible layers
        if self.nHidden is None:
            y = util.segdot(c1, self.vw)
        else:
            h = util.segdot(c1, self.hw)
            z1 = util.bias(self.transFunc[-1](h))
            zPrime = self.transFunc[-1](h, 1)
            y = util.segdot(z1, self.vw)

        # trim the targets to align with outputs shortened by the convolutions
        trim = (g.shape[1] - y.shape[1]) // 2
        gTrim = g[:, :(g.shape[1] - trim)]
        gTrim = gTrim[:, -y.shape[1]:]

        # error components
        e = util.colmat(y - gTrim)
        delta = 2.0 * e / e.size

        if self.nHidden is None:
            # visible layer gradient
            c1f = c1.reshape((-1, c1.shape[-1]))
            deltaf = delta.reshape((-1, delta.shape[-1]))
            vg[...] = c1f.T.dot(deltaf)
            vg += self.penaltyGradient(-1)

            delta = util.segdot(delta, self.vw[:-1].T)

        else:
            # visible layer gradient
            z1f = z1.reshape((-1, z1.shape[-1]))
            deltaf = delta.reshape((-1, delta.shape[-1]))
            vg[...] = z1f.T.dot(deltaf)
            vg += self.penaltyGradient(-1)

            # hidden layer gradient
            c1f = c1.reshape((-1, c1.shape[-1]))
            delta = util.segdot(delta, self.vw[:-1].T) * zPrime
            deltaf = delta.reshape((-1, delta.shape[-1]))
            hg[...] = c1f.T.dot(deltaf)
            hg += self.penaltyGradient(-2)

            delta = util.segdot(delta, self.hw[:-1].T)

        # backward pass for convolutional layers
        for l in range(self.nConvLayers - 1, -1, -1):
            c1 = c1s[l]
            cPrime = cPrimes[l]

            delta = delta[:, :cPrime.shape[1]] * cPrime

            c1f = c1.reshape((-1, c1.shape[-1]))
            deltaf = delta.reshape((-1, delta.shape[-1]))
            cgs[l][...] = c1f.T.dot(deltaf)
            cgs[l] += self.penaltyGradient(l)

            if l > 0:  # won't propagate back to inputs
                delta = util.segdot(delta, self.cws[l][:-1].T)
                delta = deltaDeEmbedSum(delta, self.convWidths[l])

        if returnError:
            error = np.mean(e**2) + self.penaltyError()
            return error, pg
        else:
            return pg
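
Each convolutional layer shortens the time axis (timeEmbed with lags=width-1), so y has fewer observations than g and the trim logic above centers the targets on the remaining outputs. A hypothetical worked case:

# g.shape[1] = 100 observations, y.shape[1] = 86 after the convolutions
# trim = (100 - 86) // 2 = 7
# gTrim = g[:, :93][:, -86:]  ->  keeps time steps 7 .. 92 (86 of them)
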
Example #6
    def __init__(
            self,
            x,
            g,
            recs=(8, 4, 2),
            transient=0,
            phi=transfer.tanh,
            #iwInitFunc=pinit.lecun, rwInitFunc=pinit.lecun,
            hwInitFunc=pinit.esp,
            vwInitFunc=pinit.lecun,
            optimFunc=optim.scg,
            **kwargs):
        x = util.segmat(x)
        g = util.segmat(g)

        self.dtype = np.result_type(x.dtype, g.dtype)

        Regression.__init__(self, x.shape[2], g.shape[2])
        optim.Optable.__init__(self)

        self.transient = transient
        self.phi = phi

        self.nRecHiddens = list(recs)
        self.nRecLayers = len(self.nRecHiddens)

        self.layerDims = [(self.nIn + self.nRecHiddens[0] + 1,
                           self.nRecHiddens[0])]
        for l in range(1, self.nRecLayers):
            self.layerDims.append(
                (self.nRecHiddens[l - 1] + self.nRecHiddens[l] + 1,
                 self.nRecHiddens[l]))
        self.layerDims.append((self.nRecHiddens[-1] + 1, self.nOut))

        views = util.packedViews(self.layerDims, dtype=self.dtype)
        self.pw = views[0]
        self.hws = views[1:-1]
        self.vw = views[-1]

        self.iws = []
        self.rws = []
        nIn = self.nIn
        for l in range(self.nRecLayers):
            iw = self.hws[l][:(nIn + 1)]
            rw = self.hws[l][(nIn + 1):]
            self.iws.append(iw)
            self.rws.append(rw)

            #self.iws[l][...] = iwInitFunc(iw.shape).astype(self.dtype, copy=False)
            #self.rws[l][...] = rwInitFunc(rw.shape).astype(self.dtype, copy=False)

            nIn = self.nRecHiddens[l]

            self.hws[l][...] = hwInitFunc(self.hws[l].shape).astype(self.dtype,
                                                                    copy=False)

        self.vw[...] = vwInitFunc(self.vw.shape).astype(self.dtype, copy=False)

        # train the network
        if optimFunc is not None:
            self.train(x, g, optimFunc, **kwargs)
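
For illustration, with the default recs=(8, 4, 2) and a hypothetical nIn=3, nOut=1, each recurrent layer sees its inputs, its own context units and a bias, giving:

# layer 0:  (nIn + nRecHiddens[0] + 1, nRecHiddens[0])             -> (3 + 8 + 1, 8) = (12, 8)
# layer 1:  (nRecHiddens[0] + nRecHiddens[1] + 1, nRecHiddens[1])  -> (8 + 4 + 1, 4) = (13, 4)
# layer 2:  (nRecHiddens[1] + nRecHiddens[2] + 1, nRecHiddens[2])  -> (4 + 2 + 1, 2) = (7, 2)
# visible:  (nRecHiddens[-1] + 1, nOut)                            -> (2 + 1, 1)     = (3, 1)
layerDims = [(12, 8), (13, 4), (7, 2), (3, 1)]
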
Example #7
    def gradient(self, x, g, unrollSteps=10, returnError=True):
        x = util.segmat(x)
        g = util.segmat(g)

        if isinstance(unrollSteps, (int, )):
            unrollSteps = [
                unrollSteps,
            ] * self.nRecLayers

        views = util.packedViews(self.layerDims, dtype=self.dtype)
        pg = views[0]
        hgs = views[1:-1]
        vg = views[-1]

        x1 = util.bias(x)

        nSeg = x1.shape[0]
        nObs = x1.shape[1]

        r1Prev = x1
        r1cs = []
        rPrimes = []

        for l in range(self.nRecLayers):
            nIn1 = r1Prev.shape[2]

            r = np.empty((nSeg, nObs, self.nRecHiddens[l]), dtype=self.dtype)
            h = np.empty((nSeg, nObs, self.nRecHiddens[l]), dtype=self.dtype)
            r1c = np.empty((nSeg, nObs, nIn1 + self.nRecHiddens[l]),
                           dtype=self.dtype)
            context = np.zeros((nSeg, self.nRecHiddens[l]), dtype=self.dtype)

            for t in range(nObs):
                r1c[:, t, :nIn1] = r1Prev[:, t]
                r1c[:, t, nIn1:] = context

                h[:, t] = r1c[:, t].dot(self.hws[l])
                r[:, t] = self.phi(h[:, t])
                context[...] = r[:, t]

            r1Prev = util.bias(r)
            r1cs.append(r1c)

            rPrime = self.phi(h, 1)
            rPrimes.append(rPrime)

        # evaluate visible layer
        r1 = r1Prev
        y = r1.dot(self.vw)

        # error components, ditch transient
        e = (y - g)[:, self.transient:]
        delta = np.zeros(g.shape, dtype=self.dtype)
        delta[:, self.transient:] = 2.0 * e / e.size

        # visible layer gradient
        r1f = r1.reshape((-1, r1.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        vg[...] = r1f.T.dot(deltaf)

        # backward pass through each layer
        w = self.vw
        for l in range(self.nRecLayers - 1, -1, -1):
            r1c = r1cs[l]
            rwsTrans = self.rws[l].T
            rPrime = rPrimes[l]

            deltaPrev = delta.dot(w[:-1].T)

            gamma = np.zeros((nSeg, unrollSteps[l], self.nRecHiddens[l]),
                             dtype=self.dtype)
            delta = np.zeros((nSeg, nObs, self.nRecHiddens[l]),
                             dtype=self.dtype)

            # unrolled through time
            for t in range(nObs - 1, 0, -1):
                rPrimet = rPrime[:, t][:, None, :]

                beta = gamma[:, :-1]
                beta = beta.dot(rwsTrans)

                gamma[:, 0] = deltaPrev[:, t]
                gamma[:, 1:] = beta
                gamma *= rPrimet

                delta[:, t] = gamma.sum(axis=1)

            r1cf = r1c.reshape((-1, r1c.shape[-1]))
            deltaf = delta.reshape((-1, delta.shape[-1]))
            hgs[l][...] = r1cf.T.dot(deltaf)

            w = self.iws[l]

        if returnError:
            return np.mean(e**2), pg
        else:
            return pg
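
Because an integer unrollSteps is broadcast to every recurrent layer (see the isinstance check at the top of the method), a list can instead set a separate truncation depth per layer. A hypothetical call, assuming a fitted net built with the default recs=(8, 4, 2):

# one truncation depth per recurrent layer, deeper unrolling for the lower layers
err, pg = net.gradient(x, g, unrollSteps=[20, 10, 5])
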
Example #8
File: softmax.py Project: idfah/cebl
    def gradient(self, x, g, returnError=True):
        """Compute the gradient of the mean-squared error with respect to the
        network weights for each layer and given inputs and targets.  Useful
        for optimization routines that make use of first-order gradients.

        Args:
            x:              Input data.  A numpy array with shape
                            (nObs, nIn).

            g:              Target values.  A numpy array with shape
                            (nObs, nCls) containing class indicators,
                            e.g., one-hot rows.

            returnError:    If True (default) then also return the
                            cross-entropy error.  This can improve
                            performance in some optimization routines
                            by avoiding an additional forward pass.

        Returns:
            If returnError is True, then return a tuple containing
            the error followed by a 1d numpy array containing the
            gradient of the packed weights.  If returnError is False,
            then only return the gradient.
        """
        x = np.asarray(x)
        g = np.asarray(g)

        # packed views of the hidden and visible gradient matrices
        views = util.packedViews(self.layerDims, dtype=self.dtype)
        pg  = views[0]
        hgs = views[1:-1]
        vg  = views[-1]

        # forward pass
        z1 = util.bias(x)
        z1s = [z1]
        zPrimes = []
        for hw, phi in zip(self.hws, self.transFunc):
            h = z1.dot(hw)

            z1 = util.bias(phi(h))
            z1s.append(z1)

            zPrime = phi(h, 1)
            zPrimes.append(zPrime)

        v = z1.dot(self.vw)
        probs = util.softmax(v)

        # error components
        delta = util.colmat(probs - g) / probs.size

        # visible layer gradient
        vg[...] = z1.T.dot(delta)
        vg += self.penaltyGradient(-1)

        # backward pass for hidden layers
        w = self.vw
        for l in range(self.nHLayers-1, -1, -1):
            delta = delta.dot(w[:-1,:].T) * zPrimes[l]
            hgs[l][...] = z1s[l].T.dot(delta)
            hgs[l] += self.penaltyGradient(l)
            w = self.hws[l]

        if returnError:
            error = -np.mean(g*np.log(util.capZero(probs))) + self.penaltyError()
            return error, pg
        else:
            return pg
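
A minimal sketch of calling this gradient directly, assuming a network net already constructed as in Example #9 with nIn=4 and three classes; g holds one-hot rows, so probs - g is the usual softmax cross-entropy delta:

import numpy as np

x = np.random.randn(50, 4)                  # 50 observations, 4 inputs
labels = np.random.randint(0, 3, size=50)   # 3 classes
g = np.eye(3)[labels]                       # one-hot targets

error, pg = net.gradient(x, g)              # cross-entropy error and packed gradient
print(error, pg.shape)
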
Example #9
File: softmax.py Project: idfah/cebl
    def __init__(self, classData, nHidden=10, transFunc=transfer.lecun,
                 weightInitFunc=pinit.lecun, penalty=None, elastic=1.0,
                 optimFunc=optim.scg, **kwargs):
        """Construct a new feedforward neural network.

        Args:
            classData:      Training data.  This is a numpy array or list of numpy
                            arrays with shape (nCls,nObs[,nIn]).  If the
                            dimensions index is missing the data is assumed to be
                            one-dimensional.

            nHidden:        Number of hidden units.

            transFunc:      Hidden layer transfer function.  The default is
                            transfer.lecun.  See the transfer module for more.

            weightInitFunc: Function to initialize the weights in each layer.
                            If a single function is given, it will be repeated
                            for each layer with the visible layer last.  The
                            default function is the lecun function in the
                            paraminit module.  See the paraminit module
                            for more choices.

            penalty:        Penalty, i.e., weight decay.  The cost function
                            is then e + p * ||W|| where e is the prediction
                            error, p is the penalty and ||W|| denotes a norm
                            of the weight matrix.  This regularizes the
                            network by pulling the weights toward zero and
                            toward each other.

            elastic:        Mixing factor between the L1 and L2 norms used
                            in the penalty term:
                                1.0 is a pure L2 norm,
                                0.0 is a pure L1 norm,
                                values in between give an elastic net.

            optimFunc:      Function used to optimize the weight matrices.
                            If None, initial training will be skipped.
                            See ml.optim for some candidate optimization
                            functions.

            kwargs:         Additional arguments passed to optimFunc.

        Returns:
            A new, trained feedforward network.

        Refs:
            @incollection{lecun2012efficient,
              title={Efficient backprop},
              author={LeCun, Yann A and Bottou, L{\'e}on and Orr, Genevieve B and M{\"u}ller, Klaus-Robert},
              booktitle={Neural networks: Tricks of the trade},
              pages={9--48},
              year={2012},
              publisher={Springer}
            }
        """
        Classifier.__init__(self, util.colmat(classData[0]).shape[1],
                            len(classData))
        optim.Optable.__init__(self)

        self.dtype = np.result_type(*[cls.dtype for cls in classData])

        self.nHidden = nHidden if util.isiterable(nHidden) else (nHidden,)
        self.nHLayers = len(self.nHidden)

        self.layerDims = [(self.nIn+1, self.nHidden[0])]
        for l in range(1, self.nHLayers):
            self.layerDims.append((self.nHidden[l-1]+1, self.nHidden[l]))
        self.layerDims.append((self.nHidden[-1]+1, self.nCls))

        self.transFunc = transFunc if util.isiterable(transFunc) \
                else (transFunc,) * self.nHLayers
        assert len(self.transFunc) == self.nHLayers

        views = util.packedViews(self.layerDims, dtype=self.dtype)
        self.pw  = views[0]
        self.hws = views[1:-1]
        self.vw  = views[-1]

        if not util.isiterable(weightInitFunc):
            weightInitFunc = (weightInitFunc,) * (self.nHLayers+1)
        assert len(weightInitFunc) == (len(self.hws) + 1)

        # initialize weights
        for hw, wif in zip(self.hws, weightInitFunc):
            hw[...] = wif(hw.shape).astype(self.dtype, copy=False)
        self.vw[...] = weightInitFunc[-1](self.vw.shape).astype(self.dtype, copy=False)

        self.penalty = penalty
        if self.penalty is not None:
            if not util.isiterable(self.penalty):
                self.penalty = (self.penalty,) * (self.nHLayers+1)
        assert (self.penalty is None) or (len(self.penalty) == (len(self.hws) + 1))

        self.elastic = elastic if util.isiterable(elastic) \
                else (elastic,) * (self.nHLayers+1)
        assert (len(self.elastic) == (len(self.hws) + 1))

        # train the network
        if optimFunc is not None:
            self.train(classData, optimFunc, **kwargs)
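
A hypothetical construction following the classData description above; SoftmaxNetwork is only a placeholder for the class defining this constructor, and the two Gaussian blobs are illustrative data:

import numpy as np

# two classes, 100 observations each, 4 inputs
classData = [np.random.randn(100, 4) + 1.0,
             np.random.randn(100, 4) - 1.0]

# a tuple nHidden gives two hidden layers; scalar penalty and elastic are broadcast
net = SoftmaxNetwork(classData, nHidden=(16, 8), penalty=1e-3, elastic=0.5)
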
Example #10
File: forward.py Project: idfah/cebl
    def __init__(self,
                 x,
                 g,
                 nHidden=10,
                 transFunc=transfer.lecun,
                 weightInitFunc=pinit.lecun,
                 penalty=None,
                 elastic=1.0,
                 optimFunc=optim.scg,
                 **kwargs):
        x = np.asarray(x)
        g = np.asarray(g)
        self.dtype = np.result_type(x.dtype, g.dtype)

        self.flattenOut = False if g.ndim > 1 else True

        Regression.__init__(self,
                            util.colmat(x).shape[1],
                            util.colmat(g).shape[1])
        optim.Optable.__init__(self)

        self.nHidden = nHidden if util.isiterable(nHidden) else (nHidden, )
        self.nHLayers = len(self.nHidden)

        self.layerDims = [(self.nIn + 1, self.nHidden[0])]
        for l in range(1, self.nHLayers):
            self.layerDims.append((self.nHidden[l - 1] + 1, self.nHidden[l]))
        self.layerDims.append((self.nHidden[-1] + 1, self.nOut))

        self.transFunc = transFunc if util.isiterable(transFunc) \
                else (transFunc,) * self.nHLayers
        assert len(self.transFunc) == self.nHLayers

        views = util.packedViews(self.layerDims, dtype=self.dtype)
        self.pw = views[0]
        self.hws = views[1:-1]
        self.vw = views[-1]

        if not util.isiterable(weightInitFunc):
            weightInitFunc = (weightInitFunc, ) * (self.nHLayers + 1)
        assert len(weightInitFunc) == (len(self.hws) + 1)

        # initialize weights
        for hw, wif in zip(self.hws, weightInitFunc):
            hw[...] = wif(hw.shape).astype(self.dtype, copy=False)
        self.vw[...] = weightInitFunc[-1](self.vw.shape).astype(self.dtype,
                                                                copy=False)

        self.penalty = penalty
        if self.penalty is not None:
            if not util.isiterable(self.penalty):
                self.penalty = (self.penalty, ) * (self.nHLayers + 1)
        assert (self.penalty is None) or (len(self.penalty)
                                          == (len(self.hws) + 1))

        self.elastic = elastic if util.isiterable(elastic) \
                else (elastic,) * (self.nHLayers+1)
        assert (len(self.elastic) == (len(self.hws) + 1))

        # train the network
        if optimFunc is not None:
            self.train(x, g, optimFunc, **kwargs)
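
A hypothetical usage sketch; ForwardRegressionNet is only a placeholder for the class defining this __init__. With a 1d target vector, flattenOut is set so the targets and outputs are treated as flat arrays:

import numpy as np

x = np.random.randn(200, 3)                         # 200 observations, 3 inputs
g = np.sin(x[:, 0]) + 0.1 * np.random.randn(200)    # 1d targets -> flattenOut is True

# training runs in the constructor via optim.scg by default
net = ForwardRegressionNet(x, g, nHidden=(20,))

# assuming the gradient method in Example #1 belongs to the same class,
# it can then be called on the same data
error, pg = net.gradient(x, g)
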