def gradient(self, x, g, returnError=True):
    x = np.asarray(x)
    g = np.asarray(g)

    if self.flattenOut:
        g = g.ravel()

    # packed views of the hidden and visible gradient matrices
    views = util.packedViews(self.layerDims, dtype=self.dtype)
    pg = views[0]
    hgs = views[1:-1]
    vg = views[-1]

    # forward pass
    z1 = util.bias(x)
    z1s = [z1]
    zPrimes = []
    for hw, phi in zip(self.hws, self.transFunc):
        h = z1.dot(hw)
        z1 = util.bias(phi(h))
        z1s.append(z1)

        zPrime = phi(h, 1)
        zPrimes.append(zPrime)

    y = z1.dot(self.vw)
    if self.flattenOut:
        y = y.ravel()

    # error components
    e = util.colmat(y - g)
    delta = np.sign(e) / e.size

    # visible layer gradient
    vg[...] = z1.T.dot(delta)
    vg += self.penaltyGradient(-1)

    # backward pass for hidden layers
    w = self.vw
    for l in range(self.nHLayers - 1, -1, -1):
        delta = delta.dot(w[:-1, :].T) * zPrimes[l]

        hgs[l][...] = z1s[l].T.dot(delta)
        hgs[l] += self.penaltyGradient(l)

        w = self.hws[l]

    if returnError:
        error = np.mean(np.abs(e)) + self.penaltyError()
        return error, pg
    else:
        return pg
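# A common sanity test for hand-derived gradients like the one above is a
# central-difference check against the packed gradient. The helper below is a
# minimal sketch, not part of the library: it assumes only what the code above
# shows, namely that net.pw is the packed weight vector and that
# net.gradient(x, g) returns (error, packed gradient); net, x, and g are
# placeholders you would supply.
import numpy as np

def check_gradient(net, x, g, eps=1e-6, nCheck=20):
    """Compare net.gradient against a central-difference estimate on a few weights."""
    w0 = net.pw.copy()
    _, analytic = net.gradient(x, g)

    rng = np.random.default_rng(0)
    idx = rng.choice(w0.size, size=min(nCheck, w0.size), replace=False)

    numeric = np.zeros(idx.size)
    for k, i in enumerate(idx):
        net.pw[...] = w0
        net.pw[i] += eps
        errPlus, _ = net.gradient(x, g)

        net.pw[...] = w0
        net.pw[i] -= eps
        errMinus, _ = net.gradient(x, g)

        numeric[k] = (errPlus - errMinus) / (2.0 * eps)

    net.pw[...] = w0  # restore the original weights
    return np.max(np.abs(analytic[idx] - numeric))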
def __init__(self, x, g, nHidden=10, transient=0, phi=transfer.tanh,
             #iwInitFunc=pinit.lecun, rwInitFunc=pinit.lecun,
             hwInitFunc=pinit.esp, vwInitFunc=pinit.lecun,
             optimFunc=optim.scg, **kwargs):
    x = util.segmat(x)
    g = util.segmat(g)

    # flattenOut? XXX - idfah

    self.dtype = np.result_type(x.dtype, g.dtype)

    Regression.__init__(self, x.shape[2], g.shape[2])
    optim.Optable.__init__(self)

    self.nHidden = nHidden
    self.transient = transient
    self.phi = phi

    self.pw, self.hw, self.vw = \
        util.packedViews(((self.nIn + self.nHidden + 1, self.nHidden),
                          (self.nHidden + 1, self.nOut)),
                         dtype=self.dtype)

    self.iw = self.hw[:(self.nIn + 1)]
    self.rw = self.hw[(self.nIn + 1):]

    # initialize weights
    #self.iw[...] = iwInitFunc(self.iw.shape).astype(self.dtype, copy=False)
    #self.rw[...] = rwInitFunc(self.rw.shape).astype(self.dtype, copy=False)
    self.hw[...] = hwInitFunc(self.hw.shape).astype(self.dtype, copy=False)
    self.vw[...] = vwInitFunc(self.vw.shape).astype(self.dtype, copy=False)

    # train the network
    if optimFunc is not None:
        self.train(x, g, optimFunc, **kwargs)
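# The constructor above carves the hidden weights hw into input rows (iw) and
# recurrent rows (rw) that act on one concatenated vector of biased input and
# previous hidden state. The following is a minimal, self-contained sketch of
# that forward recursion, not library code: it assumes util.bias appends a
# column of ones and uses plain np.tanh in place of phi.
import numpy as np

rng = np.random.default_rng(0)
nObs, nIn, nHidden = 6, 3, 4
x = rng.normal(size=(nObs, nIn))

# rows 0..nIn hold the input (plus bias) weights, the remaining rows the
# recurrent weights, mirroring the iw/rw views above
hw = rng.normal(size=(nIn + 1 + nHidden, nHidden))

context = np.zeros(nHidden)            # previous hidden state, initially zero
r = np.empty((nObs, nHidden))
for t in range(nObs):
    x1c = np.concatenate([x[t], [1.0], context])   # biased input, then context
    r[t] = np.tanh(x1c.dot(hw))
    context = r[t]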
def gradient(self, x, g, unrollSteps=10, returnError=True):
    x = util.segmat(x)
    g = util.segmat(g)

    # packed views of the hidden and visible gradient matrices
    pg, hg, vg = util.packedViews((self.hw.shape, self.vw.shape),
                                  dtype=self.dtype)

    x1 = util.bias(x)

    nSeg = x1.shape[0]
    nObs = x1.shape[1]
    nIn1 = x1.shape[2]

    h = np.empty((nSeg, nObs, self.nHidden), dtype=self.dtype)
    r = np.empty((nSeg, nObs, self.nHidden), dtype=self.dtype)
    x1c = np.empty((nSeg, nObs, nIn1 + self.nHidden), dtype=self.dtype)

    # forward pass, one step at a time with the previous hidden state as context
    context = np.zeros((nSeg, self.nHidden), dtype=self.dtype)
    for t in range(nObs):
        x1c[:, t, :nIn1] = x1[:, t]
        x1c[:, t, nIn1:] = context

        h[:, t] = x1c[:, t].dot(self.hw)
        r[:, t] = self.phi(h[:, t])
        context[...] = r[:, t]

    r1 = util.bias(r)
    y = r1.dot(self.vw)

    rPrime = self.phi(h, 1)

    # error components, ditch transient
    e = (y - g)[:, self.transient:]
    delta = np.zeros(g.shape, dtype=self.dtype)
    delta[:, self.transient:] = 2.0 * e / e.size

    # visible layer gradient
    r1f = r1.reshape((-1, r1.shape[-1]))
    deltaf = delta.reshape((-1, delta.shape[-1]))
    vg[...] = r1f.T.dot(deltaf)

    vwDelta = delta.dot(self.vw[:-1].T)

    gamma = np.zeros((nSeg, unrollSteps, self.nHidden), dtype=self.dtype)
    #delta = np.zeros((nSeg, nObs-self.transient, self.nHidden), dtype=self.dtype)
    delta = np.zeros((nSeg, nObs, self.nHidden), dtype=self.dtype)
    ##hg[...] = 0.0

    # backward pass for hidden layer, unrolled through time
    #for t in range(nObs-self.transient-1, 0, -1):
    for t in range(nObs - 1, 0, -1):
        rPrimet = rPrime[:, t][:, None, :]
        #x1ct = x1c[:,t][:,None,:]
        ##x1ct = x1c[:,t]

        beta = gamma[:, :-1]
        beta = beta.dot(self.rw.T)

        gamma[:, 0] = vwDelta[:, t]
        gamma[:, 1:] = beta
        gamma *= rPrimet

        ##x1ctf = np.tile(x1ct, unrollSteps).reshape((-1, x1ct.shape[-1]))
        ##gammaf = gamma.reshape((-1, gamma.shape[-1]))

        delta[:, t] = gamma.sum(axis=1)

        #hg += x1ctf.T.dot(gammaf)
        ##hg += x1ct.T.dot(gamma.sum(axis=1))
        ##hg += x1ct.T.dot(gamma.swapaxes(0,1)).sum(axis=1)

    # hidden layer gradient
    x1cf = x1c.reshape((-1, x1c.shape[-1]))
    deltaf = delta.reshape((-1, delta.shape[-1]))

    #hg[...] = x1c.reshape((-1, x1c.shape[-1])).T.dot(delta.reshape((-1, d.shape[-1])))
    hg[...] = x1cf.T.dot(deltaf)

    if returnError:
        return np.mean(e**2), pg
    else:
        return pg
def __init__(self, x, g, convs=((8, 16), (16, 8)), nHidden=None,
             transFunc=transfer.lecun, weightInitFunc=pinit.lecun,
             penalty=None, elastic=1.0, optimFunc=optim.scg, **kwargs):
    x = util.segmat(x)
    g = util.segmat(g)

    self.dtype = np.result_type(x.dtype, g.dtype)

    Regression.__init__(self, x.shape[2], g.shape[2])
    optim.Optable.__init__(self)

    self.nConvHiddens, self.convWidths = zip(*convs)
    self.nConvLayers = len(convs)
    self.nHidden = nHidden

    self.layerDims = [(self.nIn * self.convWidths[0] + 1, self.nConvHiddens[0])]
    for l in range(1, self.nConvLayers):
        ni = self.nConvHiddens[l - 1] * self.convWidths[l] + 1
        no = self.nConvHiddens[l]
        self.layerDims.append((ni, no))

    if self.nHidden is None:
        self.layerDims.append((self.nConvHiddens[-1] + 1, self.nOut))
    else:
        self.layerDims.append((self.nConvHiddens[-1] + 1, self.nHidden))
        self.layerDims.append((self.nHidden + 1, self.nOut))

    self.transFunc = transFunc if util.isiterable(transFunc) \
        else (transFunc,) * (len(self.layerDims) - 1)
    assert len(self.transFunc) == (len(self.layerDims) - 1)

    views = util.packedViews(self.layerDims, dtype=self.dtype)
    self.pw = views[0]

    if self.nHidden is None:
        self.cws = views[1:-1]
        self.hw = None
        self.vw = views[-1]
    else:
        self.cws = views[1:-2]
        self.hw = views[-2]
        self.vw = views[-1]

    if not util.isiterable(weightInitFunc):
        weightInitFunc = (weightInitFunc,) * (self.nConvLayers + 2)
    assert len(weightInitFunc) == (len(self.cws) + 2)

    self.penalty = penalty
    if self.penalty is not None:
        if not util.isiterable(self.penalty):
            self.penalty = (self.penalty,) * (self.nConvLayers + 2)
    assert (self.penalty is None) or (len(self.penalty) == (len(self.cws) + 2))

    self.elastic = elastic if util.isiterable(elastic) \
        else (elastic,) * (self.nConvLayers + 2)
    assert (len(self.elastic) == (len(self.cws) + 2))

    # initialize weights
    for cw, wif in zip(self.cws, weightInitFunc):
        cw[...] = wif(cw.shape).astype(self.dtype, copy=False)
    if self.nHidden is not None:
        self.hw[...] = weightInitFunc[-2](self.hw.shape).astype(self.dtype, copy=False)
    self.vw[...] = weightInitFunc[-1](self.vw.shape).astype(self.dtype, copy=False)

    # train the network
    if optimFunc is not None:
        self.train(x, g, optimFunc, **kwargs)
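# The first-layer dimension above, nIn * convWidths[0] + 1, reflects that each
# convolutional layer is computed as an ordinary matrix product over a
# time-embedded (lag-stacked) copy of its input. The snippet below is a rough,
# self-contained sketch of that idea only; it assumes util.timeEmbed stacks
# shifted copies of the signal along the feature axis, and the exact layout in
# the library may differ. The +1 in layerDims is the bias column, omitted here.
import numpy as np

rng = np.random.default_rng(0)
nObs, nIn, width, nHidden = 10, 2, 3, 4
x = rng.normal(size=(nObs, nIn))

# stack width shifted copies of the input along the feature axis so that row t
# holds x[t], x[t+1], ..., x[t+width-1]; the sequence shrinks by width-1 steps
emb = np.hstack([x[lag:nObs - (width - 1) + lag] for lag in range(width)])

cw = rng.normal(size=(nIn * width, nHidden))   # one weight per (lag, input, hidden)
h = emb.dot(cw)                                # shape (nObs - width + 1, nHidden)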
def gradient(self, x, g, returnError=True):
    x = util.segmat(x)
    g = util.colmat(g)

    # packed views of the hidden and visible gradient matrices
    views = util.packedViews(self.layerDims, dtype=self.dtype)
    pg = views[0]

    if self.nHidden is None:
        cgs = views[1:-1]
        hg = None
        vg = views[-1]
    else:
        cgs = views[1:-2]
        hg = views[-2]
        vg = views[-1]

    # forward pass
    c = x
    c1s = []
    cPrimes = []
    for l, cw in enumerate(self.cws):
        width = self.convWidths[l]
        phi = self.transFunc[l]

        c = util.timeEmbed(c, lags=width - 1, axis=1)

        c1 = util.bias(c)
        c1s.append(c1)

        h = util.segdot(c1, cw)

        cPrime = phi(h, 1)
        cPrimes.append(cPrime)

        c = phi(h)

    c1 = util.bias(c)

    # evaluate hidden and visible layers
    if self.nHidden is None:
        y = util.segdot(c1, self.vw)
    else:
        h = util.segdot(c1, self.hw)
        z1 = util.bias(self.transFunc[-1](h))
        zPrime = self.transFunc[-1](h, 1)
        y = util.segdot(z1, self.vw)

    # trim targets to align with the shortened convolution outputs
    trim = (g.shape[1] - y.shape[1]) // 2
    gTrim = g[:, :(g.shape[1] - trim)]
    gTrim = gTrim[:, -y.shape[1]:]

    # error components
    e = util.colmat(y - gTrim)
    delta = 2.0 * e / e.size

    if self.nHidden is None:
        # visible layer gradient
        c1f = c1.reshape((-1, c1.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        vg[...] = c1f.T.dot(deltaf)
        vg += self.penaltyGradient(-1)

        delta = util.segdot(delta, self.vw[:-1].T)
    else:
        # visible layer gradient
        z1f = z1.reshape((-1, z1.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        vg[...] = z1f.T.dot(deltaf)
        vg += self.penaltyGradient(-1)

        # hidden layer gradient
        c1f = c1.reshape((-1, c1.shape[-1]))
        delta = util.segdot(delta, self.vw[:-1].T) * zPrime
        deltaf = delta.reshape((-1, delta.shape[-1]))
        hg[...] = c1f.T.dot(deltaf)
        hg += self.penaltyGradient(-2)

        delta = util.segdot(delta, self.hw[:-1].T)

    # backward pass for convolutional layers
    for l in range(self.nConvLayers - 1, -1, -1):
        c1 = c1s[l]
        cPrime = cPrimes[l]

        delta = delta[:, :cPrime.shape[1]] * cPrime

        c1f = c1.reshape((-1, c1.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        cgs[l][...] = c1f.T.dot(deltaf)
        cgs[l] += self.penaltyGradient(l)

        if l > 0:  # won't propagate back to inputs
            delta = util.segdot(delta, self.cws[l][:-1].T)
            delta = deltaDeEmbedSum(delta, self.convWidths[l])

    if returnError:
        error = np.mean(e**2) + self.penaltyError()
        return error, pg
    else:
        return pg
def __init__(self, x, g, recs=(8, 4, 2), transient=0, phi=transfer.tanh,
             #iwInitFunc=pinit.lecun, rwInitFunc=pinit.lecun,
             hwInitFunc=pinit.esp, vwInitFunc=pinit.lecun,
             optimFunc=optim.scg, **kwargs):
    x = util.segmat(x)
    g = util.segmat(g)

    self.dtype = np.result_type(x.dtype, g.dtype)

    Regression.__init__(self, x.shape[2], g.shape[2])
    optim.Optable.__init__(self)

    self.transient = transient
    self.phi = phi

    self.nRecHiddens = list(recs)
    self.nRecLayers = len(self.nRecHiddens)

    self.layerDims = [(self.nIn + self.nRecHiddens[0] + 1, self.nRecHiddens[0])]
    for l in range(1, self.nRecLayers):
        self.layerDims.append(
            (self.nRecHiddens[l - 1] + self.nRecHiddens[l] + 1, self.nRecHiddens[l]))
    self.layerDims.append((self.nRecHiddens[-1] + 1, self.nOut))

    views = util.packedViews(self.layerDims, dtype=self.dtype)
    self.pw = views[0]
    self.hws = views[1:-1]
    self.vw = views[-1]

    self.iws = []
    self.rws = []
    nIn = self.nIn
    for l in range(self.nRecLayers):
        iw = self.hws[l][:(nIn + 1)]
        rw = self.hws[l][(nIn + 1):]
        self.iws.append(iw)
        self.rws.append(rw)

        #self.iws[l][...] = iwInitFunc(iw.shape).astype(self.dtype, copy=False)
        #self.rws[l][...] = rwInitFunc(rw.shape).astype(self.dtype, copy=False)

        nIn = self.nRecHiddens[l]

        self.hws[l][...] = hwInitFunc(self.hws[l].shape).astype(self.dtype, copy=False)

    self.vw[...] = vwInitFunc(self.vw.shape).astype(self.dtype, copy=False)

    # train the network
    if optimFunc is not None:
        self.train(x, g, optimFunc, **kwargs)
def gradient(self, x, g, unrollSteps=10, returnError=True):
    x = util.segmat(x)
    g = util.segmat(g)

    if isinstance(unrollSteps, (int,)):
        unrollSteps = [unrollSteps, ] * self.nRecLayers

    views = util.packedViews(self.layerDims, dtype=self.dtype)
    pg = views[0]
    hgs = views[1:-1]
    vg = views[-1]

    x1 = util.bias(x)

    nSeg = x1.shape[0]
    nObs = x1.shape[1]

    # forward pass through each recurrent layer
    r1Prev = x1
    r1cs = []
    rPrimes = []
    for l in range(self.nRecLayers):
        nIn1 = r1Prev.shape[2]

        r = np.empty((nSeg, nObs, self.nRecHiddens[l]), dtype=self.dtype)
        h = np.empty((nSeg, nObs, self.nRecHiddens[l]), dtype=self.dtype)
        r1c = np.empty((nSeg, nObs, nIn1 + self.nRecHiddens[l]), dtype=self.dtype)

        context = np.zeros((nSeg, self.nRecHiddens[l]), dtype=self.dtype)
        for t in range(nObs):
            r1c[:, t, :nIn1] = r1Prev[:, t]
            r1c[:, t, nIn1:] = context

            h[:, t] = r1c[:, t].dot(self.hws[l])
            r[:, t] = self.phi(h[:, t])
            context[...] = r[:, t]

        r1Prev = util.bias(r)
        r1cs.append(r1c)

        rPrime = self.phi(h, 1)
        rPrimes.append(rPrime)

    # evaluate visible layer
    r1 = r1Prev
    y = r1.dot(self.vw)

    # error components, ditch transient
    e = (y - g)[:, self.transient:]
    delta = np.zeros(g.shape, dtype=self.dtype)
    delta[:, self.transient:] = 2.0 * e / e.size

    # visible layer gradient
    r1f = r1.reshape((-1, r1.shape[-1]))
    deltaf = delta.reshape((-1, delta.shape[-1]))
    vg[...] = r1f.T.dot(deltaf)

    # backward pass through each layer
    w = self.vw
    for l in range(self.nRecLayers - 1, -1, -1):
        r1c = r1cs[l]
        rwsTrans = self.rws[l].T
        rPrime = rPrimes[l]

        deltaPrev = delta.dot(w[:-1].T)

        gamma = np.zeros((nSeg, unrollSteps[l], self.nRecHiddens[l]), dtype=self.dtype)
        #delta = np.zeros((nSeg, nObs-self.transient, self.nRecHiddens[l]), dtype=self.dtype)
        delta = np.zeros((nSeg, nObs, self.nRecHiddens[l]), dtype=self.dtype)

        # unrolled through time
        #for t in range(nObs-self.transient-1, 0, -1):
        for t in range(nObs - 1, 0, -1):
            rPrimet = rPrime[:, t][:, None, :]

            beta = gamma[:, :-1]
            beta = beta.dot(rwsTrans)

            gamma[:, 0] = deltaPrev[:, t]
            gamma[:, 1:] = beta
            gamma *= rPrimet

            delta[:, t] = gamma.sum(axis=1)

        r1cf = r1c.reshape((-1, r1c.shape[-1]))
        deltaf = delta.reshape((-1, delta.shape[-1]))
        hgs[l][...] = r1cf.T.dot(deltaf)
        #print('hg %d: %f' % (l, np.sqrt(np.mean(hgs[l]**2))))

        w = self.iws[l]

    if returnError:
        return np.mean(e**2), pg
    else:
        return pg
def gradient(self, x, g, returnError=True):
    """Compute the gradient of the cross-entropy error with respect to the
    network weights for each layer, given inputs and targets.  Useful for
    optimization routines that make use of first-order gradients.

    Args:
        x:              Input data.

        g:              Target values.

        returnError:    If True (default) then also return the cross-entropy
                        error.  This can improve performance in some
                        optimization routines by avoiding an additional
                        forward pass.

    Returns:
        If returnError is True, then return a tuple containing the error
        followed by a 1d numpy array containing the gradient of the packed
        weights.  If returnError is False, then only return the gradient.
    """
    x = np.asarray(x)
    g = np.asarray(g)

    # packed views of the hidden and visible gradient matrices
    views = util.packedViews(self.layerDims, dtype=self.dtype)
    pg = views[0]
    hgs = views[1:-1]
    vg = views[-1]

    # forward pass
    z1 = util.bias(x)
    z1s = [z1]
    zPrimes = []
    for hw, phi in zip(self.hws, self.transFunc):
        h = z1.dot(hw)
        z1 = util.bias(phi(h))
        z1s.append(z1)

        zPrime = phi(h, 1)
        zPrimes.append(zPrime)

    v = z1.dot(self.vw)
    probs = util.softmax(v)

    # error components
    delta = util.colmat(probs - g) / probs.size

    # visible layer gradient
    vg[...] = z1.T.dot(delta)
    vg += self.penaltyGradient(-1)

    # backward pass for hidden layers
    w = self.vw
    for l in range(self.nHLayers - 1, -1, -1):
        delta = delta.dot(w[:-1, :].T) * zPrimes[l]

        hgs[l][...] = z1s[l].T.dot(delta)
        hgs[l] += self.penaltyGradient(l)

        w = self.hws[l]

    if returnError:
        error = -np.mean(g * np.log(util.capZero(probs))) + self.penaltyError()
        return error, pg
    else:
        return pg
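# The delta = (probs - g) / probs.size step above relies on the standard
# identity that, for targets summing to one, the gradient of the cross-entropy
# with respect to the pre-softmax activations is simply probs - g. The snippet
# below is a small, self-contained numerical check of that identity; it is
# illustrative only and independent of the class above.
import numpy as np

def softmax(v):
    ev = np.exp(v - v.max())
    return ev / ev.sum()

rng = np.random.default_rng(0)
v = rng.normal(size=5)                 # pre-softmax activations
g = np.zeros(5)
g[2] = 1.0                             # one-hot target

analytic = softmax(v) - g

# central-difference gradient of the cross-entropy -sum(g * log(softmax(v)))
eps = 1e-6
numeric = np.empty_like(v)
for i in range(v.size):
    dv = np.zeros_like(v)
    dv[i] = eps
    f = lambda vv: -np.sum(g * np.log(softmax(vv)))
    numeric[i] = (f(v + dv) - f(v - dv)) / (2.0 * eps)

print(np.allclose(analytic, numeric, atol=1e-5))   # True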
def __init__(self, classData, nHidden=10, transFunc=transfer.lecun,
             weightInitFunc=pinit.lecun, penalty=None, elastic=1.0,
             optimFunc=optim.scg, **kwargs):
    """Construct a new feedforward neural network.

    Args:
        classData:      Training data.  This is a numpy array or list of
                        numpy arrays with shape (nCls, nObs[, nIn]).  If
                        the dimensions index is missing the data is assumed
                        to be one-dimensional.

        nHidden:        Number of hidden units.

        transFunc:      Hidden layer transfer function.  The default is
                        transfer.lecun.  See the transfer module for more.

        weightInitFunc: Function to initialize the weights in each layer.
                        If a single function is given, it will be repeated
                        for each layer with the visible layer last.  The
                        default function is the lecun function in the
                        paraminit module.  See the paraminit module for
                        more choices.

        penalty:        Penalty or weight decay.  The cost function is then
                        e + p * ||W||_2 where e is the prediction error, p
                        is the penalty and ||W||_2 denotes the l2 norm of
                        the weight matrix.  This regularizes the network by
                        pulling the weights toward zero and toward each other.

        elastic:        Mixing between the L2-norm and L1-norm penalties:
                        1.0 is a pure L2-norm penalty, 0.0 is a pure
                        L1-norm penalty and values in between give an
                        elastic-net penalty.

        optimFunc:      Function used to optimize the weight matrices.
                        If None, initial training will be skipped.  See
                        ml.optim for some candidate optimization functions.

        kwargs:         Additional arguments passed to optimFunc.

    Returns:
        A new, trained feedforward network.

    Refs:
        @incollection{lecun2012efficient,
          title={Efficient backprop},
          author={LeCun, Yann A and Bottou, L{\'e}on and Orr, Genevieve B
                  and M{\"u}ller, Klaus-Robert},
          booktitle={Neural networks: Tricks of the trade},
          pages={9--48},
          year={2012},
          publisher={Springer}
        }
    """
    Classifier.__init__(self, util.colmat(classData[0]).shape[1], len(classData))
    optim.Optable.__init__(self)

    self.dtype = np.result_type(*[cls.dtype for cls in classData])

    self.nHidden = nHidden if util.isiterable(nHidden) else (nHidden,)
    self.nHLayers = len(self.nHidden)

    self.layerDims = [(self.nIn + 1, self.nHidden[0])]
    for l in range(1, self.nHLayers):
        self.layerDims.append((self.nHidden[l - 1] + 1, self.nHidden[l]))
    self.layerDims.append((self.nHidden[-1] + 1, self.nCls))

    self.transFunc = transFunc if util.isiterable(transFunc) \
        else (transFunc,) * self.nHLayers
    assert len(self.transFunc) == self.nHLayers

    views = util.packedViews(self.layerDims, dtype=self.dtype)
    self.pw = views[0]
    self.hws = views[1:-1]
    self.vw = views[-1]

    if not util.isiterable(weightInitFunc):
        weightInitFunc = (weightInitFunc,) * (self.nHLayers + 1)
    assert len(weightInitFunc) == (len(self.hws) + 1)

    # initialize weights
    for hw, wif in zip(self.hws, weightInitFunc):
        hw[...] = wif(hw.shape).astype(self.dtype, copy=False)
    self.vw[...] = weightInitFunc[-1](self.vw.shape).astype(self.dtype, copy=False)

    self.penalty = penalty
    if self.penalty is not None:
        if not util.isiterable(self.penalty):
            self.penalty = (self.penalty,) * (self.nHLayers + 1)
    assert (self.penalty is None) or (len(self.penalty) == (len(self.hws) + 1))

    self.elastic = elastic if util.isiterable(elastic) \
        else (elastic,) * (self.nHLayers + 1)
    assert (len(self.elastic) == (len(self.hws) + 1))

    # train the network
    if optimFunc is not None:
        self.train(classData, optimFunc, **kwargs)
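# The docstring above describes a weight penalty that blends L2 and L1 decay
# according to elastic. The functions below are a minimal sketch of such an
# elastic-net penalty and its gradient; they assume the library's
# penaltyError/penaltyGradient methods compute something of this form, though
# the exact normalization there may differ.
import numpy as np

def elasticPenalty(w, p, elastic=1.0):
    """Elastic-net weight penalty: pure L2 at elastic=1.0, pure L1 at elastic=0.0."""
    l2 = np.sum(w**2)
    l1 = np.sum(np.abs(w))
    return p * (elastic * l2 + (1.0 - elastic) * l1)

def elasticPenaltyGradient(w, p, elastic=1.0):
    """Gradient of elasticPenalty with respect to the weights w."""
    return p * (elastic * 2.0 * w + (1.0 - elastic) * np.sign(w))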
def __init__(self, x, g, nHidden=10, transFunc=transfer.lecun,
             weightInitFunc=pinit.lecun, penalty=None, elastic=1.0,
             optimFunc=optim.scg, **kwargs):
    x = np.asarray(x)
    g = np.asarray(g)

    self.dtype = np.result_type(x.dtype, g.dtype)

    self.flattenOut = False if g.ndim > 1 else True

    Regression.__init__(self, util.colmat(x).shape[1],
                        util.colmat(g).shape[1])
    optim.Optable.__init__(self)

    self.nHidden = nHidden if util.isiterable(nHidden) else (nHidden,)
    self.nHLayers = len(self.nHidden)

    self.layerDims = [(self.nIn + 1, self.nHidden[0])]
    for l in range(1, self.nHLayers):
        self.layerDims.append((self.nHidden[l - 1] + 1, self.nHidden[l]))
    self.layerDims.append((self.nHidden[-1] + 1, self.nOut))

    self.transFunc = transFunc if util.isiterable(transFunc) \
        else (transFunc,) * self.nHLayers
    assert len(self.transFunc) == self.nHLayers

    views = util.packedViews(self.layerDims, dtype=self.dtype)
    self.pw = views[0]
    self.hws = views[1:-1]
    self.vw = views[-1]

    if not util.isiterable(weightInitFunc):
        weightInitFunc = (weightInitFunc,) * (self.nHLayers + 1)
    assert len(weightInitFunc) == (len(self.hws) + 1)

    # initialize weights
    for hw, wif in zip(self.hws, weightInitFunc):
        hw[...] = wif(hw.shape).astype(self.dtype, copy=False)
    self.vw[...] = weightInitFunc[-1](self.vw.shape).astype(self.dtype, copy=False)

    self.penalty = penalty
    if self.penalty is not None:
        if not util.isiterable(self.penalty):
            self.penalty = (self.penalty,) * (self.nHLayers + 1)
    assert (self.penalty is None) or (len(self.penalty) == (len(self.hws) + 1))

    self.elastic = elastic if util.isiterable(elastic) \
        else (elastic,) * (self.nHLayers + 1)
    assert (len(self.elastic) == (len(self.hws) + 1))

    # train the network
    if optimFunc is not None:
        self.train(x, g, optimFunc, **kwargs)