def pt_grad(self, params, inpts, **kwargs):
    """Gradient of the reconstruction score plus a contractive-style
    penalty (weighted by self.cae) for a tied-weight autoencoder.

    params layout: weights [0:m_end], hidden biases
    [m_end:m_end+shape[1]], visible biases [-shape[0]:].
    inpts: minibatch, one example per row.
    Returns a flat gradient of the same shape as params.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape
    # hidden activations: logistic(inpts . W + hidden bias)
    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.m_end + self.shape[1]])
    # reconstruction with tied (transposed) weights plus visible bias
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    w = params[:self.m_end].reshape(self.shape)
    # penalty value: mean squared sigmoid-derivative per hidden unit
    # times the squared column norms of W, summed over hidden units
    cae = gpu.sum(gpu.mean(Dsigmoid(hddn) ** 2, axis=0) * gpu.sum(w ** 2, axis=0))
    cae *= self.cae
    _, delta = self.score(Z, inpts, error=True, addon=cae)
    # decoder part of the tied-weight gradient, and visible bias grad
    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)
    # penalty gradient w.r.t. W: direct term ...
    cae_grad = gpu.mean(Dsigmoid(hddn) ** 2, axis=0) * w
    # ... plus the term flowing through the hidden activations
    cae_grad += gdot(inpts.T, (Dsigmoid(hddn) ** 2 * (1 - 2 * hddn))) / m * gpu.sum(w ** 2, axis=0)
    g[:self.m_end] += self.cae * 2 * cae_grad.ravel()
    # backprop reconstruction error through the encoder
    dsc_dha = Dsigmoid(hddn) * gdot(delta, params[:self.m_end].reshape(self.shape))
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    # hidden bias gradient
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)
    # clean up
    del delta, hddn, Z
    return g
def pt_grad(self, params, inpts, **kwargs):
    """Gradient for a tied-weight autoencoder with a contractive-style
    penalty weighted by self.cae.

    params layout: weights [0:m_end], hidden biases
    [m_end:m_end+shape[1]], visible biases [-shape[0]:].
    Returns a flat gradient of the same shape as params.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape
    # encode: logistic(inpts . W + hidden bias)
    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.m_end + self.shape[1]])
    # decode with the transposed (tied) weights plus visible bias
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    w = params[:self.m_end].reshape(self.shape)
    # penalty: mean squared sigmoid-derivative times squared column
    # norms of W, summed over hidden units, scaled by self.cae
    cae = gpu.sum(gpu.mean(Dsigmoid(hddn)**2, axis=0) * gpu.sum(w**2, axis=0))
    cae *= self.cae
    _, delta = self.score(Z, inpts, error=True, addon=cae)
    # decoder contribution to the tied weight gradient + visible bias
    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)
    # penalty gradient: direct term plus the path through the hiddens
    cae_grad = gpu.mean(Dsigmoid(hddn)**2, axis=0) * w
    cae_grad += (gdot(inpts.T, (Dsigmoid(hddn)**2 * (1 - 2 * hddn))) / m * gpu.sum(w**2, axis=0))
    g[:self.m_end] += self.cae * 2 * cae_grad.ravel()
    # backprop reconstruction error through the encoder
    dsc_dha = Dsigmoid(hddn) * gdot(delta, params[:self.m_end].reshape(self.shape))
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    # hidden bias gradient
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)
    # clean up
    del delta, hddn, Z
    return g
def pt_grad(self, params, inpts, **kwargs):
    """Gradient for an untied-weight autoencoder with a KL sparsity
    penalty (weight self.beta, target activation rate self.rho).

    params layout: encoder weights [0:m_end], hidden biases
    [m_end:size], decoder weights [size:-shape[0]] (shape Tshape),
    visible biases [-shape[0]:].
    Returns a flat gradient of the same shape as params.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape
    hddn = logistic(gdot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) + params[-self.shape[0]:]
    # Running average of the mean hidden activation. Fix: identity
    # test with `is None` -- the original `== None` performs an
    # elementwise comparison once rho_hat_grad holds an array.
    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1*hddn.mean(axis=0)
    # rho_hat = hddn.mean(axis=0)
    rho_hat = self.rho_hat_grad
    rho = self.rho
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))
    _, delta = self.score(Z, inpts, error=True, addon=sparsity)
    # decoder weight and visible bias gradients
    g[self.size:-self.shape[0]] = gdot(hddn.T, delta).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)
    diff = Dsigmoid(hddn)
    dsparse_dha = -rho/rho_hat + (1-rho)/(1-rho_hat)
    # NOTE(review): delta is backpropagated through the *encoder*
    # weights although decoding uses separate weights -- confirm this
    # tied-style backprop is intentional.
    dsc_dha = diff * (gdot(delta, params[:self.m_end].reshape(self.shape)) + self.beta*dsparse_dha/m)
    g[:self.m_end] = gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:self.size] = dsc_dha.sum(axis=0)
    # clean up
    del delta, hddn, Z
    return g
def pt_init(self, init_var=1e-2, init_bias=0., rho=0.5, lmbd=0., l2=0., SI=15, **kwargs):
    """Allocate and initialize pretraining parameters for this layer.

    Flat layout: weights (m_end entries), then biases, then a final
    block of self.shape[0] entries set to 1. (the original comment
    calls these precision parameters). Also wires up pt_score/pt_grad
    and stores the hyperparameters on self.
    """
    n_vis = self.shape[0]
    # 2*n_vis: one bias block plus one precision block of size shape[0]
    theta = gzeros(self.m_end + self.shape[1] + 2 * n_vis)
    if init_var is None:
        # sparse initialization with SI nonzero entries
        theta[:self.m_end] = gpu.garray(init_SI(self.shape, sparsity=SI)).ravel()
    else:
        theta[:self.m_end] = init_var * gpu.randn(self.m_end)
    theta[self.m_end:-n_vis] = init_bias
    theta[-n_vis:] = 1.
    self.pt_score = self.reconstruction
    self.pt_grad = self.grad_cd1
    self.l2 = l2
    self.rho = rho
    self.lmbd = lmbd
    self.rho_hat = None
    return theta
def pt_init(self, H=bernoulli, V=bernoulli, init_var=1e-2, init_bias=0., rho=0.5, lmbd=0., l2=0., **kwargs):
    """Initialize RBM pretraining parameters.

    Flat layout: weights (m_end entries) followed by hidden and
    visible biases. H/V select the hidden/visible unit types; the
    matching activation is looked up in match_table.
    """
    theta = gzeros(self.m_end + self.shape[1] + self.shape[0])
    if init_var is None:
        # uniform range heuristic from the layer's fan-in + fan-out
        bound = 4 * np.sqrt(6. / (self.shape[0] + self.shape[1]))
        theta[:self.m_end] = (gpu.rand(self.m_end) * 2 - 1) * bound
    else:
        theta[:self.m_end] = init_var * gpu.randn(self.m_end)
    theta[self.m_end:] = init_bias
    self.H = H
    self.V = V
    self.activ = match_table[H]
    self.pt_score = self.reconstruction
    self.pt_grad = self.grad_cd1
    self.l2 = l2
    self.rho = rho
    self.lmbd = lmbd
    self.rho_hat = None
    return theta
def pt_grad(self, params, inpts, **kwargs):
    """Gradient for a tied-weight autoencoder with a KL sparsity
    penalty (weight self.beta, target activation rate self.rho).

    params layout: weights [0:m_end], hidden biases
    [m_end:m_end+shape[1]], visible biases [-shape[0]:].
    Returns a flat gradient of the same shape as params.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape
    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.m_end+self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    # Running average of the mean hidden activation. Fix: identity
    # test with `is None` -- the original `== None` performs an
    # elementwise comparison once rho_hat_grad holds an array.
    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1*hddn.mean(axis=0)
    # rho_hat = hddn.mean(axis=0)
    rho_hat = self.rho_hat_grad
    rho = self.rho
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))
    _, delta = self.score(Z, inpts, error=True, addon=sparsity)
    # decoder part of the tied weight gradient + visible bias grad
    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)
    diff = Dsigmoid(hddn)
    dsparse_dha = -rho/rho_hat + (1-rho)/(1-rho_hat)
    # backprop reconstruction error + sparsity term through encoder
    dsc_dha = diff * (gdot(delta, params[:self.m_end].reshape(self.shape)) + self.beta*dsparse_dha/m)
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)
    # clean up
    del delta, hddn, Z
    return g
def grad(self, params, inputs, targets, **kwargs):
    """Full gradient for the unrolled autoencoder (encoder + decoder).

    Forward-propagates through encoder then decoder using self.params,
    scores the reconstruction against *inputs* (targets is accepted
    but unused here), then backpropagates through decoder and encoder
    in reverse, writing per-layer gradients into g in place via bprop.
    """
    data = inputs
    for layer, (c1, c2) in izip(self.encoder, izip(self.enc[:-1], self.enc[1:])):
        data = layer.fprop(self.params[c1:c2], data)
    # possible spot for semisupervision?
    for layer, (c1, c2) in izip(self.decoder, izip(self.dec[:-1], self.dec[1:])):
        data = layer.fprop(self.params[c1:c2], data)
    _, delta = self._score(data, inputs, error=True)
    g = gzeros(self.psize)
    # backward pass: decoder layers in reverse order
    for layer, (c1, c2) in izip(self.decoder[::-1], izip(self.dec[-2::-1], self.dec[:0:-1])):
        delta = layer.bprop(params=params[c1:c2], grad=g[c1:c2], delta=delta)
    # in case: fuse in gradient from semisupervision
    for layer, (c1, c2) in izip(self.encoder[::-1], izip(self.enc[-2::-1], self.enc[:0:-1])):
        delta = layer.bprop(params=params[c1:c2], grad=g[c1:c2], delta=delta)
    return g
def pt_grad(self, params, inpts, **kwargs):
    """Gradient for a k-sparse tied-weight autoencoder: hidden
    activations are thresholded per example before reconstruction.

    params layout: weights [0:m_end], hidden biases
    [m_end:m_end+shape[1]], visible biases [-shape[0]:].
    """
    g = gzeros(params.shape)
    hddn = self.activ(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.m_end + self.shape[1]])
    # the per-row selection happens on the CPU via argsort
    _hddn = hddn.as_numpy_array()
    idxs = np.argsort(_hddn, axis=1)
    # Zeroes entries at ascending-sorted positions self.ak onward.
    # NOTE(review): with an ascending argsort this zeroes the LARGEST
    # activations, keeping the self.ak smallest -- confirm the
    # intended direction of the sparsification.
    _hddn[range(_hddn.shape[0]), idxs[:, self.ak:].T] = 0
    hddn = gpu.garray(_hddn)
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    _, delta = self.score(Z, inpts, error=True)
    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)
    # backprop through the encoder via the activation derivative
    dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)
    # clean up
    del delta
    return g
def pt_init(self, H=bernoulli, V=bernoulli, init_var=1e-2, init_bias=0., rho=0.5, lmbd=0., l2=0., **kwargs):
    """Set up RBM pretraining parameters and hyperparameters.

    Returns a flat array: weights first, then hidden and visible
    biases. The activation for the hidden type H comes from
    match_table; pt_score/pt_grad are wired to reconstruction/CD-1.
    """
    n_weights = self.m_end
    theta = gzeros(n_weights + self.shape[1] + self.shape[0])
    if init_var is None:
        # symmetric uniform init scaled by the fan-in/fan-out heuristic
        scale = 4 * np.sqrt(6. / (self.shape[0] + self.shape[1]))
        theta[:n_weights] = (gpu.rand(n_weights) * 2 - 1) * scale
    else:
        theta[:n_weights] = init_var * gpu.randn(n_weights)
    theta[n_weights:] = init_bias
    self.H = H
    self.V = V
    self.activ = match_table[H]
    self.pt_score = self.reconstruction
    self.pt_grad = self.grad_cd1
    self.l2 = l2
    self.rho = rho
    self.lmbd = lmbd
    self.rho_hat = None
    return theta
def reload(self, _pt_params):
    """Load pretraining parameters back into this layer.

    Lazily allocates the layer's parameter buffer, converts the given
    array to a garray and hands it to prep_layer.
    """
    if self.p is None:
        self.p = gzeros(self.size)
    garr = gpu.as_garray(_pt_params)
    self.prep_layer(garr)
    del garr
def grad_cd1(self, params, inputs, **kwargs):
    """CD-1 gradient for a Gaussian-visible RBM with learned
    square-root precision parameters.

    params layout: weights [0:m_end], hidden biases [m_end:m_end+H],
    visible biases [-2V:-V], sqrt-precisions [-V:].
    Signs are flipped so a minimizer moves in the CD direction; adds
    l2 weight decay and, when self.lmbd > 0, a KL sparsity term.
    """
    g = gzeros(params.shape)
    n, _ = inputs.shape
    m_end = self.m_end
    V = self.shape[0]
    H = self.shape[1]
    wm = params[:m_end].reshape(self.shape)
    # column vector of sqrt-precisions, one per visible unit
    prec = params[-V:][:, gpu.newaxis]
    # positive phase: hidden given data (weights scaled by precision)
    h1, h_sampled = self.H(inputs, wm=prec*wm, bias=params[m_end:m_end+H], sampling=True)
    # negative phase: gaussian reconstruction, then hidden once more
    v2, v_sampled = gauss(h_sampled, wm=(wm/prec).T, bias=params[-(2*V):-V], prec=prec.T, sampling=True)
    h2, _ = self.H(v2, wm=prec*wm, bias=params[m_end:m_end+H])
    #print h1[0,0], h_sampled[0,0], v2[0,0], v_sampled[0,0]
    # Note the negative sign: the gradient is
    # supposed to point into 'wrong' direction.
    g[:m_end] = -gdot(inputs.T*prec, h1).ravel()
    g[:m_end] += gdot(v_sampled.T*prec, h2).ravel()
    g[:m_end] *= 1./n
    g[:m_end] += self.l2*params[:m_end]
    g[m_end:m_end+H] = -h1.sum(axis=0)
    g[m_end:m_end+H] += h2.sum(axis=0)
    g[m_end:m_end+H] *= 1./n
    g[-2*V:-V] = -inputs.sum(axis=0)
    g[-2*V:-V] += v_sampled.sum(axis=0)
    g[-2*V:-V] *= 1./n
    # visible bias gradient is scaled by the precision squared
    g[-2*V:-V] *= (prec**2).T
    #print gsum(g[:m_end]**2), gsum(g[m_end:m_end+H]**2), gsum(g[-2*V:-V]**2)
    # Gradient for square root of precision
    g[-V:] = -gsum(2*prec.T*inputs*(params[-2*V:-V] - inputs/2), axis=0) + gsum(gdot(inputs.T, h1)*wm, axis=1)
    g[-V:] += (gsum(2*prec.T*v_sampled*(params[-2*V:-V] - v_sampled/2), axis=0) + gsum(gdot(v_sampled.T, h2)*wm, axis=1))
    g[-V:] *= 1./n
    #print gsum(g[-V:]**2)
    if self.lmbd > 0.:
        # exponential moving average of the hidden activation rates
        if self.rho_hat is None:
            self.rho_hat = h1.mean(axis=0)
        else:
            self.rho_hat *= 0.9
            self.rho_hat += 0.1 * h1.mean(axis=0)
        dKL_drho_hat = (self.rho - self.rho_hat)/(self.rho_hat*(1-self.rho_hat))
        h1_1mh1 = h1*(1 - h1)
        g[m_end:m_end+H] -= self.lmbd/n * gsum(h1_1mh1, axis=0) * dKL_drho_hat
        g[:m_end] -= self.lmbd/n * (gdot(inputs.T * prec, h1_1mh1) * dKL_drho_hat).ravel()
    #g[:] = -g[:]
    return g
def pt_grad(self, params, inputs, targets, l2=0, **kwargs):
    """Gradient of self.score for a supervised affine layer.

    The error signal is scaled by self.C, and the raw weight
    parameters are added to the weight gradient (an l2-style decay
    with implicit coefficient 1). NOTE(review): the l2 keyword is
    accepted but unused -- confirm the decay term is intended as is.
    """
    g = gzeros(params.shape)
    Z = self.activ(gpu.dot(inputs, params[:self.m_end].reshape(self.shape)) + params[self.m_end:])
    _, delta = self.score(Z, targets, error=True)
    # necessary?
    delta = self.C * delta
    g[:self.m_end] = gdot(inputs.T, delta).ravel() + params[:self.m_end]
    g[self.m_end:] = delta.sum(axis=0)
    # clean up
    del delta
    return g
def grad(self, params, inputs, targets, **kwargs):
    """Backprop gradient of the score through the whole layer stack.

    The forward pass reads self.params; the backward pass slices the
    given params and writes each layer's gradient into g in place.
    """
    out = inputs
    fwd_bounds = izip(self.cuts[:-1], self.cuts[1:])
    for layer, (lo, hi) in izip(self, fwd_bounds):
        out = layer.fprop(self.params[lo:hi], out)
    _, delta = self._score(out, targets, error=True)
    g = gzeros(self.psize)
    bwd_bounds = izip(self.cuts[-2::-1], self.cuts[:0:-1])
    for layer, (lo, hi) in izip(self[::-1], bwd_bounds):
        delta = layer.bprop(params=params[lo:hi], grad=g[lo:hi], delta=delta)
    return g
def pt_init(self, score=None, init_var=1e-2, init_bias=0., l2=0., SI=15, **kwargs):
    """Initialize autoencoder pretraining parameters.

    Flat layout: weights (m_end entries) followed by a bias block of
    self.shape[1] + self.shape[0] entries. Stores score and l2 on
    self and returns the fresh parameter array.
    """
    theta = gzeros(self.m_end + self.shape[1] + self.shape[0])
    if init_var is None:
        # sparse initialization with SI nonzero entries
        theta[:self.m_end] = gpu.garray(init_SI(self.shape, sparsity=SI)).ravel()
    else:
        theta[:self.m_end] = init_var * gpu.randn(self.m_end)
    theta[self.m_end:] = init_bias
    self.score = score
    self.l2 = l2
    return theta
def pt_grad(self, params, inputs, targets, l2=0, **kwargs):
    """Gradient of self.score for a single affine layer with
    activation self.activ; returns a flat gradient matching params.
    """
    grad = gzeros(params.shape)
    pre = gpu.dot(inputs, params[:self.m_end].reshape(self.shape)) + params[self.m_end:]
    out = self.activ(pre)
    _, err = self.score(out, targets, error=True)
    grad[:self.m_end] = gdot(inputs.T, err).ravel()
    grad[self.m_end:] = err.sum(axis=0)
    # release the GPU buffer right away
    del err
    return grad
def pt_init(self, score=None, init_var=1e-2, init_bias=0., l2=0., SI=15, **kwargs):
    """Initialize pretraining parameters for this layer.

    Flat layout: weights (m_end entries) followed by a bias block of
    self.shape[0] entries. Stores score and l2 on self.
    """
    theta = gzeros(self.m_end + self.shape[0])
    if init_var is None:
        # sparse initialization with SI nonzero entries
        theta[:self.m_end] = gpu.garray(init_SI(self.shape, sparsity=SI)).ravel()
    else:
        theta[:self.m_end] = init_var * gpu.randn(self.m_end)
    theta[self.m_end:] = init_bias
    self.score = score
    self.l2 = l2
    return theta
def grad_cd1(self, params, inputs, **kwargs):
    """CD-1 gradient for an RBM with l2 weight decay and an always-on
    KL sparsity term on the hidden activations.

    params layout: weights [0:m_end], hidden biases [m_end:-V],
    visible biases [-V:]. Signs are flipped so a minimizer moves in
    the CD direction.
    """
    g = gzeros(params.shape)
    n, _ = inputs.shape
    m_end = self.m_end
    V = self.shape[0]
    # H is computed for symmetry but unused below (slices use -V)
    H = self.shape[1]
    wm = params[:m_end].reshape(self.shape)
    # positive phase, one Gibbs step, negative phase
    h1, h_sampled = self.H(inputs, wm=wm, bias=params[m_end:-V], sampling=True)
    v2, _ = self.V(h_sampled, wm=wm.T, bias=params[-V:])
    h2, _ = self.H(v2, wm=wm, bias=params[m_end:-V])
    # Note the negative sign: the gradient is
    # supposed to point into 'wrong' direction,
    # because the used optimizer likes to minimize.
    g[:m_end] = -gdot(inputs.T, h1).ravel()
    g[:m_end] += gdot(v2.T, h2).ravel()
    g[:m_end] *= 1. / n
    g[:m_end] += self.l2 * params[:m_end]
    g[m_end:-V] = -h1.mean(axis=0)
    g[m_end:-V] += h2.mean(axis=0)
    g[-V:] = -inputs.mean(axis=0)
    g[-V:] += v2.mean(axis=0)
    # exponential moving average of the hidden activation rates
    if self.rho_hat is None:
        self.rho_hat = h1.mean(axis=0)
    else:
        self.rho_hat *= 0.9
        self.rho_hat += 0.1 * h1.mean(axis=0)
    dKL_drho_hat = (self.rho - self.rho_hat) / (self.rho_hat * (1 - self.rho_hat))
    h1_1mh1 = h1 * (1 - h1)
    # sparsity contribution to hidden bias and weight gradients
    g[m_end:-V] -= self.lmbd / n * gsum(h1_1mh1, axis=0) * dKL_drho_hat
    g[:m_end] -= self.lmbd / n * (gdot(inputs.T, h1_1mh1) * dKL_drho_hat).ravel()
    return g
def pt_grad(self, params, inpts, **kwargs):
    """Gradient for a tied-weight autoencoder whose activation takes
    an extra parameter self.theta.

    No hidden bias appears in this layout: weights [0:m_end],
    visible biases [-shape[0]:].
    """
    g = gzeros(params.shape)
    hddn = self.activ(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)), self.theta)
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    _, delta = self.score(Z, inpts, error=True)
    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)
    # backprop through the encoder via the activation derivative
    dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    # clean up
    del delta
    return g
def pt_init(self, init_var=1e-2, init_bias=0., avg_nxyf=0.1, avg_nfh=0.1, rho=0.5, lmbd=0., l2=0., **kwargs):
    """Initialize parameters for 3-way factored pretraining.

    Only the filter/weight part (up to self._cum_xyh) gets gaussian
    noise; the remaining entries stay zero. NOTE: init_bias is
    accepted but not used here.
    """
    theta = gzeros(self.size + self.shape[0][0] + self.shape[0][1])
    theta[:self._cum_xyh] = init_var * gpu.randn(self._cum_xyh)
    # wire up pretraining score/gradient and running norm averages
    self.pt_score = self.reconstruction
    self.pt_grad = self.cd1_3way_grad
    self.avg_nxyf = avg_nxyf
    self.avg_nfh = avg_nfh
    self.l2 = l2
    self.rho = rho
    self.lmbd = lmbd
    self.rho_hat = None
    return theta
def pretrain(self, schedule):
    """Layer-wise pretrain (delegated to super), then unroll the
    stack into an explicit encoder/decoder pair.

    Computes parameter cut points for the unrolled model (self.enc,
    self.dec), reallocates self.params, copies the pretrained encoder
    parameters back in, builds decoder layers as transposes of the
    encoder layers (seeding decoder weights from the transposed,
    pretrained encoder weights when schedule["pretrained"] is set),
    and shifts decoder activations so the last decoder is identity.
    """
    super(DAE, self).pretrain(schedule=schedule)
    p = self.params.as_numpy_array()
    pretrained = schedule["pretrained"]
    # How many parameters in the unrolled model?
    _dec = []
    _enc = [0]
    self.psize = 0
    for layer in self:
        # encoder block: weights + hidden bias; decoder: weights + visible bias
        _enc.append(layer.shape[0] * layer.shape[1] + layer.shape[1])
        _dec.append(layer.shape[0] * layer.shape[1] + layer.shape[0])
        self.psize += _enc[-1] + _dec[-1]
    self.enc = np.cumsum(_enc)
    _dec.append(0)
    _dec.reverse()
    self.dec = np.cumsum(_dec) + self.enc[-1]
    # Build up encoder and decoder
    self.encoder = []
    self.params = gzeros(self.psize)
    for layer, (c1, c2) in izip(self, izip(self.enc[:-1], self.enc[1:])):
        self.encoder.append(layer)
        self.params[c1:c2] = p[c1:c2]
        layer.p = self.params[c1:c2]
    self.decoder = []
    for layer, (c1, c2) in izip(self[-1::-1], izip(self.dec[:-1], self.dec[1:])):
        l = layer.transpose(self.params[c1:c2])
        if pretrained:
            # seed decoder weights with the transposed encoder weights
            l.p[:l.m_end] = layer.p[:layer.m_end].reshape(layer.shape).T.ravel()
        self.decoder.append(l)
    # Fix missing activations of decoder
    for i, layer in enumerate(self[-2::-1]):
        self.decoder[i].activ = layer.activ
    self.decoder[-1].activ = idnty
    msg = {"msg": "DAE unrolled: %s" % self}
    munk.taggify(self.logging, "pretty").send(msg)
def pt_grad(self, params, noisy_inpts, targets, l2=0., **kwargs):
    """Gradient for a denoising autoencoder step: encode the
    corrupted inputs, reconstruct with tied weights, and score
    against the clean targets. (l2 is accepted but unused here.)

    params layout: weights [0:m_end], hidden biases
    [m_end:m_end+shape[1]], visible biases [-shape[0]:].
    """
    g = gzeros(params.shape)
    hddn = self.activ(gpu.dot(noisy_inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.m_end+self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    _, delta = self.score(Z, targets, error=True)
    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)
    # backprop through the encoder via the activation derivative
    dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)
    g[:self.m_end] += gdot(noisy_inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)
    # clean up
    del delta
    return g
def pt_grad(self, params, inpts, **kwargs):
    """Gradient of self.score for an untied-weight autoencoder.

    params layout: encoder weights [0:m_end], hidden biases
    [m_end:size], decoder weights [size:-shape[0]] (shape Tshape),
    visible biases [-shape[0]:].
    Returns a flat gradient of the same shape as params.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape
    hddn = self.activ(gdot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) + params[-self.shape[0]:]
    _, delta = self.score(Z, inpts, error=True)
    # Decoder weight gradient. Fix: the original indexed with
    # `self.end`, an attribute that is never defined; the decoder
    # block starts at self.size (see Z above).
    g[self.size:-self.shape[0]] = gdot(hddn.T, delta).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)
    diff = diff_table[self.activ](hddn)
    # NOTE(review): delta is backpropagated through the *encoder*
    # weights although decoding has its own weights -- confirm this
    # tied-style backprop is intentional.
    dsc_dha = diff * gdot(delta, params[:self.m_end].reshape(self.shape))
    g[:self.m_end] = gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:self.size] = dsc_dha.sum(axis=0)
    # clean up
    del delta, hddn, Z
    return g
def grad_cd1(self, params, inputs, **kwargs):
    """CD-1 gradient for an RBM with l2 weight decay and a KL
    sparsity term on hidden activations (always applied).

    params layout: weights [0:m_end], hidden biases [m_end:-V],
    visible biases [-V:]. Signs are flipped for a minimizer.
    """
    g = gzeros(params.shape)
    n, _ = inputs.shape
    m_end = self.m_end
    V = self.shape[0]
    # H is assigned for symmetry but unused below (slices use -V)
    H = self.shape[1]
    wm = params[:m_end].reshape(self.shape)
    # positive phase, one Gibbs step, negative phase
    h1, h_sampled = self.H(inputs, wm=wm, bias=params[m_end:-V], sampling=True)
    v2, _ = self.V(h_sampled, wm=wm.T, bias=params[-V:])
    h2, _ = self.H(v2, wm=wm, bias=params[m_end:-V])
    # Note the negative sign: the gradient is
    # supposed to point into 'wrong' direction,
    # because the used optimizer likes to minimize.
    g[:m_end] = -gdot(inputs.T, h1).ravel()
    g[:m_end] += gdot(v2.T, h2).ravel()
    g[:m_end] *= 1./n
    g[:m_end] += self.l2*params[:m_end]
    g[m_end:-V] = -h1.mean(axis=0)
    g[m_end:-V] += h2.mean(axis=0)
    g[-V:] = -inputs.mean(axis=0)
    g[-V:] += v2.mean(axis=0)
    # exponential moving average of the hidden activation rates
    if self.rho_hat is None:
        self.rho_hat = h1.mean(axis=0)
    else:
        self.rho_hat *= 0.9
        self.rho_hat += 0.1 * h1.mean(axis=0)
    dKL_drho_hat = (self.rho - self.rho_hat)/(self.rho_hat*(1-self.rho_hat))
    h1_1mh1 = h1*(1 - h1)
    # sparsity contribution to hidden bias and weight gradients
    g[m_end:-V] -= self.lmbd/n * gsum(h1_1mh1, axis=0) * dKL_drho_hat
    g[:m_end] -= self.lmbd/n * (gdot(inputs.T, h1_1mh1) * dKL_drho_hat).ravel()
    return g
def pretrain(self, schedule):
    """Run layer-wise pretraining via super, then unroll the stack
    into encoder + decoder for fine-tuning.

    Recomputes parameter cut points (self.enc/self.dec) for the
    unrolled model, reallocates self.params, restores the pretrained
    encoder parameters, creates decoder layers as transposes of the
    encoder layers (weights seeded from transposed encoder weights
    when schedule["pretrained"] is set), and reassigns decoder
    activations so the final decoder layer is the identity.
    """
    super(DAE, self).pretrain(schedule=schedule)
    p = self.params.as_numpy_array()
    pretrained = schedule["pretrained"]
    # How many parameters in the unrolled model?
    _dec = []
    _enc = [0]
    self.psize = 0
    for layer in self:
        # encoder: weights + hidden bias; decoder: weights + visible bias
        _enc.append(layer.shape[0] * layer.shape[1] + layer.shape[1])
        _dec.append(layer.shape[0] * layer.shape[1] + layer.shape[0])
        self.psize += _enc[-1] + _dec[-1]
    self.enc = np.cumsum(_enc)
    _dec.append(0)
    _dec.reverse()
    self.dec = np.cumsum(_dec) + self.enc[-1]
    # Build up encoder and decoder
    self.encoder = []
    self.params = gzeros(self.psize)
    for layer, (c1, c2) in izip(self, izip(self.enc[:-1], self.enc[1:])):
        self.encoder.append(layer)
        self.params[c1:c2] = p[c1:c2]
        layer.p = self.params[c1:c2]
    self.decoder = []
    for layer, (c1, c2) in izip(self[-1::-1], izip(self.dec[:-1], self.dec[1:])):
        l = layer.transpose(self.params[c1:c2])
        if pretrained:
            # seed decoder weights with transposed encoder weights
            l.p[:l.m_end] = layer.p[:layer.m_end].reshape(layer.shape).T.ravel()
        self.decoder.append(l)
    # Fix missing activations of decoder
    for i, layer in enumerate(self[-2::-1]):
        self.decoder[i].activ = layer.activ
    self.decoder[-1].activ = idnty
    msg = {"msg": "DAE unrolled: %s" % self}
    munk.taggify(self.logging, "pretty").send(msg)
def pt_init(self, score=None, init_var=1e-2, init_bias=0., **kwargs):
    """Initialize parameters for an untied-weight autoencoder.

    Flat layout: encoder weights [0:m_end], a bias block
    [m_end:size], decoder weights [size:-shape[0]] (also m_end
    entries), and a final bias block [-shape[0]:].
    """
    theta = gzeros(self.size + self.m_end + self.shape[0])
    if init_var is None:
        # symmetric uniform init from the fan-in/fan-out heuristic,
        # applied to encoder and decoder weights separately
        bound = 4 * np.sqrt(6. / (self.shape[0] + self.shape[1]))
        theta[:self.m_end] = (gpu.rand(self.m_end) * 2 - 1) * bound
        theta[self.size:-self.shape[0]] = (gpu.rand(self.m_end) * 2 - 1) * bound
    else:
        theta[:self.m_end] = init_var * gpu.randn(self.m_end)
        theta[self.size:-self.shape[0]] = init_var * gpu.randn(self.m_end)
    theta[self.m_end:self.size] = init_bias
    theta[-self.shape[0]:] = init_bias
    self.score = score
    return theta
def pt_grad(self, params, inpts, **kwargs):
    """Gradient for a tied-weight autoencoder whose activation takes
    an extra argument self.theta (no hidden bias in this layout).

    params layout: weights [0:m_end], visible biases [-shape[0]:].
    """
    g = gzeros(params.shape)
    hddn = self.activ(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)), self.theta)
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    _, delta = self.score(Z, inpts, error=True)
    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)
    # encoder contribution via the activation derivative
    dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    # clean up
    del delta
    return g
def pt_grad(self, params, inpts, **kwargs):
    """Gradient for a k-sparse tied-weight autoencoder: per-example
    thresholding of the hidden activations before reconstruction.

    params layout: weights [0:m_end], hidden biases
    [m_end:m_end+shape[1]], visible biases [-shape[0]:].
    """
    g = gzeros(params.shape)
    hddn = self.activ(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.m_end+self.shape[1]])
    # selection runs on the CPU via a per-row argsort
    _hddn = hddn.as_numpy_array()
    idxs = np.argsort(_hddn, axis=1)
    # Zeroes entries at ascending-sorted positions self.ak onward.
    # NOTE(review): with an ascending argsort this zeroes the LARGEST
    # activations, keeping the self.ak smallest -- confirm direction.
    _hddn[range(_hddn.shape[0]), idxs[:, self.ak:].T] = 0
    hddn = gpu.garray(_hddn)
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    _, delta = self.score(Z, inpts, error=True)
    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)
    # backprop through the encoder via the activation derivative
    dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)
    # clean up
    del delta
    return g
def pt_init(self, init_var=1e-2, init_bias=0., rho=0.5, lmbd=0., l2=0., SI=15, **kwargs):
    """Initialize pretraining parameters, including a trailing block
    of self.shape[0] entries set to 1. (the original comment labels
    these precision parameters). Hooks up pt_score/pt_grad and stores
    the hyperparameters on self; returns the fresh parameter array.
    """
    visible = self.shape[0]
    # weights + biases + one extra block of size shape[0]
    theta = gzeros(self.m_end + self.shape[1] + 2 * visible)
    if init_var is None:
        # sparse initialization: SI nonzero entries
        theta[:self.m_end] = gpu.garray(init_SI(self.shape, sparsity=SI)).ravel()
    else:
        theta[:self.m_end] = init_var * gpu.randn(self.m_end)
    theta[self.m_end:-visible] = init_bias
    theta[-visible:] = 1.
    self.pt_score = self.reconstruction
    self.pt_grad = self.grad_cd1
    self.l2 = l2
    self.rho = rho
    self.lmbd = lmbd
    self.rho_hat = None
    return theta
def __init__(self, ind, schedule):
    """Build the layer stack described by schedule["stack"].

    ind: input dimensionality of the first layer; each layer's unit
    count becomes the next layer's input size. Allocates one flat
    parameter array (self.params) and hands each layer a view into it
    via the cumulative cut points in self.cuts.
    """
    gpu.seed_rand(seed=None)
    self.logging = schedule["logging"]
    self.psize = 0
    cuts = [0]
    self.stack = schedule["stack"]
    for layer in self.stack:
        ltype = layer["type"]
        units = layer["units"]
        # instantiate the layer type, passing its spec dict as kwargs
        l = ltype.__new__(ltype)
        l.__init__(shape=(ind, units), **layer)
        self.psize += l.size
        self.append(l)
        cuts.append(l.size)
        ind = units
    self.params = gzeros(self.psize)
    self.cuts = np.cumsum(cuts)
    # give each layer a view into the shared parameter array
    for layer, (c1, c2) in izip(self, izip(self.cuts[:-1], self.cuts[1:])):
        layer.p = self.params[c1:c2]
    if "score" in schedule:
        self._score = schedule["score"]
    else:
        print("You may have a problem: _score_ is NONE")
        self._score = None
def grad_cd1(self, params, inputs, **kwargs):
    """CD-1 gradient for a Gaussian-visible RBM with learned
    square-root precision parameters.

    params layout: weights [0:m_end], hidden biases [m_end:m_end+H],
    visible biases [-2V:-V], sqrt-precisions [-V:]. Signs are flipped
    so a minimizer moves in the CD direction; adds l2 weight decay
    and, when self.lmbd > 0, a KL sparsity term.
    """
    g = gzeros(params.shape)
    n, _ = inputs.shape
    m_end = self.m_end
    V = self.shape[0]
    H = self.shape[1]
    wm = params[:m_end].reshape(self.shape)
    # column vector of sqrt-precisions, one per visible unit
    prec = params[-V:][:, gpu.newaxis]
    # positive phase with precision-scaled weights
    h1, h_sampled = self.H(inputs, wm=prec * wm, bias=params[m_end:m_end + H], sampling=True)
    # negative phase: gaussian reconstruction, then hidden once more
    v2, v_sampled = gauss(h_sampled, wm=(wm / prec).T, bias=params[-(2 * V):-V], prec=prec.T, sampling=True)
    h2, _ = self.H(v2, wm=prec * wm, bias=params[m_end:m_end + H])
    #print h1[0,0], h_sampled[0,0], v2[0,0], v_sampled[0,0]
    # Note the negative sign: the gradient is
    # supposed to point into 'wrong' direction.
    g[:m_end] = -gdot(inputs.T * prec, h1).ravel()
    g[:m_end] += gdot(v_sampled.T * prec, h2).ravel()
    g[:m_end] *= 1. / n
    g[:m_end] += self.l2 * params[:m_end]
    g[m_end:m_end + H] = -h1.sum(axis=0)
    g[m_end:m_end + H] += h2.sum(axis=0)
    g[m_end:m_end + H] *= 1. / n
    g[-2 * V:-V] = -inputs.sum(axis=0)
    g[-2 * V:-V] += v_sampled.sum(axis=0)
    g[-2 * V:-V] *= 1. / n
    # visible bias gradient is scaled by the precision squared
    g[-2 * V:-V] *= (prec**2).T
    #print gsum(g[:m_end]**2), gsum(g[m_end:m_end+H]**2), gsum(g[-2*V:-V]**2)
    # Gradient for square root of precision
    g[-V:] = -gsum(2 * prec.T * inputs * (params[-2 * V:-V] - inputs / 2), axis=0) + gsum(gdot(inputs.T, h1) * wm, axis=1)
    g[-V:] += (gsum(2 * prec.T * v_sampled * (params[-2 * V:-V] - v_sampled / 2), axis=0) + gsum(gdot(v_sampled.T, h2) * wm, axis=1))
    g[-V:] *= 1. / n
    #print gsum(g[-V:]**2)
    if self.lmbd > 0.:
        # exponential moving average of the hidden activation rates
        if self.rho_hat is None:
            self.rho_hat = h1.mean(axis=0)
        else:
            self.rho_hat *= 0.9
            self.rho_hat += 0.1 * h1.mean(axis=0)
        dKL_drho_hat = (self.rho - self.rho_hat) / (self.rho_hat * (1 - self.rho_hat))
        h1_1mh1 = h1 * (1 - h1)
        g[m_end:m_end + H] -= self.lmbd / n * gsum(h1_1mh1, axis=0) * dKL_drho_hat
        g[:m_end] -= self.lmbd / n * (gdot(inputs.T * prec, h1_1mh1) * dKL_drho_hat).ravel()
    #g[:] = -g[:]
    return g
def cd1_3way_grad(self, params, inputs, **kwargs):
    """CD-1 gradient for a 3-way factored model (inputs is a pair
    (x, y)).

    Weight columns are renormalized toward running average norms
    (self.avg_nxyf / self.avg_nfh) before the CD step. The negative
    phase reconstructs x and y through the factors in a randomly
    chosen order. Positive-phase terms carry a negative sign so the
    optimizer minimizes.
    """
    g = gzeros(params.shape)
    x, y = inputs
    n, _ = x.shape
    #print self.avg_nxyf, self.avg_nfh
    # parameter views: three factored weight blocks plus three biases
    weights_xf = params[:self.xf_sz].reshape(self.xfshape)
    weights_yf = params[self.xf_sz:self._cum_xy].reshape(self.yfshape)
    weights_fh = params[self._cum_xy:self._cum_xyh].reshape(self.fhshape)
    bias_h = params[self._cum_xyh:self.size]
    bias_x = params[self.size:-self.shape[0][1]]
    bias_y = params[-self.shape[0][1]:]
    # normalize weights
    sq_xf = weights_xf * weights_xf
    norm_xf = gpu.sqrt(sq_xf.sum(axis=0)) + SMALL
    sq_yf = weights_yf * weights_yf
    norm_yf = gpu.sqrt(sq_yf.sum(axis=0)) + SMALL
    norm_xyf = (norm_xf.mean() + norm_yf.mean())/2.
    # exponential moving average of the shared x/y filter norm
    self.avg_nxyf *= 0.95
    self.avg_nxyf += (0.05 * norm_xyf)
    weights_xf *= (self.avg_nxyf / norm_xf)
    weights_yf *= (self.avg_nxyf / norm_yf)
    sq_fh = weights_fh*weights_fh
    norm_fh = gpu.sqrt(sq_fh.sum(axis=1)) + SMALL
    self.avg_nfh *= 0.95
    self.avg_nfh += (0.05 * norm_fh.mean())
    weights_fh *= (self.avg_nfh / norm_fh[:, gpu.newaxis])
    # normalization done
    # positive phase: factor activities and hidden sample
    factors_x = gdot(x, weights_xf)
    factors_y = gdot(y, weights_yf)
    factors = factors_x * factors_y
    h, h_sampled = bernoulli(factors, wm=weights_fh, bias=bias_h, sampling=True)
    factors_h = gdot(h_sampled, weights_fh.T)
    g[:self.xf_sz] = -gdot(x.T, factors_y*factors_h).ravel()
    g[self.xf_sz:self._cum_xy] = -gdot(y.T, factors_x*factors_h).ravel()
    g[self._cum_xy:self._cum_xyh] = -gdot(factors.T, h_sampled).ravel()
    g[self._cum_xyh:self.size] = -h.sum(axis=0)
    g[self.size:-self.shape[0][1]] = -x.sum(axis=0)
    g[-self.shape[0][1]:] = -y.sum(axis=0)
    # 3way cd
    # pick the reconstruction order at random
    way = np.random.rand() > 0.5
    if way:
        # reconstruct y (output) first.
        tmp = factors_x * factors_h
        y1, _ = self.V(tmp, wm=weights_yf.T, bias=bias_y)
        factors_y[:] = gdot(y1, weights_yf)
        # then reconstruct x (input).
        tmp = factors_y * factors_h
        x1, _ = self.V(tmp, wm=weights_xf.T, bias=bias_x)
        factors_x[:] = gdot(x1, weights_xf)
    else:
        # reconstruct x (input) first.
        tmp = factors_y * factors_h
        x1, _ = self.V(tmp, wm=weights_xf.T, bias=bias_x)
        factors_x[:] = gdot(x1, weights_xf)
        # then reconstruct y (output).
        tmp = factors_x * factors_h
        y1, _ = self.V(tmp, wm=weights_yf.T, bias=bias_y)
        factors_y[:] = gdot(y1, weights_yf)
    # negative phase with the reconstructed factors
    factors[:] = factors_x * factors_y
    h1, _ = bernoulli(factors, wm=weights_fh, bias=bias_h)
    factors_h[:] = gdot(h1, weights_fh.T)
    g[:self.xf_sz] += gdot(x1.T, factors_y*factors_h).ravel()
    g[:self.xf_sz] *= 1./n
    g[self.xf_sz:self._cum_xy] += gdot(y1.T, factors_x*factors_h).ravel()
    g[self.xf_sz:self._cum_xy] *= 1./n
    g[self._cum_xy:self._cum_xyh] += gdot(factors.T, h1).ravel()
    g[self._cum_xy:self._cum_xyh] *= 1./n
    g[self._cum_xyh:self.size] += h1.sum(axis=0)
    g[self._cum_xyh:self.size] *= 1./n
    g[self.size:-self.shape[0][1]] += x1.sum(axis=0)
    g[self.size:-self.shape[0][1]] *= 1./n
    g[-self.shape[0][1]:] += y1.sum(axis=0)
    g[-self.shape[0][1]:] *= 1./n
    return g