def get_train(self, batchsize=None, testsize=None):
    sx = tt.tensor4()
    sy = tt.ivector()

    yc = self._propup(sx, batchsize, noise=False)
    if 1:
        cost = -tt.log(tt.nnet.softmax(yc))[tt.arange(sy.shape[0]), sy].mean()
    else:
        from hinge import multi_hinge_margin
        cost = multi_hinge_margin(yc, sy).mean()
    error = tt.neq(tt.argmax(yc, axis=1), sy).mean()

    # get updates
    params = self.params
    grads = dict(zip(params, theano.grad(cost, params)))
    updates = collections.OrderedDict()
    for layer in self.layers:
        updates.update(layer.updates(grads))

    train = theano.function([sx, sy], [cost, error], updates=updates)

    # --- make test function
    y_pred = tt.argmax(self._propup(sx, testsize, noise=False), axis=1)
    error = tt.mean(tt.neq(y_pred, sy))
    test = theano.function([sx, sy], error)

    return train, test
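
# Illustrative only: one way the (train, test) functions returned by get_train
# might be driven in a minibatch loop. The names `net`, `train_images`,
# `train_labels`, `test_images`, and `test_labels` are assumptions, and the
# images are assumed to already be shaped as the 4-D arrays expected by the
# tt.tensor4 input `sx`.
def _example_train_loop(net, train_images, train_labels,
                        test_images, test_labels,
                        batch_size=100, n_epochs=10):
    train, test = net.get_train(batchsize=batch_size, testsize=len(test_images))
    n_batches = len(train_images) // batch_size
    epoch_costs = []
    for epoch in range(n_epochs):
        costs = []
        for i in range(n_batches):
            batch = train_images[i * batch_size:(i + 1) * batch_size]
            label = train_labels[i * batch_size:(i + 1) * batch_size].astype('int32')
            cost, err = train(batch, label)
            costs.append(cost)
        epoch_costs.append(np.mean(costs))
    test_error = test(test_images, test_labels.astype('int32'))
    return epoch_costs, test_error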
def train_classifier(self, train, test, n_epochs=30):
    dtype = theano.config.floatX

    # --- find codes
    images, labels = train
    n_labels = len(np.unique(labels))
    codes = self.encode(images.astype(dtype))
    codes = theano.shared(codes.astype(dtype), name='codes')
    labels = tt.cast(theano.shared(labels.astype(dtype), name='labels'), 'int32')

    # --- compute backprop function
    Wshape = (self.autos[-1].n_hid, n_labels)
    x = tt.matrix('x', dtype=dtype)
    y = tt.ivector('y')
    W = tt.matrix('W', dtype=dtype)
    b = tt.vector('b', dtype=dtype)

    W0 = np.random.normal(size=Wshape).astype(dtype).flatten() / 10
    b0 = np.zeros(n_labels)
    split_p = lambda p: [p[:-n_labels].reshape(Wshape), p[-n_labels:]]
    form_p = lambda params: np.hstack([p.flatten() for p in params])

    # # compute negative log likelihood
    # p_y_given_x = tt.nnet.softmax(tt.dot(x, W) + b)
    # y_pred = tt.argmax(p_y_given_x, axis=1)
    # nll = -tt.mean(tt.log(p_y_given_x)[tt.arange(y.shape[0]), y])
    # error = tt.mean(tt.neq(y_pred, y))

    # compute hinge loss
    yc = tt.dot(x, W) + b
    cost = multi_hinge_margin(yc, y).mean()
    error = cost

    # compute gradients
    grads = tt.grad(cost, [W, b])

    f_df = theano.function(
        [W, b], [error] + grads, givens={x: codes, y: labels})

    # --- begin backprop
    def f_df_wrapper(p):
        w, b = split_p(p)
        outs = f_df(w.astype(dtype), b.astype(dtype))
        cost, grad = outs[0], form_p(outs[1:])
        return cost.astype('float64'), grad.astype('float64')

    p0 = form_p([W0, b0])
    p_opt, mincost, info = scipy.optimize.lbfgsb.fmin_l_bfgs_b(
        f_df_wrapper, p0, maxfun=n_epochs, iprint=1)

    self.W, self.b = split_p(p_opt)
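
# A minimal, self-contained sketch of the fmin_l_bfgs_b calling convention used
# above: the objective takes one flat float64 vector and returns (cost, grad),
# with the parameters packed and unpacked the same way split_p/form_p handle
# (W, b). Everything here is illustrative; it is not part of the training code.
def _example_lbfgs_packing():
    Wshape, n_labels = (3, 2), 2
    split_p = lambda p: [p[:-n_labels].reshape(Wshape), p[-n_labels:]]
    form_p = lambda params: np.hstack([p.flatten() for p in params])

    def f_df(p):
        W, b = split_p(p)
        # simple quadratic objective so the gradient is easy to verify
        cost = 0.5 * (W**2).sum() + 0.5 * (b**2).sum()
        grad = form_p([W, b])
        return cost, grad

    p0 = form_p([np.ones(Wshape), np.zeros(n_labels)])
    p_opt, mincost, info = scipy.optimize.fmin_l_bfgs_b(f_df, p0)
    return split_p(p_opt)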
def backprop(self, train_set, test_set, noise=0, shift=False, n_epochs=30):
    dtype = theano.config.floatX

    params = []
    for auto in self.autos:
        params.extend([auto.W, auto.c])

    # --- compute backprop function
    assert self.W is not None and self.b is not None
    W = theano.shared(self.W.astype(dtype), name='Wc')
    b = theano.shared(self.b.astype(dtype), name='bc')

    x = tt.matrix('batch')
    y = tt.ivector('labels')

    # compute coding error
    # p_y_given_x = tt.nnet.softmax(tt.dot(self.propup(x), W) + b)
    # y_pred = tt.argmax(p_y_given_x, axis=1)
    # nll = -tt.mean(tt.log(p_y_given_x)[tt.arange(y.shape[0]), y])
    # error = tt.mean(tt.neq(y_pred, y))

    # compute classification error
    yn = self.propup(x, noise=noise)
    yc = tt.dot(yn, W) + b
    cost = multi_hinge_margin(yc, y).mean()
    error = tt.mean(tt.neq(tt.argmax(yc, axis=1), y))

    # compute gradients
    grads = tt.grad(cost, params)
    f_df = theano.function([x, y], [error] + grads)

    np_params = [param.get_value() for param in params]

    # --- run L_BFGS
    train_images, train_labels = train_set
    train_labels = train_labels.astype('int32')

    def f_df_wrapper(p):
        for param, value in zip(params, split_params(p, np_params)):
            param.set_value(value.astype(param.dtype))

        images = shift_images(train_images, (28, 28)) if shift else train_images
        labels = train_labels

        outs = f_df(images, labels)
        cost, grads = outs[0], outs[1:]
        grad = join_params(grads)
        return cost.astype('float64'), grad.astype('float64')

    p0 = join_params(np_params)
    p_opt, mincost, info = scipy.optimize.lbfgsb.fmin_l_bfgs_b(
        f_df_wrapper, p0, maxfun=n_epochs, iprint=1)

    for param, value in zip(params, split_params(p_opt, np_params)):
        param.set_value(value.astype(param.dtype), borrow=False)
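
# backprop relies on split_params/join_params helpers defined elsewhere in the
# module. The functions below are only a guess at their behavior, based on how
# they are used here (flatten a list of parameter arrays into one vector for
# L-BFGS, and recover arrays with the original shapes); they are given
# underscore-prefixed example names so they do not shadow the real helpers.
def _example_join_params(np_params):
    return np.hstack([p.flatten() for p in np_params])

def _example_split_params(p, np_params):
    arrays, i = [], 0
    for ref in np_params:
        arrays.append(p[i:i + ref.size].reshape(ref.shape))
        i += ref.size
    return arrays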
def compute_loss(self, yc, y):
    if self.loss == 'nll':
        # compute negative log likelihood
        cost = -tt.mean(
            tt.log(tt.nnet.softmax(yc))[tt.arange(y.shape[0]), y])
        error = tt.mean(tt.neq(tt.argmax(yc, axis=1), y))
    elif self.loss == 'hinge':
        # compute hinge loss
        cost = multi_hinge_margin(yc, y).mean()
        error = tt.mean(tt.neq(tt.argmax(yc, axis=1), y))
    else:
        raise ValueError("Unrecognized loss type '%s'" % self.loss)
    return cost, error
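
# Illustrative sketch of how compute_loss plugs into a Theano graph: given the
# pre-softmax class scores `yc` and integer labels `y`, it returns a scalar
# cost to differentiate and a 0/1 classification error to report. The names
# `model`, `W`, and `b` are assumptions: `model` is an instance of the
# surrounding class, and `W`/`b` are assumed to be theano shared variables.
def _example_use_compute_loss(model, W, b):
    x = tt.matrix('x')
    y = tt.ivector('y')
    yc = tt.dot(x, W) + b             # pre-softmax class scores
    cost, error = model.compute_loss(yc, y)
    grads = tt.grad(cost, [W, b])     # differentiate the cost, not the error
    return theano.function([x, y], [cost, error] + grads)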
def sgd(self, train_set, test_set, rate=0.1, tradeoff=0.5,
        n_epochs=30, batch_size=100):
    """Use SGD to do combined autoencoder and classifier training"""
    dtype = theano.config.floatX

    assert 0 <= tradeoff <= 1

    params = []
    for auto in self.autos:
        auto.V = theano.shared(auto.W.get_value(borrow=False).T, name='V')
        params.extend([auto.W, auto.V, auto.c, auto.b])

    # --- compute backprop function
    assert self.W is not None and self.b is not None
    W = theano.shared(self.W.astype(dtype), name='Wc')
    b = theano.shared(self.b.astype(dtype), name='bc')

    x = tt.matrix('batch')
    y = tt.ivector('labels')
    xn = x
    # xn = x + self.theano_rng.normal(size=x.shape, std=0.1, dtype=dtype)
    yn = self.propup(xn, noise=1.0)

    # compute classification error
    # p_y_given_x = tt.nnet.softmax(tt.dot(yn, W) + b)
    # y_pred = tt.argmax(p_y_given_x, axis=1)
    # nll = -tt.mean(tt.log(p_y_given_x)[tt.arange(y.shape[0]), y])
    # class_error = tt.mean(tt.neq(y_pred, y))
    yc = tt.dot(yn, W) + b
    class_cost = multi_hinge_margin(yc, y).mean()
    class_error = tt.mean(tt.neq(tt.argmax(yc, axis=1), y))

    # compute autoencoder error
    z = self.propdown(yn)
    rmses = tt.sqrt(tt.mean((x - z)**2, axis=1))
    auto_cost = tt.mean(rmses)

    cost = (tt.cast(1 - tradeoff, dtype) * auto_cost
            + tt.cast(tradeoff, dtype) * class_cost)
    error = class_error

    # compute gradients
    grads = tt.grad(cost, params)

    updates = collections.OrderedDict()
    for param, grad in zip(params, grads):
        updates[param] = param - tt.cast(rate, dtype) * grad

    for auto in self.autos:
        if auto.mask is not None:
            updates[auto.W] = updates[auto.W] * auto.mask
            updates[auto.V] = updates[auto.V] * auto.mask.T

    train_dbn = theano.function([x, y], error, updates=updates)
    reconstruct = self.reconstruct

    # --- perform SGD
    images, labels = train_set
    ibatches = images.reshape(-1, batch_size, images.shape[1])
    lbatches = labels.reshape(-1, batch_size).astype('int32')
    assert np.isfinite(ibatches).all()

    test_images, test_labels = test_set

    for epoch in range(n_epochs):
        costs = []
        for batch, label in zip(ibatches, lbatches):
            costs.append(train_dbn(batch, label))

        # copy back parameters (for test function)
        self.W = W.get_value()
        self.b = b.get_value()

        print "Epoch %d: %0.3f" % (epoch, np.mean(costs))

        if test_images is not None:
            # plot reconstructions on test set
            plt.figure(2)
            plt.clf()
            recons = reconstruct(test_images)
            show_recons(test_images, recons)
            plt.draw()

            # plot filters for first layer only
            plt.figure(3)
            plt.clf()
            plotting.filters(self.autos[0].filters, rows=10, cols=20)
            plt.draw()
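
# Illustrative only: one way sgd might be called. It assumes MNIST-like data
# where `train_images` has shape (n, 784) with n divisible by the batch size,
# since sgd reshapes the training set directly into fixed-size minibatches.
# All variable names here are assumptions.
def _example_run_sgd(net, train_images, train_labels, test_images, test_labels):
    train_set = (train_images, train_labels)
    test_set = (test_images, test_labels)
    # tradeoff=0.5 weights the reconstruction and classification costs equally
    net.sgd(train_set, test_set, rate=0.1, tradeoff=0.5,
            n_epochs=30, batch_size=100)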