def test_vector_to_conv_c01b_invertible():
    """
    Tests that the format_as methods between Conv2DSpace
    and VectorSpace are invertible for the ('c', 0, 1, 'b')
    axis format.
    """
    rng = np.random.RandomState([2013, 5, 1])
    batch_size = 3
    rows = 4
    cols = 5
    channels = 2

    conv = Conv2DSpace([rows, cols], channels=channels, axes=('c', 0, 1, 'b'))
    vec = VectorSpace(conv.get_total_dimension())

    X = conv.make_batch_theano()
    Y = conv.format_as(X, vec)
    Z = vec.format_as(Y, conv)

    A = vec.make_batch_theano()
    B = vec.format_as(A, conv)
    C = conv.format_as(B, vec)

    f = function([X, A], [Z, C])

    X = rng.randn(*(conv.get_origin_batch(batch_size).shape)).astype(X.dtype)
    A = rng.randn(*(vec.get_origin_batch(batch_size).shape)).astype(A.dtype)

    Z, C = f(X, A)

    np.testing.assert_allclose(Z, X)
    np.testing.assert_allclose(C, A)
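
# A minimal numpy sketch of the round trip the test above exercises,
# assuming format_as lays out a ('c', 0, 1, 'b') batch by moving the
# batch axis first and then flattening each example. This is purely
# illustrative; the real axis bookkeeping lives in Conv2DSpace.format_as,
# and the helper name below is hypothetical.
def _c01b_roundtrip_sketch():
    rng = np.random.RandomState(0)
    c, r, co, b = 2, 4, 5, 3
    topo = rng.randn(c, r, co, b)
    # conv -> vector: put the batch axis first, then flatten each example
    flat = topo.transpose(3, 0, 1, 2).reshape(b, c * r * co)
    # vector -> conv: invert the reshape, then restore the original axes
    back = flat.reshape(b, c, r, co).transpose(1, 2, 3, 0)
    np.testing.assert_allclose(back, topo)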
class BinaryVectorMaxPool(HiddenLayer):
    """
    A hidden layer that does max-pooling on binary vectors.
    It has two sublayers, the detector layer and the pooling layer.
    The detector layer is its downward state and the pooling layer
    is its upward state.

    TODO: this layer uses (pooled, detector) as its total state,
          which can be confusing when listing all the states in
          the network left to right. Change this and
          pylearn2.expr.probabilistic_max_pooling to use
          (detector, pooled)
    """

    def __init__(self,
                 detector_layer_dim,
                 pool_size,
                 layer_name,
                 irange=None,
                 sparse_init=None,
                 include_prob=1.0,
                 init_bias=0.):
        """
        include_prob: probability of including a weight element in the set
                      of weights initialized to U(-irange, irange). If not
                      included it is initialized to 0.
        """
        self.__dict__.update(locals())
        del self.self

        self.b = sharedX(np.zeros((self.detector_layer_dim,)) + init_bias,
                         name=layer_name + '_b')

    def set_input_space(self, space):
        """ Note: this resets parameters! """

        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        if self.detector_layer_dim % self.pool_size != 0:
            raise ValueError("detector_layer_dim = %d, pool_size = %d. "
                             "Should be divisible but remainder is %d" %
                             (self.detector_layer_dim,
                              self.pool_size,
                              self.detector_layer_dim % self.pool_size))

        self.h_space = VectorSpace(self.detector_layer_dim)
        self.pool_layer_dim = self.detector_layer_dim / self.pool_size
        self.output_space = VectorSpace(self.pool_layer_dim)

        rng = self.dbm.rng

        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.detector_layer_dim)) * \
                (rng.uniform(0., 1.,
                             (self.input_dim, self.detector_layer_dim))
                 < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.detector_layer_dim))
            for i in xrange(self.detector_layer_dim):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    def get_total_state_space(self):
        return CompositeSpace((self.output_space, self.h_space))

    def get_params(self):
        assert self.b.name is not None
        W, = self.transformer.get_params()
        assert W.name is not None
        return self.transformer.get_params().union([self.b])

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    def get_weights(self):
        if self.requires_reformat:
            # This is not really an unimplemented case.
            # We actually don't know how to format the weights
            # in design space. We got the data in topo space
            # and we don't have access to the dataset
            raise NotImplementedError()
        W, = self.transformer.get_params()
        return W.get_value()

    def set_weights(self, weights):
        W, = self.transformer.get_params()
        W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def get_weights_view_shape(self):
        total = self.detector_layer_dim
        cols = self.pool_size
        if cols == 1:
            # Let the PatchViewer decide how to arrange the units
            # when they're not pooled
            raise NotImplementedError()
        # When they are pooled, make each pooling unit have one row
        rows = total / cols
        return rows, cols

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        W, = self.transformer.get_params()
        W = W.T
        W = W.reshape((self.detector_layer_dim,
                       self.input_space.shape[0],
                       self.input_space.shape[1],
                       self.input_space.nchannels))
        W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))
        return function([], W)()

    def upward_state(self, total_state):
        p, h = total_state
        self.h_space.validate(h)
        self.output_space.validate(p)
        return p

    def downward_state(self, total_state):
        p, h = total_state
        return h

    def get_monitoring_channels_from_state(self, state):

        P, H = state

        rval = {}

        if self.pool_size == 1:
            vars_and_prefixes = [(P, '')]
        else:
            vars_and_prefixes = [(P, 'p_'), (H, 'h_')]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            for key, val in [('max_max', v_max.max()),
                             ('max_mean', v_max.mean()),
                             ('max_min', v_max.min()),
                             ('min_max', v_min.max()),
                             ('min_mean', v_min.mean()),
                             ('min_min', v_min.min()),
                             ('range_max', v_range.max()),
                             ('range_mean', v_range.mean()),
                             ('range_min', v_range.min()),
                             ('mean_max', v_mean.max()),
                             ('mean_mean', v_mean.mean()),
                             ('mean_min', v_mean.min())]:
                rval[prefix + key] = val

        return rval

    def get_l1_act_cost(self, state, target, coeff, eps=None):
        rval = 0.

        P, H = state
        self.output_space.validate(P)
        self.h_space.validate(H)

        if self.pool_size == 1:
            # If the pool size is 1 then pools = detectors
            # and we should not penalize pools and detectors separately
            assert len(state) == 2
            assert isinstance(target, float)
            assert isinstance(coeff, float)
            _, state = state
            state = [state]
            target = [target]
            coeff = [coeff]
            if eps is None:
                eps = [0.]
            else:
                eps = [eps]
        else:
            assert all([len(elem) == 2 for elem in [state, target, coeff]])
            if eps is None:
                eps = [0., 0.]
            if target[1] < target[0]:
                warnings.warn("Do you really want to regularize the "
                              "detector units to be sparser than the "
                              "pooling units?")

        for s, t, c, e in safe_zip(state, target, coeff, eps):
            assert all([isinstance(elem, float) for elem in [t, c, e]])
            if c == 0.:
                continue
            m = s.mean(axis=0)
            assert m.ndim == 1
            rval += T.maximum(abs(m - t) - e, 0.).mean() * c

        return rval

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):
        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to "
                             "None so that it may appear after "
                             "layer_above / state_above in the list.")

        if state_above is not None:
            msg = layer_above.downward_message(state_above)
        else:
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        z = self.transformer.lmul(state_below) + self.b
        p, h, p_sample, h_sample = max_pool_channels(z, self.pool_size,
                                                     msg, theano_rng)

        return p_sample, h_sample

    def downward_message(self, downward_state):
        rval = self.transformer.lmul_T(downward_state)

        if self.requires_reformat:
            rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def make_state(self, num_examples, numpy_rng):
        """
        Returns a shared variable containing an actual state
        (not a mean field state) for this variable.
        """

        t1 = time.time()

        empty_input = self.h_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        p_exp, h_exp, p_sample, h_sample = max_pool_channels(
            z=default_z,
            pool_size=self.pool_size,
            theano_rng=theano_rng)

        assert h_sample.dtype == default_z.dtype

        p_state = sharedX(self.output_space.get_origin_batch(num_examples))

        t2 = time.time()

        f = function([], updates={p_state: p_sample, h_state: h_sample})

        t3 = time.time()

        f()

        t4 = time.time()

        print str(self) + '.make_state took', t4 - t1
        print '\tcompose time:', t2 - t1
        print '\tcompile time:', t3 - t2
        print '\texecute time:', t4 - t3

        p_state.name = 'p_sample_shared'
        h_state.name = 'h_sample_shared'

        return p_state, h_state

    def expected_energy_term(self, state, average,
                             state_below, average_below):

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but "
                                         "got shape of %d" %
                                         (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        downward_state = self.downward_state(state)
        self.h_space.validate(downward_state)

        # The energy function is linear, so it doesn't matter whether we
        # are averaging or not. Specifically, our terms are -u^T W d - b^T d,
        # where u is the upward state of the layer below and d is the
        # downward state of this layer.
        bias_term = T.dot(downward_state, self.b)
        weights_term = (self.transformer.lmul(state_below) *
                        downward_state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval

    def mf_update(self, state_below, state_above, layer_above=None,
                  double_weights=False, iter_name=None):

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but "
                                         "got shape of %d" %
                                         (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
            msg.name = ('msg_from_' + layer_above.layer_name + '_to_' +
                        self.layer_name + '[' + iter_name + ']')
        else:
            msg = None

        if double_weights:
            state_below = 2. * state_below
            state_below.name = self.layer_name + '_' + iter_name + '_2state'

        z = self.transformer.lmul(state_below) + self.b
        if self.layer_name is not None and iter_name is not None:
            z.name = self.layer_name + '_' + iter_name + '_z'

        p, h = max_pool_channels(z, self.pool_size, msg)

        p.name = self.layer_name + '_p_' + iter_name
        h.name = self.layer_name + '_h_' + iter_name

        return p, h
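
# A minimal numpy sketch of the per-pool quantity that max_pool_channels
# is assumed to compute, in the style of probabilistic max pooling
# (Lee et al.): each pool of `pool_size` detector units competes in a
# softmax that also includes an "all off" state, and the pooling unit is
# on whenever any detector in its pool is on. This is an illustration
# under that assumption, not the library implementation, and the helper
# name is hypothetical.
def _pool_channel_sketch(z_pool):
    # z_pool: 1-D array of pre-activations for one pool
    m = max(z_pool.max(), 0.)          # shift for numerical stability
    on = np.exp(z_pool - m)            # unnormalized P(h_i = 1)
    off = np.exp(0. - m)               # unnormalized P(all detectors off)
    denom = off + on.sum()
    h = on / denom                     # P(h_i = 1) for each detector unit
    p = 1. - off / denom               # P(p = 1) = P(any detector on)
    return p, h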
class Softmax(HiddenLayer):

    def __init__(self, n_classes, layer_name, irange=None,
                 sparse_init=None, W_lr_scale=None):

        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        self.__dict__.update(locals())
        del self.self

        assert isinstance(n_classes, int)

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes,)), name='softmax_b')

    def get_lr_scalers(self):
        rval = {}

        # Patch old pickle files
        if not hasattr(self, 'W_lr_scale'):
            self.W_lr_scale = None

        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale

        return rval

    def get_total_state_space(self):
        return self.output_space

    def get_monitoring_channels_from_state(self, state):
        mx = state.max(axis=1)
        return {'mean_max_class': mx.mean(),
                'max_max_class': mx.max(),
                'min_max_class': mx.min()}

    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        self.desired_space = VectorSpace(self.input_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.dbm.rng

        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_classes))
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W, 'softmax_W')

        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()
        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):

        if state_above is not None:
            # If you implement this case, also add a unit test for it.
            # Or at least add a warning that it is not tested.
            raise NotImplementedError()

        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to "
                             "None so that it may appear after "
                             "layer_above / state_above in the list.")

        self.input_space.validate(state_below)

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        self.desired_space.validate(state_below)

        z = T.dot(state_below, self.W) + self.b
        h_exp = T.nnet.softmax(z)
        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        return h_sample

    def mf_update(self, state_below, state_above=None, layer_above=None,
                  double_weights=False, iter_name=None):
        if state_above is not None:
            raise NotImplementedError()

        if double_weights:
            raise NotImplementedError()

        self.input_space.validate(state_below)

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        self.desired_space.validate(state_below)

        """
        from pylearn2.utils import serial
        X = serial.load('/u/goodfeli/galatea/dbm/inpaint/expdir/cifar10_N3_interm_2_features.pkl')
        state_below = Verify(X,'features')(state_below)
        """

        assert self.W.ndim == 2
        assert state_below.ndim == 2

        b = self.b

        Z = T.dot(state_below, self.W) + b

        #Z = Print('Z')(Z)

        rval = T.nnet.softmax(Z)

        return rval

    def downward_message(self, downward_state):
        rval = T.dot(downward_state, self.W.T)
        rval = self.desired_space.format_as(rval, self.input_space)
        return rval

    def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale):
        """
        scale is because the visible layer also goes into the cost.
        It uses the mean over units and examples, so that the scale
        of the cost doesn't change too much with batch size or
        example size. We need to multiply this cost by scale to make
        sure that it is put on the same scale as the reconstruction
        cost for the visible units. i.e., scale should be 1/nvis
        """

        Y_hat = Y_hat_unmasked
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        masked = log_prob_of * drop_mask_Y
        assert masked.ndim == 1

        rval = masked.mean() * scale

        return -rval

    def make_state(self, num_examples, numpy_rng):
        """
        Returns a shared variable containing an actual state
        (not a mean field state) for this variable.
        """

        t1 = time.time()

        empty_input = self.output_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        h_exp = T.nnet.softmax(default_z)

        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        p_state = sharedX(self.output_space.get_origin_batch(num_examples))

        t2 = time.time()

        f = function([], updates={h_state: h_sample})

        t3 = time.time()

        f()

        t4 = time.time()

        print str(self) + '.make_state took', t4 - t1
        print '\tcompose time:', t2 - t1
        print '\tcompile time:', t3 - t2
        print '\texecute time:', t4 - t3

        h_state.name = 'softmax_sample_shared'

        return h_state

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        return coeff * T.sqr(self.W).sum()

    def expected_energy_term(self, state, average,
                             state_below, average_below):

        self.input_space.validate(state_below)
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)
        self.desired_space.validate(state_below)

        # The energy function is linear, so it doesn't matter whether we
        # are averaging or not. Specifically, our terms are -u^T W d - b^T d,
        # where u is the upward state of the layer below and d is the
        # downward state of this layer.
        bias_term = T.dot(state, self.b)
        weights_term = (T.dot(state_below, self.W) * state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval