def fprop(self, state_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size: raise ValueError("state_below should have batch size " + str(self.mlp.batch_size) + " but has " + str(value.shape[0])) self.desired_space.validate(state_below) assert state_below.ndim == 2 if not hasattr(self, 'no_affine'): self.no_affine = False if self.no_affine: rval = state_below else: assert self.W.ndim == 2 b = self.b W = self.W rval = T.dot(state_below, W) + b for value in get_debug_values(rval): if self.mlp.batch_size is not None: assert value.shape[0] == self.mlp.batch_size return rval
def fprop(self, state_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size: raise ValueError("state_below should have batch size " + str(self.mlp.batch_size) + " but has " + str(value.shape[0])) self.desired_space.validate(state_below) assert state_below.ndim == 2 assert self.W.ndim == 3 Z = T.tensordot(state_below, self.W, axes=[[1], [0]]) + self.b rval = batched_softmax(Z) for value in get_debug_values(rval): if self.mlp.batch_size is not None: assert value.shape[0] == self.mlp.batch_size return rval
def fprop(self, state_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size: raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0])) self.desired_space.validate(state_below) assert state_below.ndim == 2 if not hasattr(self, 'no_affine'): self.no_affine = False if self.no_affine: rval = state_below else: assert self.W.ndim == 2 b = self.b W = self.W rval = T.dot(state_below, W) + b for value in get_debug_values(rval): if self.mlp.batch_size is not None: assert value.shape[0] == self.mlp.batch_size return rval
def fprop(self, state_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size: raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0])) self.desired_space.validate(state_below) assert state_below.ndim == 2 W = T.dot(self.V, self.U) assert W.ndim == 2 Z = T.dot(state_below, W.T) rval = Z for value in get_debug_values(rval): if self.mlp.batch_size is not None: assert value.shape[0] == self.mlp.batch_size return (rval, state_below)
def entropy_h(self, H_hat): """ .. todo:: WRITEME properly entropy of the hidden layers under the mean field distribution defined by H_hat """ for Hv in get_debug_values(H_hat[0]): assert Hv.min() >= 0.0 assert Hv.max() <= 1.0 total = entropy_binary_vector(H_hat[0]) for H in H_hat[1:]: for Hv in get_debug_values(H): assert Hv.min() >= 0.0 assert Hv.max() <= 1.0 total += entropy_binary_vector(H) return total
def fprop(self, state_below, targets): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size: raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0])) self.desired_space.validate(state_below) assert state_below.ndim == 2 if not hasattr(self, 'no_affine'): self.no_affine = False if self.no_affine: raise NotImplementedError() assert self.W_class.ndim == 3 assert self.W_cluster.ndim == 2 #we get the cluster by doing hW_cluster + b_cluster probcluster = T.dot(state_below, self.W_cluster) + self.b_cluster probcluster = T.nnet.softmax(probcluster) #check this line again batch_clusters = self.array_clusters[T.cast(T.argmax(targets).flatten(),'int32')] Z = T.nnet.GroupDot(self.n_clusters)(state_below, self.W_class, self.b_class, T.cast(batch_clusters,'int32')) probclass = T.nnet.softmax(Z) for value in get_debug_values(probclass): if self.mlp.batch_size is not None: assert value.shape[0] == self.mlp.batch_size return probclass, probcluster
def fprop(self, state_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if value.shape[0] != self.mlp.batch_size: raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0])) self.desired_space.validate(state_below) assert self.W.ndim == 2 assert state_below.ndim == 2 b = self.b Z = T.dot(state_below, self.W) + b rval = T.nnet.softmax(Z) for value in get_debug_values(rval): assert value.shape[0] == self.mlp.batch_size return rval
def fprop(self, state_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size: raise ValueError("state_below should have batch size "+str(self.mlp.batch_size)+" but has "+str(value.shape[0])) self.desired_space.validate(state_below) assert state_below.ndim == 2 assert self.W.ndim == 3 Z = T.tensordot(state_below, self.W, axes=[[1],[0]]) + self.b rval = batched_softmax(Z) for value in get_debug_values(rval): if self.mlp.batch_size is not None: assert value.shape[0] == self.mlp.batch_size return rval
def lrn_same_map(c01b,size,pow,scale,image_side): mx = None for c01bv in get_debug_values(c01b): assert not np.any(np.isinf(c01bv)) assert c01bv.shape[1] == image_side assert c01bv.shape[2] == image_side new_side = size-1+image_side wide_infinity = T.alloc(0.0, c01b.shape[0], new_side, new_side, c01b.shape[3]) c01b_pad = T.set_subtensor(wide_infinity[:, 1:1+image_side, 1:1+image_side, :], T.sqr(c01b)) wide_infinity_count = T.alloc(0, c01b.shape[0], new_side, new_side,c01b.shape[3]) c01b_count = T.set_subtensor(wide_infinity_count[:, 1:1+image_side, 1:1+image_side, :], 1) for row_within_pool in xrange(size): row_stop = image_side + row_within_pool for col_within_pool in xrange(size): col_stop = image_side + col_within_pool cur = c01b_pad[:, row_within_pool:row_stop:1, col_within_pool:col_stop:1, :] cur_count = c01b_count[:, row_within_pool:row_stop:1, col_within_pool:col_stop:1, :] if mx is None: mx = cur count = cur_count else: mx = mx + cur count = count + cur_count mx /= count mx = scale*mx mx = mx+1 for mxv in get_debug_values(mx): assert not np.any(np.isnan(mxv)) assert not np.any(np.isinf(mxv)) new_c01b = c01b/T.pow(mx,pow) return new_c01b
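# Reading lrn_same_map above: it is a within-channel local response normalization.
# A sketch of what the graph computes (with alpha = scale, beta = pow, and the mean
# of squared activations taken over a size x size window of the same channel,
# zero-padded at the borders):
#
#     out[c, i, j, b] = c01b[c, i, j, b] / (1 + alpha * mean(c01b[c, i', j', b] ** 2)) ** beta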
def _validate_impl(self, is_numeric, batch): # checks that batch isn't a tuple, checks batch.type against self.dtype super(IndexSequenceSpace, self)._validate_impl(is_numeric, batch) if is_numeric: # Use the 'CudaNdarray' string to avoid importing # theano.sandbox.cuda when it is not available if not isinstance(batch, np.ndarray) \ and str(type(batch)) != "<type 'CudaNdarray'>": raise TypeError("The value of a IndexSequenceSpace batch " "should be a numpy.ndarray, or CudaNdarray, " "but is %s." % str(type(batch))) if batch.ndim != 2: raise ValueError("The value of a IndexSequenceSpace batch " "must be 2D, got %d dimensions for %s." % (batch.ndim, batch)) if batch.shape[1] != self.dim: raise ValueError("The width of a IndexSequenceSpace batch " "must match with the space's dimension, but " "batch has shape %s and dim = %d." % (str(batch.shape), self.dim)) else: if not isinstance(batch, theano.gof.Variable): raise TypeError("IndexSequenceSpace batch should be a theano " "Variable, got " + str(type(batch))) if not isinstance(batch.type, (theano.tensor.TensorType, CudaNdarrayType)): raise TypeError("IndexSequenceSpace batch should be " "TensorType or CudaNdarrayType, got " + str(batch.type)) if batch.ndim != 2: raise ValueError('IndexSequenceSpace batches must be 2D, got ' '%d dimensions' % batch.ndim) for val in get_debug_values(batch): self.np_validate(val)
def entropy_binary_vector(P): """ If P[i,j] represents the probability of some binary random variable X[i,j] being 1, then rval[i] gives the entropy of the random vector X[i,:] """ oneMinusP = 1. - P PlogP = xlogx(P) omPlogOmP = xlogx(oneMinusP) term1 = -T.sum(PlogP, axis=1) assert len(term1.type.broadcastable) == 1 term2 = -T.sum(omPlogOmP, axis=1) assert len(term2.type.broadcastable) == 1 rval = term1 + term2 for plp, olo, t1, t2, rv in get_debug_values(PlogP, omPlogOmP, term1, term2, rval): debug_assert(not np.any(np.isnan(plp))) debug_assert(not np.any(np.isinf(plp))) debug_assert(not np.any(np.isnan(olo))) debug_assert(not np.any(np.isinf(olo))) debug_assert(not np.any(np.isnan(t1))) debug_assert(not np.any(np.isnan(t2))) debug_assert(not np.any(np.isnan(rv))) return rval
def validate(self, batch): if not isinstance(batch, theano.gof.Variable): raise TypeError( "Conv2DSpace batches must be theano Variables, got " + str(type(batch))) if not isinstance(batch.type, (theano.tensor.TensorType, CudaNdarrayType)): raise TypeError() if batch.ndim != 4: raise ValueError() for val in get_debug_values(batch): d = self.axes.index('c') actual_channels = val.shape[d] if actual_channels != self.num_channels: raise ValueError("Expected axis "+str(d)+" to be number of channels ("+str(self.num_channels)+\ ") but it is "+str(actual_channels)) assert val.shape[self.axes.index('c')] == self.num_channels for coord in [0, 1]: d = self.axes.index(coord) actual_shape = val.shape[d] expected_shape = self.shape[coord] if actual_shape != expected_shape: raise ValueError("Conv2DSpace with shape "+str(self.shape) + \ " and axes " + str(self.axes) + " expected dimension " + \ str(d) + " of a batch (" + str(batch)+") to have length " + str(expected_shape) + \ " but it has "+str(actual_shape))
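# Sketch of the shape checks above on a concrete numpy batch (illustrative values,
# not part of the Space class): with axes ('b', 'c', 0, 1), the axis index lookups
# pick out the channel and spatial dimensions to compare against the space's
# num_channels and shape attributes.
import numpy as np
axes = ('b', 'c', 0, 1)
shape = (32, 32)                       # (rows, cols) the space expects
num_channels = 3
val = np.zeros((10, num_channels, 32, 32))
assert val.shape[axes.index('c')] == num_channels
for coord in [0, 1]:
    assert val.shape[axes.index(coord)] == shape[coord]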
def test_get_debug_values_exc(): """tests that get_debug_values raises an exception when debugger is set to raise and a value is missing """ prev_value = config.compute_test_value try: config.compute_test_value = 'raise' x = T.vector() try: for x_val in op.get_debug_values(x): # this assert catches the case where we # erroneously get a value returned assert False raised = False except AttributeError: raised = True # this assert catches the case where we got [] # returned, and possibly issued a warning, # rather than raising an exception assert raised finally: config.compute_test_value = prev_value
def test_kl(): """ Test whether function kl() has properly processed the input. """ init_mode = theano.config.compute_test_value theano.config.compute_test_value = 'raise' try: mlp = MLP(layers=[Sigmoid(dim=10, layer_name='Y', irange=0.1)], nvis=10) X = mlp.get_input_space().make_theano_batch() Y = mlp.get_output_space().make_theano_batch() X.tag.test_value = np.random.random( get_debug_values(X)[0].shape).astype(theano.config.floatX) Y_hat = mlp.fprop(X) # This call should not raise any error: ave = kl(Y, Y_hat, 1) # The following calls should raise ValueError exceptions: Y.tag.test_value[2][3] = 1.1 np.testing.assert_raises(ValueError, kl, Y, Y_hat, 1) Y.tag.test_value[2][3] = -0.1 np.testing.assert_raises(ValueError, kl, Y, Y_hat, 1) finally: theano.config.compute_test_value = init_mode
def expand_2d(b01c, expand_shape, expand_stride, image_shape): for b01cv in get_debug_values(b01c): assert not np.any(np.isinf(b01cv)) assert b01cv.shape[1] == image_shape[0] assert b01cv.shape[2] == image_shape[1] assert b01cv.shape[3] == np.prod(expand_shape) for i in range(len(expand_shape)): assert expand_shape[i] % expand_stride[i] ==0 b0101 = b01c.reshape((b01c.shape[0], image_shape[0], image_shape[1], expand_shape[0], expand_shape[1])) required_r = (image_shape[0] - 1) * expand_stride[0] + expand_shape[0] required_c = (image_shape[1] - 1) * expand_stride[1] + expand_shape[1] wide_b01 = T.alloc(0., b01c.shape[0], required_r, required_c) for row_within_expand in xrange(expand_shape[0]): row_stop = (image_shape[0] - 1) * expand_stride[0] + \ row_within_expand + 1 for col_within_expand in xrange(expand_shape[1]): col_stop = (image_shape[1] - 1) * expand_stride[1] + \ col_within_expand + 1 wide_b01 = T.inc_subtensor(wide_b01[:, row_within_expand:row_stop:expand_stride[0], col_within_expand:col_stop:expand_stride[1]], b0101[:,:,:,row_within_expand, col_within_expand]) wide_b01 = wide_b01 / (expand_shape[0] / expand_stride[0]) ** 2 wide_b01c = wide_b01.reshape((b01c.shape[0], required_r, required_c, 1)) return wide_b01c
def expected_energy_term(self, state, average, state_below, average_below): # state = Print('h_state', attrs=['min', 'max'])(state) self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) # Energy function is linear so it doesn't matter if we're averaging or not # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below # and d is the downward state of this layer bias_term = T.dot(state, self.b) weights_term = (self.transformer.lmul(state_below) * state).sum(axis=1) rval = -bias_term - weights_term assert rval.ndim == 1 return rval
def expected_energy_term(self, state, average, state_below, average_below): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError( "self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) downward_state = self.downward_state(state) self.h_space.validate(downward_state) # Energy function is linear so it doesn't matter if we're averaging or not # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below # and d is the downward state of this layer bias_term = T.dot(downward_state, self.b) weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=1) rval = -bias_term - weights_term assert rval.ndim == 1 return rval
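# Numeric sketch of the energy term described in the comment above, -u^T W d - b^T d,
# computed per example with plain numpy (names and sizes are illustrative, not from
# the layer itself):
import numpy as np
rng = np.random.RandomState(0)
u = rng.rand(5, 3)                          # upward state of the layer below
d = rng.rand(5, 4)                          # downward state of this layer
W = rng.rand(3, 4)
b = rng.rand(4)
bias_term = d.dot(b)                        # shape (batch,)
weights_term = (u.dot(W) * d).sum(axis=1)   # shape (batch,)
rval = -bias_term - weights_term            # one scalar energy term per example
assert rval.shape == (5,)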
def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) if iter_name is None: iter_name = 'anon' if state_above is not None: assert layer_above is not None msg = layer_above.downward_message(state_above) msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' else: msg = None if double_weights: state_below = 2. * state_below state_below.name = self.layer_name + '_'+iter_name + '_2state' z = self.transformer.lmul(state_below) + self.b if self.layer_name is not None and iter_name is not None: z.name = self.layer_name + '_' + iter_name + '_z' if msg is not None: z = z + msg h = T.tanh(z) return h
def kl(Y, Y_hat, batch_axis): """ Warning: This function expects a sigmoid nonlinearity in the output layer. Returns a batch (vector) of mean across units of KL divergence for each example, KL(P || Q) where P is defined by Y and Q is defined by Y_hat: p log p - p log q + (1-p) log (1-p) - (1-p) log (1-q) For binary p, some terms drop out: - p log q - (1-p) log (1-q) - p log sigmoid(z) - (1-p) log sigmoid(-z) p softplus(-z) + (1-p) softplus(z) Parameters ---------- Y : Variable targets for the sigmoid outputs. Currently Y must be purely binary. If it's not, you'll still get the right gradient, but the value in the monitoring channel will be wrong. Y_hat : Variable predictions made by the sigmoid layer. Y_hat must be generated by fprop, i.e., it must be a symbolic sigmoid. batch_axis : list list of axes to compute average kl divergence across. Returns ------- ave : Variable average kl divergence between Y and Y_hat. """ assert hasattr(Y_hat, 'owner') assert batch_axis is not None owner = Y_hat.owner assert owner is not None op = owner.op if not hasattr(op, 'scalar_op'): raise ValueError("Expected Y_hat to be generated by an Elemwise " "op, got "+str(op)+" of type "+str(type(op))) assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid) for Yv in get_debug_values(Y): if not (Yv.min() >= 0.0 and Yv.max() <= 1.0): raise ValueError("Expected Y to be between 0 and 1. Either Y" + "< 0 or Y > 1 was found in the input.") z, = owner.inputs term_1 = Y * T.nnet.softplus(-z) term_2 = (1 - Y) * T.nnet.softplus(z) total = term_1 + term_2 naxes = total.ndim axes_to_reduce = list(range(naxes)) del axes_to_reduce[batch_axis] ave = total.mean(axis=axes_to_reduce) return ave
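# A quick numeric check of the identity in the docstring above, in plain numpy
# (a sketch; the helper names here are illustrative and not part of the library):
# for binary targets p and pre-sigmoid activations z,
#     p * softplus(-z) + (1 - p) * softplus(z)
# equals the cross-entropy -p*log(sigmoid(z)) - (1-p)*log(1-sigmoid(z)).
import numpy as np

def _softplus(z):
    return np.log1p(np.exp(z))

def _sigmoid(z):
    return 1. / (1. + np.exp(-z))

z = np.array([-2.0, -0.5, 0.5, 2.0])
p = np.array([0., 1., 0., 1.])
lhs = p * _softplus(-z) + (1 - p) * _softplus(z)
rhs = -p * np.log(_sigmoid(z)) - (1 - p) * np.log(1 - _sigmoid(z))
assert np.allclose(lhs, rhs)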
def test_get_debug_values_success(): """tests that get_debug_value returns values when available (and the debugger is on)""" prev_value = config.compute_test_value for mode in ['ignore', 'warn', 'raise']: try: config.compute_test_value = mode x = T.vector() x.tag.test_value = numpy.zeros((4,), dtype=config.floatX) y = numpy.zeros((5, 5)) iters = 0 for x_val, y_val in op.get_debug_values(x, y): assert x_val.shape == (4,) assert y_val.shape == (5, 5) iters += 1 assert iters == 1 finally: config.compute_test_value = prev_value
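# The guard pattern these tests exercise, as a minimal sketch (assuming the same
# `op`, `config`, `T`, and `numpy` imports used in the tests): when a test value is
# attached and compute_test_value is on, the loop body runs exactly once with the
# numeric value; otherwise it is skipped, or raises in 'raise' mode.
prev_value = config.compute_test_value
try:
    config.compute_test_value = 'warn'
    x = T.vector()
    x.tag.test_value = numpy.zeros((4,), dtype=config.floatX)
    for x_val in op.get_debug_values(x):
        # only reached because x carries a test value
        assert x_val.shape == (4,)
finally:
    config.compute_test_value = prev_value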
def truncated_KL(self, V, obs, Y=None, no_v_bias=False): """ KL divergence between the variational and the true posterior, dropping terms that don't depend on the variational parameters if no_v_bias is True, ignores the contribution of the visible biases to the expected energy """ """ D_KL ( Q(h ) || P(h | v) ) = - sum_h Q(h) log P(h | v) + sum_h Q(h) log Q(h) = -sum_h Q(h) log P( h, v) + sum_h Q(h) log P(v) + sum_h Q(h) log Q(h) <truncated version> = -sum_h Q(h) log P( h, v) + sum_h Q(h) log Q(h) = -sum_h Q(h) log exp( -E (h,v)) + sum_h Q(h) log Z + sum_H Q(h) log Q(h) <truncated version> = sum_h Q(h) E(h, v) + sum_h Q(h) log Q(h) this comment was written before adding support for Y """ H_hat = obs['H_hat'] for Hv in get_debug_values(H_hat): assert Hv.min() >= 0.0 assert Hv.max() <= 1.0 entropy_term = -self.model.entropy_h(H_hat=H_hat) assert len(entropy_term.type.broadcastable) == 1 energy_term = self.model.expected_energy_batch(V_hat=V, H_hat=H_hat, Y_hat=Y, no_v_bias=no_v_bias) assert len(energy_term.type.broadcastable) == 1 KL = entropy_term + energy_term return KL
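# The docstring derivation above, written out more compactly. With
# P(h | v) = exp(-E(h, v)) / Z(v), the full KL divergence is
#
#     D_KL(Q(h) || P(h | v)) = E_Q[E(h, v)] - H(Q) + log Z(v)
#
# and the "truncated" quantity returned here drops the log Z(v) term, which is
# constant with respect to the variational parameters:
#
#     truncated_KL = E_Q[E(h, v)] - H(Q)  =  energy_term + entropy_term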
def expand_2d(b01c, expand_shape, expand_stride, image_shape): for b01cv in get_debug_values(b01c): assert not np.any(np.isinf(b01cv)) assert b01cv.shape[1] == image_shape[0] assert b01cv.shape[2] == image_shape[1] assert b01cv.shape[3] == np.prod(expand_shape) for i in range(len(expand_shape)): assert expand_shape[i] % expand_stride[i] == 0 b0101 = b01c.reshape((b01c.shape[0], image_shape[0], image_shape[1], expand_shape[0], expand_shape[1])) required_r = (image_shape[0] - 1) * expand_stride[0] + expand_shape[0] required_c = (image_shape[1] - 1) * expand_stride[1] + expand_shape[1] wide_b01 = T.alloc(0., b01c.shape[0], required_r, required_c) for row_within_expand in xrange(expand_shape[0]): row_stop = (image_shape[0] - 1) * expand_stride[0] + \ row_within_expand + 1 for col_within_expand in xrange(expand_shape[1]): col_stop = (image_shape[1] - 1) * expand_stride[1] + \ col_within_expand + 1 wide_b01 = T.inc_subtensor( wide_b01[:, row_within_expand:row_stop:expand_stride[0], col_within_expand:col_stop:expand_stride[1]], b0101[:, :, :, row_within_expand, col_within_expand]) wide_b01 = wide_b01 / (expand_shape[0] / expand_stride[0])**2 wide_b01c = wide_b01.reshape((b01c.shape[0], required_r, required_c, 1)) return wide_b01c
def truncated_KL(self, V, obs, no_v_bias = False): """ KL divergence between the variational and the true posterior, dropping terms that don't depend on the variational parameters if no_v_bias is True, ignores the contribution of the visible biases to the expected energy """ """ D_KL ( Q(h ) || P(h | v) ) = - sum_h Q(h) log P(h | v) + sum_h Q(h) log Q(h) = -sum_h Q(h) log P( h, v) + sum_h Q(h) log P(v) + sum_h Q(h) log Q(h) <truncated version> = -sum_h Q(h) log P( h, v) + sum_h Q(h) log Q(h) = -sum_h Q(h) log exp( -E (h,v)) + sum_h Q(h) log Z + sum_H Q(h) log Q(h) <truncated version> = sum_h Q(h) E(h, v) + sum_h Q(h) log Q(h) """ H_hat = obs['H_hat'] for Hv in get_debug_values(H_hat): assert Hv.min() >= 0.0 assert Hv.max() <= 1.0 entropy_term = - self.model.entropy_h(H_hat = H_hat) assert len(entropy_term.type.broadcastable) == 1 energy_term = self.model.expected_energy_batch(V_hat = V, H_hat = H_hat, no_v_bias = no_v_bias) assert len(energy_term.type.broadcastable) == 1 KL = entropy_term + energy_term return KL
def kl(Y, Y_hat, batch_axis): """ Warning: This function expects a sigmoid nonlinearity in the output layer. Returns a batch (vector) of mean across units of KL divergence for each example, KL(P || Q) where P is defined by Y and Q is defined by Y_hat: p log p - p log q + (1-p) log (1-p) - (1-p) log (1-q) For binary p, some terms drop out: - p log q - (1-p) log (1-q) - p log sigmoid(z) - (1-p) log sigmoid(-z) p softplus(-z) + (1-p) softplus(z) Parameters ---------- Y : Variable targets for the sigmoid outputs. Currently Y must be purely binary. If it's not, you'll still get the right gradient, but the value in the monitoring channel will be wrong. Y_hat : Variable predictions made by the sigmoid layer. Y_hat must be generated by fprop, i.e., it must be a symbolic sigmoid. batch_axis : list list of axes to compute average kl divergence across. Returns ------- ave : Variable average kl divergence between Y and Y_hat. """ assert hasattr(Y_hat, 'owner') assert batch_axis is not None owner = Y_hat.owner assert owner is not None op = owner.op if not hasattr(op, 'scalar_op'): raise ValueError("Expected Y_hat to be generated by an Elemwise " "op, got " + str(op) + " of type " + str(type(op))) assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid) for Yv in get_debug_values(Y): if not (Yv.min() >= 0.0 and Yv.max() <= 1.0): raise ValueError("Expected Y to be between 0 and 1. Either Y" + "< 0 or Y > 1 was found in the input.") z, = owner.inputs term_1 = Y * T.nnet.softplus(-z) term_2 = (1 - Y) * T.nnet.softplus(z) total = term_1 + term_2 naxes = total.ndim axes_to_reduce = range(naxes) del axes_to_reduce[batch_axis] ave = total.mean(axis=axes_to_reduce) return ave
def fprop(self, state_below, add_noise=True): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) self.x = state_below # linear part if isinstance(self.x, S.SparseVariable): z = S.dot(self.x,self.W[0]) + self.b[0] else: z = T.dot(self.x,self.W[0]) + self.b[0] self.z = self.activate(z, self.expert_activation) # first layer non-linear part if isinstance(self.x, S.SparseVariable): h = S.dot(self.x,self.W[1]) + self.b[1] else: h = T.dot(self.x,self.W[1]) + self.b[1] # activate hidden units of non-linear part self.h = self.activate(h, self.hidden_activation) noise = 0. if add_noise: rng = MRG_RandomStreams(self.mlp.rng.randint(2**15)) noise = rng.normal(size = self.z.shape, std=self.noise_stdev , dtype=self.z.type.dtype) # second layer non-linear part self.a = T.dot(self.h,self.W[2]) + self.b[2] + noise # activate non-linear part self.m_mean = self.activate(self.a, self.gater_activation) # how many are over 0: self.effective_sparsity = T.cast(T.gt(self.m_mean, 0), theano.config.floatX).mean() # mix output of linear part with output of non-linear part self.p = self.m_mean * self.z if self.layer_name is not None: self.z.name = self.layer_name + '_z' self.h.name = self.layer_name + '_h' self.a.name = self.layer_name + '_a' self.m_mean.name = self.layer_name + '_m_mean' self.p.name = self.layer_name + '_p' return self.p
def fprop(self, state_below): #change model to add new variable which sends which indices of the data are here self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): print 'getting debug values' print value # if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size: # raise ValueError("state_below should have batch size "+str(self.dbm.batch_size)+" but has "+str(value.shape[0])) self.desired_space.validate(state_below) assert state_below.ndim == 2 if not hasattr(self, 'no_affine'): self.no_affine = False if self.no_affine: raise NotImplementedError() assert self.W_class.ndim == 3 assert self.W_cluster.ndim == 2 #we get the cluster by doing hW_cluster + b_cluster probcluster = T.dot(state_below, self.W_cluster) + self.b_cluster probcluster = T.nnet.softmax(probcluster) for value in get_debug_values(probcluster): print 'val is' print value print 'type of state below is' print state_below.type print state_below.dtype print state_below.ndim self.cluster_targets = range(5) #need the predicted clusters for this batch Z = T.nnet.GroupDot(self.n_clusters)(state_below, self.W_class, self.b_class, self.cluster_targets) probclass = T.nnet.softmax(Z) for value in get_debug_values(probclass): if self.mlp.batch_size is not None: assert value.shape[0] == self.mlp.batch_size return probclass, probcluster
def validate(self, batch): if not isinstance(batch, theano.gof.Variable): raise TypeError("Conv3DSpace batches must be theano Variables, got "+str(type(batch))) if not isinstance(batch.type, (theano.tensor.TensorType,CudaNdarrayType)): raise TypeError() if batch.ndim != 5: raise ValueError() for val in get_debug_values(batch): self.np_validate(val)
def validate(self, batch): if not isinstance(batch, theano.gof.Variable): raise TypeError("Conv2DSpace batches must be theano Variables, got "+str(type(batch))) if not isinstance(batch.type, (theano.tensor.TensorType,CudaNdarrayType)): raise TypeError() if batch.ndim != 4: raise ValueError() for val in get_debug_values(batch): self.np_validate(val)
def get_layer_monitoring_channels(self, state_below=None, state=None, targets=None): if self.no_affine: return OrderedDict() W_class = self.W_class W_cluster = self.W_cluster assert W_class.ndim == 3 assert W_cluster.ndim == 2 sq_W = T.sqr(W_cluster) sq_W_class = T.sqr(W_class) row_norms = T.sqrt(sq_W.sum(axis=1)) col_norms = T.sqrt(sq_W.sum(axis=0)) row_norms_class = T.sqrt(sq_W_class.sum(axis=1)) col_norms_class = T.sqrt(sq_W_class.sum(axis=0)) rval = OrderedDict([ ('row_norms_min' , row_norms.min()), ('row_norms_mean' , row_norms.mean()), ('row_norms_max' , row_norms.max()), ('col_norms_min' , col_norms.min()), ('col_norms_mean' , col_norms.mean()), ('col_norms_max' , col_norms.max()), ('class_row_norms_min' , row_norms_class.min()), ('class_row_norms_mean' , row_norms_class.mean()), ('class_row_norms_max' , row_norms_class.max()), ('class_col_norms_min' , col_norms_class.min()), ('class_col_norms_mean' , col_norms_class.mean()), ('class_col_norms_max' , col_norms_class.max()), ]) if (state_below is not None) or (state is not None): if state is None: for value in get_debug_values(state_below): print 'value is', value state = self.fprop(state_below) #print state state, cls = state mx = state.max(axis=1) rval.update(OrderedDict([('mean_max_class', mx.mean()), ('max_max_class', mx.max()), ('min_max_class', mx.min()) ])) if targets is not None: rval['nll'] = self.cost(Y_hat=(state, cls), Y=targets) rval['perplexity'] = 10 ** (rval['nll'] / np.log(10).astype('float32')) rval['entropy'] = rval['nll'] / np.log(2).astype('float32') return rval
def validate(self, batch): if not isinstance(batch, theano.gof.Variable): raise TypeError() if not isinstance(batch.type, (theano.tensor.TensorType,CudaNdarrayType)): raise TypeError() if batch.ndim != 4: raise ValueError() for val in get_debug_values(batch): assert val.shape[self.axes.index('c')] == self.nchannels for coord in [0,1]: assert val.shape[self.axes.index(coord)] == self.shape[coord]
def validate(self, batch): if not isinstance(batch, theano.gof.Variable): raise TypeError("VectorSpace batch should be a theano Variable, got "+str(type(batch))) if not self.sparse and not isinstance(batch.type, (theano.tensor.TensorType, CudaNdarrayType)): raise TypeError("VectorSpace batch should be TensorType or CudaNdarrayType, got "+str(batch.type)) if self.sparse and not isinstance(batch.type, theano.sparse.SparseType): raise TypeError() if batch.ndim != 2: raise ValueError('VectorSpace batches must be 2D, got %d dimensions' % batch.ndim) for val in get_debug_values(batch): self.np_validate(val)
def entropy_binary_vector(P): """ .. todo:: WRITEME properly if P[i,j] represents the probability of some binary random variable X[i,j] being 1 then rval[i] gives the entropy of the random vector X[i,:] """ for Pv in get_debug_values(P): assert Pv.min() >= 0.0 assert Pv.max() <= 1.0 oneMinusP = 1. - P PlogP = xlogx(P) omPlogOmP = xlogx(oneMinusP) term1 = -T.sum(PlogP, axis=1) assert len(term1.type.broadcastable) == 1 term2 = -T.sum(omPlogOmP, axis=1) assert len(term2.type.broadcastable) == 1 rval = term1 + term2 for plp, olo, t1, t2, rv in get_debug_values(PlogP, omPlogOmP, term1, term2, rval): debug_assert(not np.any(np.isnan(plp))) debug_assert(not np.any(np.isinf(plp))) debug_assert(not np.any(np.isnan(olo))) debug_assert(not np.any(np.isinf(olo))) debug_assert(not np.any(np.isnan(t1))) debug_assert(not np.any(np.isnan(t2))) debug_assert(not np.any(np.isnan(rv))) return rval
def validate(self, batch): if not isinstance(batch, theano.gof.Variable): raise TypeError() if not isinstance(batch.type, (theano.tensor.TensorType, CudaNdarrayType)): raise TypeError() if batch.ndim != 4: raise ValueError() for val in get_debug_values(batch): assert val.shape[self.axes.index('c')] == self.nchannels for coord in [0, 1]: assert val.shape[self.axes.index(coord)] == self.shape[coord]
def validate(self, batch): if not isinstance(batch, theano.gof.Variable): raise TypeError("%s batches must be Theano Variables, got %s" % (str(type(self)), str(type(batch)))) if not isinstance(batch.type, (theano.tensor.TensorType, CudaNdarrayType)): raise TypeError() if batch.ndim != 5: raise ValueError() if not batch.broadcastable[self.axes.index("b")]: raise ValueError( "%s batches should be broadcastable along the " "'b' (batch size) dimension." % str(type(self)) ) for val in get_debug_values(batch): self.np_validate(val)
def foo(self, state_below): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError( "self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0]) ) assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) z = self.transformer.lmul(state_below) + self.b if not hasattr(self, "randomize_pools"): self.randomize_pools = False if not hasattr(self, "pool_stride"): self.pool_stride = self.pool_size if self.randomize_pools: z = T.dot(z, self.permute) if not hasattr(self, "min_zero"): self.min_zero = False if self.min_zero: p = T.zeros_like(z) else: p = None last_start = self.detector_layer_dim - self.pool_size pooling_stack = [] for i in xrange(self.pool_size): cur = z[:, i : last_start + i + 1 : self.pool_stride] cur = cur.reshape((cur.shape[0], cur.shape[1], 1)) assert cur.ndim == 3 pooling_stack.append(cur) if self.min_zero: pooling_stack.append(T.zeros_like(cur)) pooling_stack = T.concatenate(pooling_stack, axis=2) p = pooling_stack.max(axis=2) counts = (T.eq(pooling_stack, p.dimshuffle(0, 1, "x"))).sum(axis=0) p.name = self.layer_name + "_p_" return p, counts
def validate(self, batch): if not isinstance(batch, theano.gof.Variable): raise TypeError("%s batches must be Theano Variables, got %s" % (str(type(self)), str(type(batch)))) if not isinstance(batch.type, (theano.tensor.TensorType, CudaNdarrayType)): raise TypeError() if batch.ndim != 5: raise ValueError() if not batch.broadcastable[self.axes.index('b')]: raise ValueError("%s batches should be broadcastable along the " "'b' (batch size) dimension." % str(type(self))) for val in get_debug_values(batch): self.np_validate(val)
def test_get_debug_values_no_debugger(): 'get_debug_values should return [] when debugger is off' prev_value = config.compute_test_value try: config.compute_test_value = 'off' x = T.vector() for x_val in op.get_debug_values(x): assert False finally: config.compute_test_value = prev_value
def entropy_binary_vector(P): """ .. todo:: WRITEME properly If P[i,j] represents the probability of some binary random variable X[i,j] being 1, then rval[i] gives the entropy of the random vector X[i,:] """ for Pv in get_debug_values(P): assert Pv.min() >= 0.0 assert Pv.max() <= 1.0 oneMinusP = 1. - P PlogP = xlogx(P) omPlogOmP = xlogx(oneMinusP) term1 = - T.sum(PlogP, axis=1) assert len(term1.type.broadcastable) == 1 term2 = - T.sum(omPlogOmP, axis=1) assert len(term2.type.broadcastable) == 1 rval = term1 + term2 debug_vals = get_debug_values(PlogP, omPlogOmP, term1, term2, rval) for plp, olo, t1, t2, rv in debug_vals: debug_assert(isfinite(plp)) debug_assert(isfinite(olo)) debug_assert(not contains_nan(t1)) debug_assert(not contains_nan(t2)) debug_assert(not contains_nan(rv)) return rval
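# A small numpy sketch of the entropy computed above (illustrative only; it assumes
# probabilities strictly inside (0, 1) so log stays finite, whereas xlogx in the code
# handles the 0 * log 0 = 0 limit):
import numpy as np
P = np.array([[0.1, 0.5, 0.9],
              [0.2, 0.8, 0.5]])
rval = -(P * np.log(P) + (1. - P) * np.log(1. - P)).sum(axis=1)
# rval[i] is the entropy (in nats) of the vector of independent Bernoullis P[i, :]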
def foo(self, state_below): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) z = self.transformer.lmul(state_below) + self.b if not hasattr(self, 'randomize_pools'): self.randomize_pools = False if not hasattr(self, 'pool_stride'): self.pool_stride = self.pool_size if self.randomize_pools: z = T.dot(z, self.permute) if not hasattr(self, 'min_zero'): self.min_zero = False if self.min_zero: p = T.zeros_like(z) else: p = None last_start = self.detector_layer_dim - self.pool_size pooling_stack = [] for i in xrange(self.pool_size): cur = z[:,i:last_start+i+1:self.pool_stride] cur = cur.reshape((cur.shape[0], cur.shape[1], 1)) assert cur.ndim == 3 pooling_stack.append(cur) if self.min_zero: pooling_stack.append(T.zeros_like(cur)) pooling_stack = T.concatenate(pooling_stack, axis=2) p = pooling_stack.max(axis=2) counts = (T.eq(pooling_stack, p.dimshuffle(0, 1, 'x'))).sum(axis=0) p.name = self.layer_name + '_p_' return p, counts
def init_H_hat(self, V): if self.model.recycle_q: rval = self.model.prev_H if config.compute_test_value != 'off': if rval.get_value().shape[0] != V.tag.test_value.shape[0]: raise Exception('E step given wrong test batch size', rval.get_value().shape, V.tag.test_value.shape) else: rval = T.alloc(1., V.shape[0], self.model.nhid) for rval_value, V_value in get_debug_values(rval, V): if rval_value.shape[0] != V_value.shape[0]: debug_error_message("rval.shape = %s, V.shape = %s, element 0 should match but doesn't", str(rval_value.shape), str(V_value.shape)) return rval
def _validate_impl(self, is_numeric, batch): super(ContextSpace, self)._validate_impl(is_numeric, batch) if is_numeric: if batch.ndim != 3: raise TypeError("ContextSpace batches should be 3D arrays. Got " + str(batch.ndim) + " dimensions.") else: if not isinstance(batch, theano.gof.Variable): raise TypeError("Not a valid symbolic variable. Got " + str(batch)) if batch.ndim != 3: raise TypeError("Required a 3D tensor. Got " + str(batch) + " with %i" % batch.ndim) for val in get_debug_values(batch): self.np_validate(val)
def test_get_debug_values_ignore(): """get_debug_values should return [] when debugger is ignore and some values are missing """ prev_value = config.compute_test_value try: config.compute_test_value = 'ignore' x = T.vector() for x_val in op.get_debug_values(x): assert False finally: config.compute_test_value = prev_value
def _validate(self, batch): """ .. todo:: WRITEME """ if not isinstance(batch, theano.gof.Variable): raise TypeError("IndexSpace batch should be a theano Variable, " "got " + str(type(batch))) if not isinstance(batch.type, (theano.tensor.TensorType, CudaNdarrayType)): raise TypeError("VectorSpace batch should be TensorType or " "CudaNdarrayType, got "+str(batch.type)) if batch.ndim != 2: raise ValueError('IndexSpace batches must be 2D, got %d ' 'dimensions' % batch.ndim) for val in get_debug_values(batch): self.np_validate(val)
def fprop(self, state_below): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) z = self.transformer.lmul(state_below) + self.b if not hasattr(self, 'randomize_pools'): self.randomize_pools = False if not hasattr(self, 'pool_stride'): self.pool_stride = self.pool_size if self.randomize_pools: z = T.dot(z, self.permute) if not hasattr(self, 'min_zero'): self.min_zero = False if self.min_zero: p = T.zeros_like(z) else: p = None last_start = self.detector_layer_dim - self.pool_size for i in xrange(self.pool_size): cur = z[:,i:last_start+i+1:self.pool_stride] if p is None: p = cur else: p = T.maximum(cur, p) p.name = self.layer_name + '_p_' return p
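# Numpy sketch of the strided max-pooling loop above: with pool_stride == pool_size,
# each consecutive group of pool_size detector units collapses to its maximum
# (names and sizes here are illustrative, not taken from the original layer):
import numpy as np
z = np.random.randn(2, 12)          # (batch, detector_layer_dim)
pool_size = pool_stride = 3
last_start = z.shape[1] - pool_size
p = None
for i in range(pool_size):
    cur = z[:, i:last_start + i + 1:pool_stride]
    p = cur if p is None else np.maximum(cur, p)
assert p.shape == (2, 4)            # one pooled unit per group of 3 detector units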
def validate(self, batch): if not isinstance(batch, theano.gof.Variable): raise TypeError( "VectorSpace batch should be a theano Variable, got " + str(type(batch))) if not self.sparse and not isinstance( batch.type, (theano.tensor.TensorType, CudaNdarrayType)): raise TypeError( "VectorSpace batch should be TensorType or CudaNdarrayType, got " + str(batch.type)) if self.sparse and not isinstance(batch.type, theano.sparse.SparseType): raise TypeError() if batch.ndim != 2: raise ValueError( 'VectorSpace batches must be 2D, got %d dimensions' % batch.ndim) for val in get_debug_values(batch): self.np_validate(val)
def get_func(learn_discriminator, learn_generator): updates = OrderedDict() assert (learn_discriminator or learn_generator ) and not (learn_discriminator and learn_generator) if learn_discriminator: cur_params = model.discriminator.get_params() else: cur_params = model.generator.get_params() cur_grads = OrderedDict() for param in cur_params: cur_grads[param] = grads[param] for param in grads: if grads[param].name is None and cost_value is not None: grads[param].name = ('grad(%(costname)s, %(paramname)s)' % { 'costname': cost_value.name, 'paramname': param.name }) assert grads[param].dtype == param.dtype cur_lr_scalers = OrderedDict() for param in cur_params: if param in lr_scalers: lr_scaler = lr_scalers[param] cur_lr_scalers[param] = lr_scaler log.info('Parameter and initial learning rate summary:') for param in cur_params: param_name = param.name if param_name is None: param_name = 'anon_param' lr = learning_rate.get_value() * cur_lr_scalers.get(param, 1.) log.info('\t' + param_name + ': ' + str(lr)) if self.learning_rule: updates.update( self.learning_rule.get_updates(learning_rate, cur_grads, cur_lr_scalers)) else: # Use standard SGD updates with fixed learning rate. updates.update( dict(safe_zip(params, [param - learning_rate * \ lr_scalers.get(param, 1.) * grads[param] for param in params]))) for param in cur_params: if updates[param].name is None: updates[param].name = 'sgd_update(' + param.name + ')' model.modify_updates(updates) for param in cur_params: update = updates[param] if update.name is None: update.name = 'censor(sgd_update(' + param.name + '))' for update_val in get_debug_values(update): if np.any(np.isinf(update_val)): raise ValueError("debug value of %s contains infs" % update.name) if np.any(np.isnan(update_val)): raise ValueError("debug value of %s contains nans" % update.name) with log_timing(log, 'Compiling sgd_update'): return function(theano_args, updates=updates, name='sgd_update', on_unused_input='ignore', mode=self.theano_function_mode)
def max_pool_channels(z, pool_size, top_down=None, theano_rng=None): """ Unlike Honglak's convolutional max pooling, which pools over spatial locations within each channels, this does max pooling in a densely connected model. Here we pool groups of channels together. Parameters ---------- z : theano matrix representings a batch of input from below pool_size : int the number of features to combine into one pooled unit top_down : theano matrix, optional a theano matrix representing input from above if None, assumes top-down input is 0 theano_rng : MRG_RandomStreams, optional For random numbers for sampling Returns ------- h : theano matrix a theano matrix for the expected value of the detector layer h p : theano matrix a theano matrix for the expected value of the pooling layer p h_samples : theano matrix, only returned if theano_rng is not None a theano matrix of samples of the detector layer p_samples: theano matrix, only returned if theano_rng is not None a theano matrix of samples of the pooling layer Notes ----- all matrices are formatted as (num_example, num_features) """ z_name = z.name if z_name is None: z_name = 'anon_z' if pool_size == 1: if top_down is None: top_down = 0. total_input = z + top_down p = T.nnet.sigmoid(total_input) h = p if theano_rng is None: return p, h else: t1 = time.time() p_samples = theano_rng.binomial(p=p, size=p.shape, dtype=p.dtype, n=1) t2 = time.time() if t2 - t1 > 0.5: warnings.warn("TODO: speed up theano's random number seeding. " "max pooling spent " + str(t2 - t1) + "in a call to theano_rng.binomial.") h_samples = p_samples return p_samples, h_samples, p_samples, h_samples else: batch_size, n = z.shape mx = None if top_down is None: t = 0. else: t = -top_down t.name = 'neg_top_down' zpart = [] for i in xrange(pool_size): cur_part = z[:, i:n:pool_size] if z_name is not None: cur_part.name = z_name + '[%d]' % (i) zpart.append(cur_part) if mx is None: mx = T.maximum(t, cur_part) if cur_part.name is not None: mx.name = 'max(-top_down,' + cur_part.name + ')' else: max_name = None if cur_part.name is not None: mx_name = 'max(' + cur_part.name + ',' + mx.name + ')' mx = T.maximum(mx, cur_part) mx.name = mx_name mx.name = 'local_max(' + z_name + ')' pt = [] for i in xrange(pool_size): z_i = zpart[i] safe = z_i - mx safe.name = 'safe_z(%s)' % z_i.name cur_pt = T.exp(safe) cur_pt.name = 'pt(%s)' % z_i.name assert cur_pt.ndim == 2 pt.append(cur_pt) off_pt = T.exp(t - mx) assert off_pt.ndim == 2 off_pt.name = 'p_tilde_off(%s)' % z_name denom = off_pt for i in xrange(pool_size): denom = denom + pt[i] assert denom.ndim == 2 denom.name = 'denom(%s)' % z_name off_prob = off_pt / denom p = 1. 
- off_prob assert p.dtype == z.dtype hpart = [pt_i / denom for pt_i in pt] h = T.alloc(0., batch_size, n) for i in xrange(pool_size): h.name = 'h_interm' hp = hpart[i] sub_h = h[:, i:n:pool_size] assert sub_h.ndim == 2 assert hp.ndim == 2 for hv, hsv, hpartv in get_debug_values(h, sub_h, hp): print hv.shape print hsv.shape print hpartv.shape h = T.set_subtensor(sub_h, hp) p.name = 'p(%s)' % z_name h.name = 'h(%s)' % z_name if theano_rng is None: return p, h else: events = [] for i in xrange(pool_size): events.append(hpart[i]) events.append(off_prob) events = [event.dimshuffle(0, 1, 'x') for event in events] events = tuple(events) stacked_events = T.concatenate(events, axis=2) outcomes = pool_size + 1 reshaped_events = stacked_events.reshape( (batch_size * n // pool_size, outcomes)) t1 = time.time() multinomial = theano_rng.multinomial(pvals=reshaped_events, dtype=p.dtype) t2 = time.time() if t2 - t1 > 0.5: warnings.warn("TODO: speed up theano's random number seeding." "max pooling spent " + str(t2 - t1) + " in a call to theano_rng.multinomial.") reshaped_multinomial = multinomial.reshape( (batch_size, n // pool_size, outcomes)) h_sample = T.zeros_like(z) idx = 0 for i in xrange(pool_size): h_sample = T.set_subtensor(h_sample[:, i:n:pool_size], reshaped_multinomial[:, :, idx]) idx += 1 p_sample = 1 - reshaped_multinomial[:, :, -1] assert h_sample.dtype == z.dtype return p, h, p_sample, h_sample
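# Minimal usage sketch for max_pool_channels above (an assumption-laden example: it
# presumes Theano is available and the function is importable from this module;
# shapes follow the docstring).
import numpy as np
import theano
import theano.tensor as T

z = T.matrix('z')                              # (batch, n), n a multiple of pool_size
p, h = max_pool_channels(z, pool_size=4)       # mean-field values, no sampling
f = theano.function([z], [p, h])
p_val, h_val = f(np.random.randn(2, 8).astype(theano.config.floatX))
# h_val: (2, 8) expected detector-unit values; p_val: (2, 2), one pooling unit per group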
def max_pool(z, pool_shape, top_down=None, theano_rng=None): """ Parameters ---------- z : theano 4-tensor a theano 4-tensor representing input from below pool_shape : tuple tuple of ints. the shape of regions to be pooled top_down : theano 4-tensor, optional a theano 4-tensor representing input from above if None, assumes top-down input is 0 theano_rng : MRG_RandomStreams, optional Used for random numbers for sampling Returns ------- h : theano 4-tensor the expected value of the detector layer h p : theano 4-tensor the expected value of the pooling layer p h_samples : theano 4-tensor, only returned if theano_rng is not None samples of the detector layer p_samples : theano 4-tensor, only returned if theano_rng is not None samples of the pooling layer Notes ------ all 4-tensors are formatted with axes ('b', 'c', 0, 1). This is for maximum speed when using theano's conv2d to generate z and top_down, or when using it to infer conditionals of other layers using the return values. Detailed description: Suppose you have a variable h that lives in a Conv2DSpace h_space and you want to pool it down to a variable p that lives in a smaller Conv2DSpace p. This function does that, using non-overlapping pools. Specifically, consider one channel of h. h must have a height that is a multiple of pool_shape[0] and a width that is a multiple of pool_shape[1]. A channel of h can thus be broken down into non-overlapping rectangles of shape pool_shape. Now consider one rectangular pooled region within one channel of h. I now use 'h' to refer just to this rectangle, and 'p' to refer to just the one pooling unit associated with that rectangle. We assume that the space that h and p live in is constrained such that h and p are both binary and p = max(h). To reduce the state-space in order to make probabilistic computations cheaper we also constrain sum(h) <= 1. Suppose h contains k different units. Suppose that the only term in the model's energy function involving h is -(z*h).sum() (elemwise multiplication) and the only term in the model's energy function involving p is -(top_down*p).sum(). Then P(h[i] = 1) = softmax( [ z[1], z[2], ..., z[k], -top_down] )[i] and P(p = 1) = 1-softmax( [z[1], z[2], ..., z[k], -top_down])[k] This variation of the function assumes that z, top_down, and all return values use Conv2D axes ('b', 'c', 0, 1). This variation of the function implements the softmax using a theano graph of exp, maximum, sub, and div operations. Performance notes: It might be possible to make a faster implementation with different theano ops. rather than using set_subtensor, it might be possible to use the stuff in theano.sandbox.neighbours. Probably not possible, or at least nasty, because that code isn't written with multiple channels in mind, and I don't think just a reshape can fix it. Some work on this in galatea.cond.neighbs.py At some point images2neighbs' gradient was broken so check that it has been fixed before sinking too much time into this. Stabilizing the softmax is also another source of slowness. Here it is stabilized with several calls to maximum and sub. It might also be possible to stabilize it with T.maximum(-top_down,T.signal.downsample.max_pool(z)). Don't know if that would be faster or slower. Elsewhere in this file I implemented the softmax with a reshape and call to Softmax / SoftmaxWithBias. This is slower, even though Softmax is faster on the GPU than the equivalent max/sub/exp/div graph. Maybe the reshape is too expensive. 
Benchmarks show that most of the time is spent in GpuIncSubtensor when running on gpu. So it is mostly that which needs a faster implementation. One other way to implement this would be with a linear.Conv2D.lmul_T, where the convolution stride is equal to the pool width, and the thing to multiply with is the hparts stacked along the channel axis. Unfortunately, conv2D doesn't work right with stride > 2 and is pretty slow for stride 2. Conv3D is used to mitigate some of this, but only has CPU code. """ z_name = z.name if z_name is None: z_name = 'anon_z' batch_size, ch, zr, zc = z.shape r, c = pool_shape zpart = [] mx = None if top_down is None: t = 0. else: t = -top_down t.name = 'neg_top_down' for i in xrange(r): zpart.append([]) for j in xrange(c): cur_part = z[:, :, i:zr:r, j:zc:c] if z_name is not None: cur_part.name = z_name + '[%d,%d]' % (i, j) zpart[i].append(cur_part) if mx is None: mx = T.maximum(t, cur_part) if cur_part.name is not None: mx.name = 'max(-top_down,' + cur_part.name + ')' else: max_name = None if cur_part.name is not None: mx_name = 'max(' + cur_part.name + ',' + mx.name + ')' mx = T.maximum(mx, cur_part) mx.name = mx_name mx.name = 'local_max(' + z_name + ')' pt = [] for i in xrange(r): pt.append([]) for j in xrange(c): z_ij = zpart[i][j] safe = z_ij - mx safe.name = 'safe_z(%s)' % z_ij.name cur_pt = T.exp(safe) cur_pt.name = 'pt(%s)' % z_ij.name pt[-1].append(cur_pt) off_pt = T.exp(t - mx) off_pt.name = 'p_tilde_off(%s)' % z_name denom = off_pt for i in xrange(r): for j in xrange(c): denom = denom + pt[i][j] denom.name = 'denom(%s)' % z_name off_prob = off_pt / denom p = 1. - off_prob p.name = 'p(%s)' % z_name hpart = [] for i in xrange(r): hpart.append([pt_ij / denom for pt_ij in pt[i]]) h = T.alloc(0., batch_size, ch, zr, zc) for i in xrange(r): for j in xrange(c): h.name = 'h_interm' h = T.set_subtensor(h[:, :, i:zr:r, j:zc:c], hpart[i][j]) h.name = 'h(%s)' % z_name if theano_rng is None: return p, h else: events = [] for i in xrange(r): for j in xrange(c): events.append(hpart[i][j]) events.append(off_prob) events = [event.dimshuffle(0, 1, 2, 3, 'x') for event in events] events = tuple(events) stacked_events = T.concatenate(events, axis=4) rows = zr // pool_shape[0] cols = zc // pool_shape[1] outcomes = pool_shape[0] * pool_shape[1] + 1 assert stacked_events.ndim == 5 for se, bs, r, c, chv in get_debug_values(stacked_events, batch_size, rows, cols, ch): assert se.shape[0] == bs assert se.shape[1] == r assert se.shape[2] == c assert se.shape[3] == chv assert se.shape[4] == outcomes reshaped_events = stacked_events.reshape( (batch_size * rows * cols * ch, outcomes)) multinomial = theano_rng.multinomial(pvals=reshaped_events, dtype=p.dtype) reshaped_multinomial = multinomial.reshape( (batch_size, ch, rows, cols, outcomes)) h_sample = T.alloc(0., batch_size, ch, zr, zc) idx = 0 for i in xrange(r): for j in xrange(c): h_sample = T.set_subtensor( h_sample[:, :, i:zr:r, j:zc:c], reshaped_multinomial[:, :, :, :, idx]) idx += 1 p_sample = 1 - reshaped_multinomial[:, :, :, :, -1] return p, h, p_sample, h_sample
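# Minimal usage sketch for max_pool above (an assumption-laden example: it presumes
# Theano is available; z uses ('b', 'c', 0, 1) axes and spatial dimensions that are
# multiples of pool_shape, as the docstring requires).
import numpy as np
import theano
import theano.tensor as T

z = T.tensor4('z')                             # (batch, channels, rows, cols)
p, h = max_pool(z, pool_shape=(2, 2))          # mean-field values, no sampling
f = theano.function([z], [p, h])
p_val, h_val = f(np.random.randn(3, 1, 4, 4).astype(theano.config.floatX))
# h_val keeps the detector shape (3, 1, 4, 4); p_val is pooled down to (3, 1, 2, 2)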
def setup(self, model, dataset): """ Compiles the theano functions needed for the train method. """ if self.cost is None: self.cost = model.get_default_cost() inf_params = [param for param in model.get_params() if np.any(np.isinf(param.get_value()))] if len(inf_params) > 0: raise ValueError("These params are Inf: "+str(inf_params)) if any([np.any(np.isnan(param.get_value())) for param in model.get_params()]): nan_params = [param for param in model.get_params() if np.any(np.isnan(param.get_value()))] raise ValueError("These params are NaN: "+str(nan_params)) self.model = model self._synchronize_batch_size(model) model._test_batch_size = self.batch_size self.monitor = Monitor.get_monitor(model) self.monitor._sanity_check() data_specs = self.cost.get_data_specs(self.model) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) # Build a flat tuple of Theano Variables, one for each space. # We want that so that if the same space/source is specified # more than once in data_specs, only one Theano Variable # is generated for it, and the corresponding value is passed # only once to the compiled Theano function. theano_args = [] for space, source in safe_zip(space_tuple, source_tuple): name = '%s[%s]' % (self.__class__.__name__, source) arg = space.make_theano_batch(name=name, batch_size=self.batch_size) theano_args.append(arg) theano_args = tuple(theano_args) # Methods of `self.cost` need args to be passed in a format compatible # with data_specs nested_args = mapping.nest(theano_args) fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) self.on_load_batch = fixed_var_descr.on_load_batch cost_value = self.cost.expr(model, nested_args, ** fixed_var_descr.fixed_vars) if cost_value is not None and cost_value.name is None: # Concatenate the name of all tensors in theano_args !? cost_value.name = 'objective' # Set up monitor to model the objective value, learning rate, # momentum (if applicable), and extra channels defined by # the cost learning_rate = self.learning_rate if self.monitoring_dataset is not None: self.monitor.setup(dataset=self.monitoring_dataset, cost=self.cost, batch_size=self.batch_size, num_batches=self.monitoring_batches, extra_costs=self.monitoring_costs, mode=self.monitor_iteration_mode) dataset_name = self.monitoring_dataset.keys()[0] monitoring_dataset = self.monitoring_dataset[dataset_name] #TODO: have Monitor support non-data-dependent channels self.monitor.add_channel(name='learning_rate', ipt=None, val=learning_rate, data_specs=(NullSpace(), ''), dataset=monitoring_dataset) if self.learning_rule: self.learning_rule.add_channels_to_monitor( self.monitor, monitoring_dataset) params = list(model.get_params()) assert len(params) > 0 for i, param in enumerate(params): if param.name is None: param.name = 'sgd_params[%d]' % i grads, updates = self.cost.get_gradients(model, nested_args, ** fixed_var_descr.fixed_vars) if not isinstance(grads, OrderedDict): raise TypeError(str(type(self.cost)) + ".get_gradients returned " + "something with" + str(type(grads)) + "as its " + "first member. 
Expected OrderedDict.") for param in grads: assert param in params for param in params: assert param in grads for param in grads: if grads[param].name is None and cost_value is not None: grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {'costname': cost_value.name, 'paramname': param.name}) assert grads[param].dtype == param.dtype lr_scalers = model.get_lr_scalers() for key in lr_scalers: if key not in params: raise ValueError("Tried to scale the learning rate on " +\ str(key)+" which is not an optimization parameter.") log.info('Parameter and initial learning rate summary:') for param in params: param_name = param.name if param_name is None: param_name = 'anon_param' lr = learning_rate.get_value() * lr_scalers.get(param,1.) log.info('\t' + param_name + ': ' + str(lr)) if self.learning_rule: updates.update(self.learning_rule.get_updates( learning_rate, grads, lr_scalers)) else: # Use standard SGD updates with fixed learning rate. updates.update( dict(safe_zip(params, [param - learning_rate * \ lr_scalers.get(param, 1.) * grads[param] for param in params]))) for param in params: if updates[param].name is None: updates[param].name = 'sgd_update(' + param.name + ')' model.censor_updates(updates) for param in params: update = updates[param] if update.name is None: update.name = 'censor(sgd_update(' + param.name + '))' for update_val in get_debug_values(update): if np.any(np.isinf(update_val)): raise ValueError("debug value of %s contains infs" % update.name) if np.any(np.isnan(update_val)): raise ValueError("debug value of %s contains nans" % update.name) with log_timing(log, 'Compiling sgd_update'): self.sgd_update = function(theano_args, updates=updates, name='sgd_update', on_unused_input='ignore', mode=self.theano_function_mode) self.params = params
def lwta_3d_b012c(b012c, pool_shape, pool_stride, video_shape):
    """
    Local winner-take-all over 3D (time, row, col) pools of a b012c tensor.
    Modified from pylearn2.models.mlp.max_pool_c01b.
    """
    mx = None
    t, r, c = video_shape
    pt, pr, pc = pool_shape
    ts, rs, cs = pool_stride
    assert pt > 0
    assert pr > 0
    assert pc > 0
    assert pt <= t
    assert pr <= r
    assert pc <= c

    # Compute index in pooled space of last needed pool
    # (needed = each input pixel must appear in at least one pool)
    def last_pool(im_shp, p_shp, p_strd):
        rval = int(np.ceil(float(im_shp - p_shp) / p_strd))
        assert p_strd * rval + p_shp >= im_shp
        assert p_strd * (rval - 1) + p_shp < im_shp
        return rval

    # Compute the starting index of the last pool along each axis, then the
    # number of frames/rows/cols needed for all indexes to work out.
    last_pool_t = last_pool(video_shape[0],
                            pool_shape[0],
                            pool_stride[0]) * pool_stride[0]
    required_t = last_pool_t + pt
    last_pool_r = last_pool(video_shape[1],
                            pool_shape[1],
                            pool_stride[1]) * pool_stride[1]
    required_r = last_pool_r + pr
    last_pool_c = last_pool(video_shape[2],
                            pool_shape[2],
                            pool_stride[2]) * pool_stride[2]
    required_c = last_pool_c + pc

    for b012cv in get_debug_values(b012c):
        assert not np.any(np.isinf(b012cv))
        assert b012cv.shape[1] == t
        assert b012cv.shape[2] == r
        assert b012cv.shape[3] == c

    wide_infinity = T.alloc(-np.inf,
                            b012c.shape[0],
                            required_t,
                            required_r,
                            required_c,
                            b012c.shape[4])

    name = b012c.name
    if name is None:
        name = 'anon_b012c'
    b012c = T.set_subtensor(wide_infinity[:, 0:t, 0:r, 0:c, :], b012c)
    b012c.name = 'infinite_padded_' + name

    # First pass: compute the maximum within each pool.
    for time_within_pool in xrange(pool_shape[0]):
        time_stop = last_pool_t + time_within_pool + 1
        for row_within_pool in xrange(pool_shape[1]):
            row_stop = last_pool_r + row_within_pool + 1
            for col_within_pool in xrange(pool_shape[2]):
                col_stop = last_pool_c + col_within_pool + 1
                cur = b012c[:,
                            time_within_pool:time_stop:ts,
                            row_within_pool:row_stop:rs,
                            col_within_pool:col_stop:cs,
                            :]
                if mx is None:
                    mx = cur
                else:
                    mx = T.maximum(mx, cur)

    # Second pass: zero out every entry in each pool except the winner.
    for time_within_pool in xrange(pool_shape[0]):
        time_stop = last_pool_t + time_within_pool + 1
        for row_within_pool in xrange(pool_shape[1]):
            row_stop = last_pool_r + row_within_pool + 1
            for col_within_pool in xrange(pool_shape[2]):
                col_stop = last_pool_c + col_within_pool + 1
                cur = b012c[:,
                            time_within_pool:time_stop:ts,
                            row_within_pool:row_stop:rs,
                            col_within_pool:col_stop:cs,
                            :]
                b012c = T.set_subtensor(
                    b012c[:,
                          time_within_pool:time_stop:ts,
                          row_within_pool:row_stop:rs,
                          col_within_pool:col_stop:cs,
                          :],
                    cur * (cur >= mx))

    # Remove the -inf padding.
    b012c = b012c[:, 0:t, 0:r, 0:c, :]

    for b012cv in get_debug_values(b012c):
        assert not np.any(np.isnan(b012cv))
        assert not np.any(np.isinf(b012cv))

    return b012c
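# Hedged smoke test for lwta_3d_b012c: a minimal sketch assuming Theano is
# available and the function above is in scope. It builds a random
# (batch, time, row, col, channel) tensor and checks that the output keeps the
# input shape, since LWTA only zeroes the non-maximal entries within each pool.
import numpy as np
import theano
import theano.tensor as T

b012c = T.TensorType(theano.config.floatX, (False,) * 5)('b012c')
pooled = lwta_3d_b012c(b012c,
                       pool_shape=(2, 2, 2),
                       pool_stride=(2, 2, 2),
                       video_shape=(4, 4, 4))
f = theano.function([b012c], pooled)

x = np.random.randn(2, 4, 4, 4, 3).astype(theano.config.floatX)
out = f(x)
assert out.shape == x.shape
# Each non-overlapping 2x2x2 pool keeps at most one nonzero value per (batch, channel).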
def setup(self, model, dataset): """ Compiles the theano functions needed for the train method. Parameters ---------- model : a Model instance dataset : Dataset """ if self.cost is None: self.cost = model.get_default_cost() inf_params = [param for param in model.get_params() if contains_inf(param.get_value())] if len(inf_params) > 0: raise ValueError("These params are Inf: "+str(inf_params)) if any([contains_nan(param.get_value()) for param in model.get_params()]): nan_params = [param for param in model.get_params() if contains_nan(param.get_value())] raise ValueError("These params are NaN: "+str(nan_params)) self.model = model self._synchronize_batch_size(model) model._test_batch_size = self.batch_size self.monitor = Monitor.get_monitor(model) self.monitor._sanity_check() # test if force batch size and batch size has_force_batch_size = getattr(model, "force_batch_size", False) train_dataset_is_uneven = \ dataset.get_num_examples() % self.batch_size != 0 has_monitoring_datasets = \ self.monitoring_dataset is not None and \ self.monitoring_dataset.values() > 0 if has_monitoring_datasets: monitoring_datasets_are_uneven = \ any(d.get_num_examples() % self.batch_size != 0 for d in self.monitoring_dataset.values()) else: monitoring_datasets_are_uneven = False # or True it doesn't matter if has_force_batch_size and train_dataset_is_uneven and \ not has_uniform_batch_size(self.train_iteration_mode): raise ValueError("Dataset size is not a multiple of batch size." "You should set train_iteration_mode (and " "maybe monitor_iteration_mode) to " "even_sequential, even_shuffled_sequential or " "even_batchwise_shuffled_sequential") if has_force_batch_size and has_monitoring_datasets and \ monitoring_datasets_are_uneven and \ not has_uniform_batch_size(self.monitor_iteration_mode): raise ValueError("Dataset size is not a multiple of batch size." "You should set monitor_iteration_mode to " "even_sequential, even_shuffled_sequential or " "even_batchwise_shuffled_sequential") data_specs = self.cost.get_data_specs(self.model) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) # Build a flat tuple of Theano Variables, one for each space. # We want that so that if the same space/source is specified # more than once in data_specs, only one Theano Variable # is generated for it, and the corresponding value is passed # only once to the compiled Theano function. theano_args = [] for space, source in safe_zip(space_tuple, source_tuple): name = '%s[%s]' % (self.__class__.__name__, source) arg = space.make_theano_batch(name=name, batch_size=self.batch_size) theano_args.append(arg) theano_args = tuple(theano_args) # Methods of `self.cost` need args to be passed in a format compatible # with data_specs nested_args = mapping.nest(theano_args) fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) self.on_load_batch = fixed_var_descr.on_load_batch cost_value = self.cost.expr(model, nested_args, ** fixed_var_descr.fixed_vars) if cost_value is not None and cost_value.name is None: # Concatenate the name of all tensors in theano_args !? 
cost_value.name = 'objective' learning_rate = self.learning_rate params = list(model.get_params()) assert len(params) > 0 for i, param in enumerate(params): if param.name is None: param.name = 'sgd_params[%d]' % i grads, updates = self.cost.get_gradients(model, nested_args, ** fixed_var_descr.fixed_vars) if not isinstance(grads, OrderedDict): raise TypeError(str(type(self.cost)) + ".get_gradients returned " + "something with" + str(type(grads)) + "as its " + "first member. Expected OrderedDict.") for param in grads: assert param in params for param in params: assert param in grads for param in grads: if grads[param].name is None and cost_value is not None: grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {'costname': cost_value.name, 'paramname': param.name}) assert grads[param].dtype == param.dtype lr_scalers = model.get_lr_scalers() for key in lr_scalers: if key not in params: raise ValueError("Tried to scale the learning rate on " +\ str(key)+" which is not an optimization parameter.") log.info('Parameter and initial learning rate summary:') for param in params: param_name = param.name if param_name is None: param_name = 'anon_param' lr = learning_rate.get_value() * lr_scalers.get(param,1.) log.info('\t' + param_name + ': ' + str(lr)) if self.learning_rule: updates.update(self.learning_rule.get_updates( learning_rate, grads, lr_scalers)) else: # Use standard SGD updates with fixed learning rate. updates.update( dict(safe_zip(params, [param - learning_rate * \ lr_scalers.get(param, 1.) * grads[param] for param in params]))) for param in params: if updates[param].name is None: updates[param].name = 'sgd_update(' + param.name + ')' model.modify_updates(updates) for param in params: update = updates[param] if update.name is None: update.name = 'censor(sgd_update(' + param.name + '))' for update_val in get_debug_values(update): if contains_inf(update_val): raise ValueError("debug value of %s contains infs" % update.name) if contains_nan(update_val): raise ValueError("debug value of %s contains nans" % update.name) # Set up monitor to model the objective value, learning rate, # momentum (if applicable), and extra channels defined by # the cost. # We have to do that after learning_rule.get_updates has been # called, since it may have an effect on # learning_rule.add_channels_to_monitor (that is currently the case # for AdaDelta and RMSProp). self._setup_monitor() with log_timing(log, 'Compiling sgd_update'): self.sgd_update = function(theano_args, updates=updates, name='sgd_update', on_unused_input='ignore', mode=self.theano_function_mode) self.params = params
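# Hedged sketch of the kind of hook model.modify_updates(updates) invokes above:
# the model gets a chance to rewrite the update dictionary (for example to enforce
# a constraint) before the Theano function is compiled. The max-column-norm
# constraint below is purely illustrative, not something setup() itself imposes,
# and the stand-in SGD step is an assumption for the sake of a runnable example.
import numpy as np
import theano
import theano.tensor as T
from collections import OrderedDict

W = theano.shared(np.random.randn(5, 3).astype(theano.config.floatX), name='W')
updates = OrderedDict([(W, W - 0.01 * T.ones_like(W))])  # stand-in SGD step

def clip_column_norms(updates, param, max_col_norm=1.9):
    """Rescale columns of the updated parameter so no column norm exceeds max_col_norm."""
    updated = updates[param]
    col_norms = T.sqrt(T.sum(T.sqr(updated), axis=0))
    desired = T.clip(col_norms, 0., max_col_norm)
    updates[param] = updated * desired / (1e-7 + col_norms)

clip_column_norms(updates, W)
step = theano.function([], updates=updates)
step()  # applies one constrained update to W in place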
# Build momentum-SGD updates for the mean-field model's parameters.
# mf1_cost, mf1mod, momentum, lr, Xb and yb are defined earlier in the script.
updates = {}

alpha = T.scalar()
alpha.name = 'alpha'
alpha.tag.test_value = lr

for cost, params in [(mf1_cost, mf1mod.get_params())]:  # (mfn_cost, mfnmod.get_params()) disabled
    for param in params:
        if param.name != 'mu' and param.name != 'beta':
            inc = sharedX(np.zeros(param.get_value().shape))
            grad = T.grad(cost, param)
            # grad = Print('d cost / d ' + param.name,
            #              attrs=['min', 'max'])(grad)
            new_inc = momentum * inc - alpha * grad
            for v in get_debug_values(new_inc):
                assert not np.any(np.isnan(v))
                assert not np.any(np.isinf(v))
            updates[param] = param + new_inc
            # updates[param] = Print('updates[' + param.name + ']',
            #                        attrs=['min', 'max'])(updates[param])
            for v in get_debug_values(updates[param]):
                assert not np.any(np.isnan(v))
                assert not np.any(np.isinf(v))
            updates[inc] = new_inc

from theano import function
func = function([Xb, yb, alpha], updates=updates)

nodes = func.maker.fgraph.toposort()
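# Hedged usage sketch for the compiled `func` above: alpha is passed per call,
# so the learning rate can be annealed from the outer loop. X_train, y_train,
# batch_size and n_epochs are assumptions standing in for the elided parts of
# the surrounding script; their shapes and dtypes must match Xb and yb.
for epoch in xrange(n_epochs):
    for start in xrange(0, X_train.shape[0], batch_size):
        stop = start + batch_size
        func(X_train[start:stop], y_train[start:stop], lr / (1. + 0.01 * epoch))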