class ClassBasedOutput(Softmax):
    """
    A two-level (class-based) softmax output layer: words are grouped into
    clusters, and p(w | h) is factored as p(cluster | h) * p(w | cluster, h).
    """

    def __init__(self, n_clusters=None, classclusterpath=None, **kwargs):
        super(ClassBasedOutput, self).__init__(**kwargs)
        self.n_clusters = n_clusters
        del self.b
        self.b_class = sharedX(np.zeros((self.n_clusters, self.n_classes)),
                               name='softmax_b_class')
        self.b_cluster = sharedX(np.zeros((self.n_clusters,)),
                                 name='softmax_b_clusters')
        npz_clust = serial.load(classclusterpath)
        array_clusters = npz_clust['wordwithclusters']
        keys = range(n_clusters)
        self.clusters_scope = dict(zip(keys,
                                       np.bincount(array_clusters.astype(int))))
        self.array_clusters = sharedX(array_clusters)

    def set_input_space(self, space):
        self.input_space = space
        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))
        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)
        if self.no_affine:
            desired_dim = self.n_classes
            assert self.input_dim == desired_dim
        else:
            desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)
        if not self.needs_reformat:
            assert self.desired_space == self.input_space
        rng = self.mlp.rng

        if self.no_affine:
            self._params = []
        else:
            if self.irange is not None:
                assert self.istdev is None
                assert self.sparse_init is None
                W_cluster = rng.uniform(-self.irange, self.irange,
                                        (self.input_dim, self.n_clusters))
                W_class = rng.uniform(-self.irange, self.irange,
                                      (self.n_clusters, self.input_dim,
                                       self.n_classes))
            elif self.istdev is not None:
                assert self.sparse_init is None
                W_cluster = rng.randn(self.input_dim,
                                      self.n_clusters) * self.istdev
                W_class = rng.randn(self.n_clusters, self.input_dim,
                                    self.n_classes) * self.istdev
            else:
                raise NotImplementedError()

            # set the extra dummy weights to 0
            for key in self.clusters_scope.keys():
                W_class[int(key), :, :self.clusters_scope[key]] = 0.

            self.W_class = sharedX(W_class, 'softmax_W_class')
            self.W_cluster = sharedX(W_cluster, 'softmax_W_cluster')
            self._params = [self.b_class, self.W_class,
                            self.b_cluster, self.W_cluster]

    def get_layer_monitoring_channels(self, state_below=None,
                                      state=None, targets=None):
        if self.no_affine:
            return OrderedDict()

        W_class = self.W_class
        W_cluster = self.W_cluster
        assert W_class.ndim == 3
        assert W_cluster.ndim == 2
        sq_W = T.sqr(W_cluster)
        sq_W_class = T.sqr(W_class)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))
        row_norms_class = T.sqrt(sq_W_class.sum(axis=1))
        col_norms_class = T.sqrt(sq_W_class.sum(axis=0))

        rval = OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
            ('class_row_norms_min', row_norms_class.min()),
            ('class_row_norms_mean', row_norms_class.mean()),
            ('class_row_norms_max', row_norms_class.max()),
            ('class_col_norms_min', col_norms_class.min()),
            ('class_col_norms_mean', col_norms_class.mean()),
            ('class_col_norms_max', col_norms_class.max()),
        ])

        if (state_below is not None) or (state is not None):
            if state is None:
                state = self.fprop(state_below, targets)
            probclass, probcluster = state
            mx = probclass.max(axis=1)
            rval.update(OrderedDict([('mean_max_class', mx.mean()),
                                     ('max_max_class', mx.max()),
                                     ('min_max_class', mx.min())]))
            if targets is not None:
                rval['nll'] = self.cost(Y=targets,
                                        Y_hat=(probclass, probcluster))
                rval['perplexity'] = 10 ** (rval['nll'] /
                                            np.log(10).astype('float32'))
                rval['entropy'] = rval['nll'] / np.log(2).astype('float32')
        return rval

    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a pair of softmax estimates
        (class probabilities, cluster probabilities). Returns the negative
        log probability of Y under the Y_hat distribution.
        """
        y_probclass, y_probcluster = Y_hat
        # look up each target word's cluster
        CLS = self.array_clusters[T.cast(T.argmax(Y, axis=1), 'int32')]

        assert hasattr(y_probclass, 'owner')
        owner = y_probclass.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            y_probclass, = owner.inputs
            owner = y_probclass.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z_class, = owner.inputs
        assert z_class.ndim == 2

        assert hasattr(y_probcluster, 'owner')
        owner = y_probcluster.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            y_probcluster, = owner.inputs
            owner = y_probcluster.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z_cluster, = owner.inputs
        assert z_cluster.ndim == 2

        # class term: numerically stable log-softmax
        z_class = z_class - z_class.max(axis=1).dimshuffle(0, 'x')
        log_prob = z_class - T.log(T.exp(z_class).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1

        # cluster term
        z_cluster = z_cluster - z_cluster.max(axis=1).dimshuffle(0, 'x')
        log_prob_cls = z_cluster - T.log(T.exp(z_cluster).sum(axis=1).dimshuffle(0, 'x'))
        out = OneHotFormatter(self.n_clusters).theano_expr(CLS.astype('int32'))
        log_prob_of_cls = (out * log_prob_cls).sum(axis=1)
        assert log_prob_of_cls.ndim == 1

        # log p(w | history) = log p(c | s) + log p(w | c, s)
        log_prob_of = log_prob_of + log_prob_of_cls
        rval = log_prob_of.mean()
        return -rval

    def fprop(self, state_below, targets):
        self.input_space.validate(state_below)
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)
        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size " +
                                 str(self.mlp.batch_size) + " but has " +
                                 str(value.shape[0]))
        self.desired_space.validate(state_below)
        assert state_below.ndim == 2

        if not hasattr(self, 'no_affine'):
            self.no_affine = False
        if self.no_affine:
            raise NotImplementedError()

        assert self.W_class.ndim == 3
        assert self.W_cluster.ndim == 2

        # we get the cluster by doing h W_cluster + b_cluster
        probcluster = T.dot(state_below, self.W_cluster) + self.b_cluster
        probcluster = T.nnet.softmax(probcluster)

        # check this line again: look up each example's cluster from its
        # target word
        batch_clusters = self.array_clusters[
            T.cast(T.argmax(targets, axis=1).flatten(), 'int32')]
        Z = T.nnet.GroupDot(self.n_clusters)(state_below,
                                             self.W_class,
                                             self.b_class,
                                             T.cast(batch_clusters, 'int32'))
        probclass = T.nnet.softmax(Z)

        for value in get_debug_values(probclass):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size
        return probclass, probcluster

    def get_weights_format(self):
        return ('v', 'h', 'h_c')

    def get_biases(self):
        return self.b_class.get_value(), self.b_cluster.get_value()

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()
        return self.W_cluster.get_value(), self.W_class.get_value()
class MultiSoftmax(Layer):
    """
    A layer of n_groups independent softmaxes, each over n_classes classes.
    """

    def __init__(self, n_groups, n_classes, layer_name, irange=None,
                 istdev=None, sparse_init=None, W_lr_scale=None,
                 b_lr_scale=None, max_row_norm=None, no_affine=False,
                 max_col_norm=None):
        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)
        self.__dict__.update(locals())
        del self.self
        assert isinstance(n_classes, py_integer_types)
        self.output_space = MatrixSpace(n_groups, n_classes)
        self.b = sharedX(np.zeros((n_groups, n_classes)), name='softmax_b')

    def get_lr_scalers(self):
        rval = OrderedDict()
        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale
        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None
        if self.b_lr_scale is not None:
            assert isinstance(self.b_lr_scale, float)
            rval[self.b] = self.b_lr_scale
        return rval

    def get_monitoring_channels(self):
        return OrderedDict()

    def get_monitoring_channels_from_state(self, state, target=None):
        return OrderedDict()

    def set_input_space(self, space):
        self.input_space = space
        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))
        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)
        if self.no_affine:
            desired_dim = self.n_classes
            assert self.input_dim == desired_dim
        else:
            desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)
        if not self.needs_reformat:
            assert self.desired_space == self.input_space
        rng = self.mlp.rng
        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_groups, self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_groups,
                          self.n_classes) * self.istdev
        else:
            raise NotImplementedError()
        self.W = sharedX(W, 'softmax_W')
        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()
        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def fprop(self, state_below):
        self.input_space.validate(state_below)
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)
        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size " +
                                 str(self.mlp.batch_size) + " but has " +
                                 str(value.shape[0]))
        self.desired_space.validate(state_below)
        assert state_below.ndim == 2
        assert self.W.ndim == 3
        Z = T.tensordot(state_below, self.W, axes=[[1], [0]]) + self.b
        rval = batched_softmax(Z)
        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size
        return rval

    def cost(self, Y, Y_hat):
        return self.cost_from_cost_matrix(self.cost_matrix(Y, Y_hat))

    def cost_from_cost_matrix(self, cost_matrix):
        return cost_matrix.sum(axis=2).mean()

    def cost_matrix(self, Y, Y_hat):
        # small epsilon guards against log(0)
        return -Y * T.log(Y_hat + 0.000001)

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        return coeff * T.sqr(self.W).sum()

    def get_l1_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W = self.W
        return coeff * abs(W).sum()

    def censor_updates(self, updates):
        if self.max_row_norm is not None:
            W = self.W
            if W in updates:
                updated_W = updates[W]
                row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1))
                desired_norms = T.clip(row_norms, 0, self.max_row_norm)
                updates[W] = updated_W * (desired_norms /
                                          (1e-7 + row_norms)).dimshuffle(0, 'x')
        if self.max_col_norm is not None:
            assert self.max_row_norm is None
            W = self.W
            if W in updates:
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))
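
# Illustrative sketch, separate from the layer above: what
# T.tensordot(state_below, W, axes=[[1], [0]]) + b followed by a batched
# softmax computes, written out in numpy. The function name and sizes are
# hypothetical.
def _demo_multi_softmax():
    import numpy as np
    rng = np.random.RandomState(0)
    batch, dim, n_groups, n_classes = 4, 6, 3, 5
    X = rng.randn(batch, dim)
    W = rng.randn(dim, n_groups, n_classes)
    b = np.zeros((n_groups, n_classes))

    # (batch, dim) x (dim, groups, classes) -> (batch, groups, classes)
    Z = np.tensordot(X, W, axes=([1], [0])) + b
    Z = Z - Z.max(axis=-1, keepdims=True)     # shift for numerical stability
    E = np.exp(Z)
    P = E / E.sum(axis=-1, keepdims=True)     # softmax per (example, group)
    assert np.allclose(P.sum(axis=-1), 1.)
    return P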
class BoltzmannIsingVisible(VisibleLayer):
    """
    An IsingVisible whose parameters are defined in Boltzmann machine space.
    """

    def __init__(self, nvis, bias_from_marginals=None):
        """
        nvis: the dimension of the space
        bias_from_marginals: a dataset, whose marginals are used to
            initialize the visible biases
        """
        self.__dict__.update(locals())
        del self.self
        # Don't serialize the dataset
        del self.bias_from_marginals

        self.space = VectorSpace(nvis)
        self.input_space = self.space
        origin = self.space.get_origin()

        if bias_from_marginals is None:
            init_bias = np.zeros((nvis,))
        else:
            # data is in [-1, 1], but we want biases for a sigmoid
            init_bias = init_sigmoid_bias_from_array(
                bias_from_marginals.X / 2. + 0.5)
        self.boltzmann_bias = sharedX(init_bias, 'visible_bias')

    def get_biases(self):
        # not really sure what this should do for this layer
        assert False

    def set_biases(self, biases, recenter=False):
        # not really sure what this should do for this layer
        assert False

    def ising_bias(self, for_sampling=False):
        if for_sampling and self.layer_above.sampling_b_stdev is not None:
            return self.noisy_sampling_b
        return 0.5 * self.boltzmann_bias + 0.25 * self.layer_above.W.sum(axis=1)

    def ising_bias_numpy(self):
        return 0.5 * self.boltzmann_bias.get_value() + \
            0.25 * self.layer_above.W.get_value().sum(axis=1)

    def upward_state(self, total_state):
        return total_state

    def get_params(self):
        rval = [self.boltzmann_bias]
        return rval

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):
        assert state_below is None
        msg = layer_above.downward_message(state_above, for_sampling=True)
        bias = self.ising_bias(for_sampling=True)
        z = msg + bias
        phi = T.nnet.sigmoid(2. * z)
        rval = theano_rng.binomial(size=phi.shape, p=phi,
                                   dtype=phi.dtype, n=1)
        return rval * 2. - 1.

    def make_state(self, num_examples, numpy_rng):
        driver = numpy_rng.uniform(0., 1., (num_examples, self.nvis))
        on_prob = sigmoid_numpy(2. * self.ising_bias_numpy())
        sample = 2. * (driver < on_prob) - 1.
        rval = sharedX(sample, name='v_sample_shared')
        return rval

    def make_symbolic_state(self, num_examples, theano_rng):
        mean = T.nnet.sigmoid(2. * self.ising_bias())
        rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean)
        rval = 2. * rval - 1.
        return rval

    def expected_energy_term(self, state, average,
                             state_below=None, average_below=None):
        assert state_below is None
        assert average_below is None
        assert average in [True, False]
        self.space.validate(state)
        # Energy function is linear so it doesn't matter if we're
        # averaging or not
        rval = -T.dot(state, self.ising_bias())
        assert rval.ndim == 1
        return rval

    def get_monitoring_channels(self):
        rval = OrderedDict()
        ising_b = self.ising_bias()
        rval['ising_b_min'] = ising_b.min()
        rval['ising_b_max'] = ising_b.max()
        if hasattr(self, 'noisy_sampling_b'):
            rval['noisy_sampling_b_min'] = self.noisy_sampling_b.min()
            rval['noisy_sampling_b_max'] = self.noisy_sampling_b.max()
        return rval
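
# Illustrative check, separate from the layer above: the coefficients in
# ising_bias() come from substituting v = (s + 1) / 2 into the Boltzmann
# energy -b_v^T v - v^T W h - b_h^T h, which yields an Ising bias of
# b_v / 2 + W.sum(axis=1) / 4 on s (and similarly for the hidden side).
# The function name is hypothetical; only numpy is assumed. The two
# energies should agree up to a state-independent constant.
def _check_boltzmann_to_ising_bias():
    import numpy as np
    rng = np.random.RandomState(0)
    nvis, nhid = 4, 3
    b_v = rng.randn(nvis)
    b_h = rng.randn(nhid)
    W = rng.randn(nvis, nhid)

    # Ising-space parameters implied by the substitution v = (s + 1) / 2
    beta_v = 0.5 * b_v + 0.25 * W.sum(axis=1)
    beta_h = 0.5 * b_h + 0.25 * W.sum(axis=0)
    J = 0.25 * W

    diffs = []
    for _ in range(5):
        v = rng.randint(0, 2, nvis)
        h = rng.randint(0, 2, nhid)
        s_v = 2. * v - 1.
        s_h = 2. * h - 1.
        E_boltz = -b_v.dot(v) - v.dot(W).dot(h) - b_h.dot(h)
        E_ising = -beta_v.dot(s_v) - s_v.dot(J).dot(s_h) - beta_h.dot(s_h)
        diffs.append(E_boltz - E_ising)
    # constant offset only: same distribution after normalization
    assert np.allclose(diffs, diffs[0])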
class IsingVisible(VisibleLayer):
    """
    A DBM visible layer consisting of random variables living in a
    VectorSpace, with values in {-1, 1}.

    Implements the energy function term -b^T v.
    """

    def __init__(self, nvis, bias_from_marginals=None):
        """
        nvis: the dimension of the space
        bias_from_marginals: a dataset, whose marginals are used to
            initialize the visible biases
        """
        self.__dict__.update(locals())
        del self.self
        # Don't serialize the dataset
        del self.bias_from_marginals

        self.space = VectorSpace(nvis)
        self.input_space = self.space
        origin = self.space.get_origin()

        if bias_from_marginals is None:
            init_bias = np.zeros((nvis,))
        else:
            init_bias = init_tanh_bias_from_marginals(bias_from_marginals)
        self.bias = sharedX(init_bias, 'visible_bias')

    def get_biases(self):
        return self.bias.get_value()

    def set_biases(self, biases, recenter=False):
        self.bias.set_value(biases)
        if recenter:
            assert self.center
            self.offset.set_value(sigmoid_numpy(self.bias.get_value()))

    def upward_state(self, total_state):
        return total_state

    def get_params(self):
        return [self.bias]

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):
        assert state_below is None
        msg = layer_above.downward_message(state_above)
        bias = self.bias
        z = msg + bias
        phi = T.nnet.sigmoid(2. * z)
        rval = theano_rng.binomial(size=phi.shape, p=phi,
                                   dtype=phi.dtype, n=1)
        return rval * 2. - 1.

    def make_state(self, num_examples, numpy_rng):
        driver = numpy_rng.uniform(0., 1., (num_examples, self.nvis))
        on_prob = sigmoid_numpy(2. * self.bias.get_value())
        sample = 2. * (driver < on_prob) - 1.
        rval = sharedX(sample, name='v_sample_shared')
        return rval

    def make_symbolic_state(self, num_examples, theano_rng):
        mean = T.nnet.sigmoid(2. * self.bias)
        rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean)
        rval = 2. * rval - 1.
        return rval

    def expected_energy_term(self, state, average,
                             state_below=None, average_below=None):
        assert state_below is None
        assert average_below is None
        assert average in [True, False]
        self.space.validate(state)
        # Energy function is linear so it doesn't matter if we're
        # averaging or not
        rval = -T.dot(state, self.bias)
        assert rval.ndim == 1
        return rval
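
# Illustrative sketch, separate from the layer above: why sampling a
# {-1, +1} unit uses sigmoid(2z). For s in {-1, +1} with energy -z*s,
# P(s = +1) = e^z / (e^z + e^-z) = sigmoid(2z), and E[s] = tanh(z).
# The function name is hypothetical; only numpy is assumed.
def _demo_ising_unit_sampling():
    import numpy as np
    rng = np.random.RandomState(0)
    z = 0.3                                  # total input to one unit
    p_on = 1. / (1. + np.exp(-2. * z))       # sigmoid(2z)
    samples = np.where(rng.uniform(size=10000) < p_on, 1., -1.)
    assert abs(samples.mean() - np.tanh(z)) < 0.05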
class ToyRNNPhone(Model):
    """
    WRITEME
    """

    def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05,
                 non_linearity='sigmoid', use_ground_truth=True):
        allowed_non_linearities = {'sigmoid': T.nnet.sigmoid,
                                   'tanh': T.tanh}
        self.nvis = nvis
        self.nhid = nhid
        self.hidden_transition_model = hidden_transition_model
        self.use_ground_truth = use_ground_truth
        self.alpha = sharedX(1)
        self.alpha_decrease_rate = 0.999

        assert non_linearity in allowed_non_linearities
        self.non_linearity = allowed_non_linearities[non_linearity]

        # Space initialization
        self.input_space = VectorSpace(dim=self.nvis)
        self.hidden_space = VectorSpace(dim=self.nhid)
        self.output_space = VectorSpace(dim=1)
        self.input_source = 'features'
        self.target_source = 'targets'

        # Features-to-hidden matrix
        W_value = numpy.random.uniform(low=-irange, high=irange,
                                       size=(self.nvis, self.nhid))
        self.W = sharedX(W_value, name='W')
        # Hidden biases
        b_value = numpy.zeros(self.nhid)
        self.b = sharedX(b_value, name='b')
        # Hidden-to-out matrix
        U_value = numpy.random.uniform(low=-irange, high=irange,
                                       size=(self.nhid, 1))
        self.U = sharedX(U_value, name='U')
        # Output bias
        c_value = numpy.zeros(1)
        self.c = sharedX(c_value, name='c')

    def fprop_step(self, features, h_tm1, out):
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0),
            self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() +
            self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def fprop_step_prime(self, truth, features, h_tm1, out):
        # blend the model's last prediction with the ground truth,
        # weighted by the decaying alpha
        features = T.set_subtensor(
            features[-1],
            (1 - self.alpha) * features[-1] + self.alpha * truth[-1])
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0),
            self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() +
            self.b)
        out = T.dot(h, self.U) + self.c
        features = T.concatenate([features[1:], out])
        return features, h, out

    def fprop(self, data):
        if self.use_ground_truth:
            self.input_space.validate(data)
            features = data

            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda f, h, o: self.fprop_step(f, h, o)

            ((h, out), updates) = theano.scan(
                fn=fn,
                sequences=[features],
                outputs_info=[dict(initial=init_h, taps=[-1]), init_out])
            return out
        else:
            self.input_space.validate(data)
            features = data

            init_in = features[0]
            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda t, f, h, o: self.fprop_step_prime(t, f, h, o)

            ((f, h, out), updates) = theano.scan(
                fn=fn,
                sequences=[features],
                outputs_info=[init_in,
                              dict(initial=init_h, taps=[-1]),
                              init_out])
            return out

    def predict_next(self, features, h_tm1):
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0),
            self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() +
            self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def get_params(self):
        return [self.W, self.b, self.U, self.c] + \
            self.hidden_transition_model.get_params()

    def get_input_source(self):
        return self.input_source

    def get_target_source(self):
        return self.target_source

    def censor_updates(self, updates):
        updates[self.alpha] = self.alpha_decrease_rate * self.alpha

    def get_monitoring_channels(self, data):
        rval = OrderedDict()
        rval['alpha'] = self.alpha
        return rval
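
# Illustrative sketch, separate from the model above: the decaying alpha in
# fprop_step_prime resembles scheduled sampling -- early in training the last
# input frame is mostly ground truth, later it is mostly the model's own
# prediction. The function name and values are hypothetical.
def _demo_alpha_mixing():
    alpha = 1.0                    # start: feed pure ground truth
    truth, prediction = 0.8, 0.2
    for _ in range(1000):
        mixed = (1. - alpha) * prediction + alpha * truth
        alpha *= 0.999             # same decay as alpha_decrease_rate above
    return mixed                   # drifts toward the model's own output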
class HingeLoss(Layer):

    def __init__(self, n_classes, layer_name, irange=None,
                 istdev=None, no_affine=False, sparse_init=None):
        super(HingeLoss, self).__init__()
        self.__dict__.update(locals())
        del self.self
        self.output_space = VectorSpace(n_classes)
        if not self.no_affine:
            self.b = sharedX(np.zeros((n_classes,)), name='hingeloss_b')

    def get_monitoring_channels(self):
        if self.no_affine:
            return OrderedDict()
        W = self.W
        assert W.ndim == 2
        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))
        return OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
        ])

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, state_below=None,
                                      state=None, targets=None):
        rval = OrderedDict()
        if (state_below is not None) or (state is not None):
            if state is None:
                state = self.fprop(state_below)
            mx = state.max(axis=1)
            rval.update(OrderedDict([
                ('mean_max_class', mx.mean()),
                ('max_max_class', mx.max()),
                ('min_max_class', mx.min())]))
            if targets is not None:
                y_hat = self.target_convert(T.argmax(state, axis=1))
                # Assume target is in [0, 1] as binary one-hot
                y = self.target_convert(T.argmax(targets, axis=1))
                misclass = T.neq(y, y_hat).mean()
                misclass = T.cast(misclass, config.floatX)
                rval['misclass'] = misclass
                rval['nll'] = self.cost(Y_hat=state, Y=targets)
        return rval

    def get_monitoring_channels_from_state(self, state, target=None):
        warnings.warn("Layer.get_monitoring_channels_from_state is " +
                      "deprecated. Use get_layer_monitoring_channels " +
                      "instead. Layer.get_monitoring_channels_from_state " +
                      "will be removed on or after september 24th 2014",
                      stacklevel=2)
        mx = state.max(axis=1)
        rval = OrderedDict([
            ('mean_max_class', mx.mean()),
            ('max_max_class', mx.max()),
            ('min_max_class', mx.min())
        ])
        if target is not None:
            y_hat = self.target_convert(T.argmax(state, axis=1))
            # Assume target is in [0, 1] as binary one-hot
            y = self.target_convert(T.argmax(target, axis=1))
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass
            rval['nll'] = self.cost(Y_hat=state, Y=target)
        return rval

    def set_input_space(self, space):
        self.input_space = space
        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))
        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)
        desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)
        if not self.needs_reformat:
            assert self.desired_space == self.input_space
        rng = self.mlp.rng

        if self.no_affine:
            self._params = []
        else:
            if self.irange is not None:
                assert self.istdev is None
                assert self.sparse_init is None
                W = rng.uniform(-self.irange, self.irange,
                                (self.input_dim, self.n_classes))
            elif self.istdev is not None:
                assert self.sparse_init is None
                W = rng.randn(self.input_dim, self.n_classes) * self.istdev
            else:
                assert self.sparse_init is not None
                W = np.zeros((self.input_dim, self.n_classes))
                for i in xrange(self.n_classes):
                    for j in xrange(self.sparse_init):
                        idx = rng.randint(0, self.input_dim)
                        while W[idx, i] != 0.:
                            idx = rng.randint(0, self.input_dim)
                        W[idx, i] = rng.randn()
            self.W = sharedX(W, 'hingeloss_W')
            self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.np_format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()
        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def fprop(self, state_below):
        self.input_space.validate(state_below)
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)
        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size " +
                                 str(self.mlp.batch_size) + " but has " +
                                 str(value.shape[0]))
        self.desired_space.validate(state_below)
        assert state_below.ndim == 2

        if not hasattr(self, 'no_affine'):
            self.no_affine = False
        if self.no_affine:
            rval = state_below
        else:
            assert self.W.ndim == 2
            b = self.b
            W = self.W
            rval = T.dot(state_below, W) + b

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size
        return rval

    def target_convert(self, Y):
        """
        Converts targets from {0, 1} to {-1, 1}.
        """
        Y_t = 2. * Y - 1.
        return Y_t

    def hinge_cost(self, Y, Y_hat):
        # squared hinge loss, summed over classes
        prob = (T.maximum(1 - Y * Y_hat, 0) ** 2.).sum(axis=1)
        return prob

    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a hinge loss estimate of Y.
        """
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert Y_hat.ndim == 2
        Y_t = self.target_convert(Y)
        prob = self.hinge_cost(Y_t, Y_hat)
        assert prob.ndim == 1
        rval = prob.mean()
        return rval

    def cost_matrix(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a hinge loss estimate of Y.
        """
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert Y_hat.ndim == 2
        Y_t = self.target_convert(Y)
        prob = self.hinge_cost(Y_t, Y_hat)
        return prob

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        return coeff * T.sqr(self.W).sum()

    def get_l1_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W = self.W
        return coeff * abs(W).sum()

    @wraps(Layer._modify_updates)
    def _modify_updates(self, updates):
        if self.no_affine:
            return
class Factorized(Softmax):

    def __init__(self, n_classes, layer_name, irange=None,
                 b_lr_scale=None, V_lr_scale=None, U_lr_scale=None,
                 Q_lr_scale=None, Ui_lr_scale=None):
        self.__dict__.update(locals())
        del self.self
        assert isinstance(n_classes, py_integer_types)
        self.output_space = VectorSpace(n_classes)

    def set_input_space(self, space):
        self.input_space = space
        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))
        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)
        desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)
        if not self.needs_reformat:
            assert self.desired_space == self.input_space
        rng = self.mlp.rng

        self._params = []
        V = np.zeros((self.n_classes, self.input_dim), dtype=np.float32)
        self.V = sharedX(V, self.layer_name + "_V")

        U = np.identity(self.input_dim)
        self.U = sharedX(U, self.layer_name + "_U")

        Q = np.zeros((self.input_dim, self.input_dim), dtype=np.float32)
        self.Q = sharedX(Q, self.layer_name + "_Q")

        Ui = np.identity(self.input_dim, dtype=np.float32)
        self.Ui = sharedX(Ui, self.layer_name + "_Ui")

        self._params = [self.U, self.Ui, self.V, self.Q]

    def fprop(self, state_below):
        self.input_space.validate(state_below)
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)
        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size " +
                                 str(self.mlp.batch_size) + " but has " +
                                 str(value.shape[0]))
        self.desired_space.validate(state_below)
        assert state_below.ndim == 2

        # effective output weights are the product W = V U
        W = T.dot(self.V, self.U)
        assert W.ndim == 2
        Z = T.dot(state_below, W.T)
        rval = Z
        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size
        return (rval, state_below)

    def get_params(self):
        rval = []
        rval.append(self.U)
        rval.append(self.Ui)
        rval.append(self.V)
        rval.append(self.Q)
        return rval

    def get_lr_scalers(self):
        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None
        if not hasattr(self, 'V_lr_scale'):
            self.V_lr_scale = None
        if not hasattr(self, 'U_lr_scale'):
            self.U_lr_scale = None
        if not hasattr(self, 'Q_lr_scale'):
            self.Q_lr_scale = None
        if not hasattr(self, 'Ui_lr_scale'):
            self.Ui_lr_scale = None

        rval = OrderedDict()
        if self.b_lr_scale is not None:
            rval[self.b] = self.b_lr_scale
        if self.V_lr_scale is not None:
            rval[self.V] = self.V_lr_scale
        if self.U_lr_scale is not None:
            rval[self.U] = self.U_lr_scale
        if self.Q_lr_scale is not None:
            rval[self.Q] = self.Q_lr_scale
        if self.Ui_lr_scale is not None:
            rval[self.Ui] = self.Ui_lr_scale
        return rval

    def cost(self, Y, Y_hat):
        Y_hat_true, h = Y_hat
        assert hasattr(Y_hat_true, 'owner')
        owner = Y_hat_true.owner
        assert owner is not None
        val = SqLoss()([h, self.Q, self.U, self.Ui, self.V, Y])[0]
        return (T.mean(val, dtype='float32'), (h, T.mean(val, axis=0)))

    def get_monitoring_channels(self):
        W = T.dot(self.V, self.U)
        assert W.ndim == 2
        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))
        return OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
        ])

    def censor_updates(self, updates):
        pass
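
# Illustrative sketch, separate from the layer above: the output scores go
# through a factored weight matrix W = V U, so all classes share the linear
# transform U (initialized to the identity, as in set_input_space). The
# function name and sizes are hypothetical; only numpy is assumed.
def _demo_factorized_output():
    import numpy as np
    rng = np.random.RandomState(0)
    n_classes, dim, batch = 7, 5, 2
    V = rng.randn(n_classes, dim)
    U = np.identity(dim)            # shared transform, identity at init
    X = rng.randn(batch, dim)
    W = V.dot(U)                    # (n_classes, dim) effective weights
    Z = X.dot(W.T)                  # (batch, n_classes) class scores
    return Z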
class BinaryVector(VisibleLayer):
    """
    A DBM visible layer consisting of binary random variables living
    in a VectorSpace.
    """

    def __init__(self, nvis, bias_from_marginals=None):
        """
        nvis: the dimension of the space
        bias_from_marginals: a dataset, whose marginals are used to
            initialize the visible biases
        """
        self.__dict__.update(locals())
        del self.self
        # Don't serialize the dataset
        del self.bias_from_marginals

        self.space = VectorSpace(nvis)
        self.input_space = self.space
        origin = self.space.get_origin()

        if bias_from_marginals is None:
            init_bias = np.zeros((nvis,))
        else:
            X = bias_from_marginals.get_design_matrix()
            assert X.max() == 1.
            assert X.min() == 0.
            assert not np.any((X > 0.) * (X < 1.))
            mean = X.mean(axis=0)
            mean = np.clip(mean, 1e-7, 1 - 1e-7)
            init_bias = inverse_sigmoid_numpy(mean)
        self.bias = sharedX(init_bias, 'visible_bias')

    def get_biases(self):
        return self.bias.get_value()

    def set_biases(self, biases):
        self.bias.set_value(biases)

    def get_total_state_space(self):
        return self.get_input_space()

    def get_params(self):
        return set([self.bias])

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):
        assert state_below is None
        msg = layer_above.downward_message(state_above)
        bias = self.bias
        z = msg + bias
        phi = T.nnet.sigmoid(z)
        rval = theano_rng.binomial(size=phi.shape, p=phi,
                                   dtype=phi.dtype, n=1)
        return rval

    def make_state(self, num_examples, numpy_rng):
        driver = numpy_rng.uniform(0., 1., (num_examples, self.nvis))
        mean = sigmoid_numpy(self.bias.get_value())
        sample = driver < mean
        rval = sharedX(sample, name='v_sample_shared')
        return rval

    def expected_energy_term(self, state, average,
                             state_below=None, average_below=None):
        assert state_below is None
        assert average_below is None
        assert average in [True, False]
        self.space.validate(state)
        # Energy function is linear so it doesn't matter if we're
        # averaging or not
        rval = -T.dot(state, self.bias)
        assert rval.ndim == 1
        return rval
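
# Illustrative sketch, separate from the layer above: initializing visible
# biases so that sigmoid(b) matches the data marginals. It assumes
# inverse_sigmoid_numpy is the logit, b = log(p / (1 - p)); the function
# name and numbers are hypothetical.
def _demo_bias_from_marginals():
    import numpy as np
    mean = np.clip(np.array([0.1, 0.5, 0.9]), 1e-7, 1. - 1e-7)
    init_bias = np.log(mean / (1. - mean))          # inverse sigmoid
    assert np.allclose(1. / (1. + np.exp(-init_bias)), mean)
    return init_bias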
class BinaryVectorMaxPool(HiddenLayer):
    """
    A hidden layer that does max-pooling on binary vectors.
    It has two sublayers, the detector layer and the pooling layer.
    The detector layer is its downward state and the pooling layer is
    its upward state.

    TODO: this layer uses (pooled, detector) as its total state, which
    can be confusing when listing all the states in the network left to
    right. Change this and pylearn2.expr.probabilistic_max_pooling to
    use (detector, pooled)
    """

    def __init__(self, detector_layer_dim, pool_size, layer_name,
                 irange=None, sparse_init=None, include_prob=1.0,
                 init_bias=0.):
        """
        include_prob: probability of including a weight element in the set
            of weights initialized to U(-irange, irange). If not included
            it is initialized to 0.
        """
        self.__dict__.update(locals())
        del self.self
        self.b = sharedX(np.zeros((self.detector_layer_dim,)) + init_bias,
                         name=layer_name + '_b')

    def set_input_space(self, space):
        """
        Note: this resets parameters!
        """
        self.input_space = space
        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        if not (self.detector_layer_dim % self.pool_size == 0):
            raise ValueError("detector_layer_dim = %d, pool_size = %d. "
                             "Should be divisible but remainder is %d" %
                             (self.detector_layer_dim, self.pool_size,
                              self.detector_layer_dim % self.pool_size))

        self.h_space = VectorSpace(self.detector_layer_dim)
        self.pool_layer_dim = self.detector_layer_dim / self.pool_size
        self.output_space = VectorSpace(self.pool_layer_dim)

        rng = self.dbm.rng
        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.detector_layer_dim)) * \
                (rng.uniform(0., 1., (self.input_dim,
                                      self.detector_layer_dim))
                 < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.detector_layer_dim))
            for i in xrange(self.detector_layer_dim):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        W = sharedX(W)
        W.name = self.layer_name + '_W'
        self.transformer = MatrixMul(W)
        W, = self.transformer.get_params()
        assert W.name is not None

    def get_total_state_space(self):
        return CompositeSpace((self.output_space, self.h_space))

    def get_params(self):
        assert self.b.name is not None
        W, = self.transformer.get_params()
        assert W.name is not None
        return self.transformer.get_params().union([self.b])

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    def get_weights(self):
        if self.requires_reformat:
            # This is not really an unimplemented case.
            # We actually don't know how to format the weights in design
            # space. We got the data in topo space and we don't have
            # access to the dataset.
            raise NotImplementedError()
        W, = self.transformer.get_params()
        return W.get_value()

    def set_weights(self, weights):
        W, = self.transformer.get_params()
        W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def get_weights_view_shape(self):
        total = self.detector_layer_dim
        cols = self.pool_size
        if cols == 1:
            # Let the PatchViewer decide how to arrange the units
            # when they're not pooled
            raise NotImplementedError()
        # When they are pooled, make each pooling unit have one row
        rows = total / cols
        return rows, cols

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        W, = self.transformer.get_params()
        W = W.T
        W = W.reshape((self.detector_layer_dim,
                       self.input_space.shape[0],
                       self.input_space.shape[1],
                       self.input_space.nchannels))
        W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))
        return function([], W)()

    def upward_state(self, total_state):
        p, h = total_state
        self.h_space.validate(h)
        self.output_space.validate(p)
        return p

    def downward_state(self, total_state):
        p, h = total_state
        return h

    def get_monitoring_channels_from_state(self, state):
        P, H = state
        rval = {}
        if self.pool_size == 1:
            vars_and_prefixes = [(P, '')]
        else:
            vars_and_prefixes = [(P, 'p_'), (H, 'h_')]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min
            for key, val in [('max_max', v_max.max()),
                             ('max_mean', v_max.mean()),
                             ('max_min', v_max.min()),
                             ('min_max', v_min.max()),
                             ('min_mean', v_min.mean()),
                             ('min_min', v_min.min()),
                             ('range_max', v_range.max()),
                             ('range_mean', v_range.mean()),
                             ('range_min', v_range.min()),
                             ('mean_max', v_mean.max()),
                             ('mean_mean', v_mean.mean()),
                             ('mean_min', v_mean.min())]:
                rval[prefix + key] = val
        return rval

    def get_l1_act_cost(self, state, target, coeff, eps=None):
        rval = 0.
        P, H = state
        self.output_space.validate(P)
        self.h_space.validate(H)

        if self.pool_size == 1:
            # If the pool size is 1 then pools = detectors
            # and we should not penalize pools and detectors separately
            assert len(state) == 2
            assert isinstance(target, float)
            assert isinstance(coeff, float)
            _, state = state
            state = [state]
            target = [target]
            coeff = [coeff]
            if eps is None:
                eps = [0.]
            else:
                eps = [eps]
        else:
            assert all([len(elem) == 2 for elem in [state, target, coeff]])
            if eps is None:
                eps = [0., 0.]
            if target[1] < target[0]:
                warnings.warn("Do you really want to regularize the "
                              "detector units to be sparser than the "
                              "pooling units?")

        for s, t, c, e in safe_zip(state, target, coeff, eps):
            assert all([isinstance(elem, float) for elem in [t, c, e]])
            if c == 0.:
                continue
            m = s.mean(axis=0)
            assert m.ndim == 1
            rval += T.maximum(abs(m - t) - e, 0.).mean() * c

        return rval

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):
        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to "
                             "None so that it may appear after layer_above "
                             "/ state_above in the list.")

        if state_above is not None:
            msg = layer_above.downward_message(state_above)
        else:
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        z = self.transformer.lmul(state_below) + self.b
        p, h, p_sample, h_sample = max_pool_channels(z, self.pool_size,
                                                     msg, theano_rng)
        return p_sample, h_sample

    def downward_message(self, downward_state):
        rval = self.transformer.lmul_T(downward_state)
        if self.requires_reformat:
            rval = self.desired_space.format_as(rval, self.input_space)
        return rval

    def make_state(self, num_examples, numpy_rng):
        """
        Returns a shared variable containing an actual state
        (not a mean field state) for this variable.
        """
        t1 = time.time()

        empty_input = self.h_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)
        default_z = T.zeros_like(h_state) + self.b
        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        p_exp, h_exp, p_sample, h_sample = max_pool_channels(
            z=default_z, pool_size=self.pool_size, theano_rng=theano_rng)
        assert h_sample.dtype == default_z.dtype

        p_state = sharedX(self.output_space.get_origin_batch(num_examples))

        t2 = time.time()
        f = function([], updates={p_state: p_sample, h_state: h_sample})
        t3 = time.time()
        f()
        t4 = time.time()

        print str(self) + '.make_state took', t4 - t1
        print '\tcompose time:', t2 - t1
        print '\tcompile time:', t3 - t2
        print '\texecute time:', t4 - t3

        p_state.name = 'p_sample_shared'
        h_state.name = 'h_sample_shared'
        return p_state, h_state

    def expected_energy_term(self, state, average, state_below,
                             average_below):
        self.input_space.validate(state_below)
        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got "
                                         "shape of %d" %
                                         (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        downward_state = self.downward_state(state)
        self.h_space.validate(downward_state)

        # Energy function is linear so it doesn't matter if we're averaging
        # or not. Specifically, our terms are -u^T W d - b^T d where u is
        # the upward state of the layer below and d is the downward state
        # of this layer.
        bias_term = T.dot(downward_state, self.b)
        weights_term = (self.transformer.lmul(state_below) *
                        downward_state).sum(axis=1)
        rval = -bias_term - weights_term
        assert rval.ndim == 1
        return rval

    def mf_update(self, state_below, state_above, layer_above=None,
                  double_weights=False, iter_name=None):
        self.input_space.validate(state_below)
        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but got "
                                         "shape of %d" %
                                         (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
            msg.name = ('msg_from_' + layer_above.layer_name + '_to_' +
                        self.layer_name + '[' + iter_name + ']')
        else:
            msg = None

        if double_weights:
            state_below = 2. * state_below
            state_below.name = self.layer_name + '_' + iter_name + '_2state'

        z = self.transformer.lmul(state_below) + self.b
        if self.layer_name is not None and iter_name is not None:
            z.name = self.layer_name + '_' + iter_name + '_z'

        p, h = max_pool_channels(z, self.pool_size, msg)
        p.name = self.layer_name + '_p_' + iter_name
        h.name = self.layer_name + '_h_' + iter_name
        return p, h
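
# Illustrative sketch, separate from the layer above: the softmax-style
# pooling that max_pool_channels is assumed to compute for a single pool
# (probabilistic max pooling in the style of Lee et al.). The pool has one
# extra "all detectors off" outcome, and the pooling unit acts as an OR of
# its detectors. The function name and values are hypothetical.
def _demo_max_pool_channels():
    import numpy as np
    z = np.array([0.5, -1.0, 2.0])        # detector pre-activations, one pool
    m = z.max()
    e = np.exp(z - m)
    denom = np.exp(-m) + e.sum()          # "+1" outcome: all detectors off
    h = e / denom                         # P(h_i = 1) for each detector
    p = e.sum() / denom                   # P(p = 1) for the pooling unit
    assert np.allclose(p, h.sum())
    return p, h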
class Softmax(HiddenLayer):

    def __init__(self, n_classes, layer_name, irange=None,
                 sparse_init=None, W_lr_scale=None):
        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)
        self.__dict__.update(locals())
        del self.self
        assert isinstance(n_classes, int)
        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes,)), name='softmax_b')

    def get_lr_scalers(self):
        rval = {}
        # Patch old pickle files
        if not hasattr(self, 'W_lr_scale'):
            self.W_lr_scale = None
        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale
        return rval

    def get_total_state_space(self):
        return self.output_space

    def get_monitoring_channels_from_state(self, state):
        mx = state.max(axis=1)
        return {'mean_max_class': mx.mean(),
                'max_max_class': mx.max(),
                'min_max_class': mx.min()}

    def set_input_space(self, space):
        self.input_space = space
        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))
        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)
        self.desired_space = VectorSpace(self.input_dim)
        if not self.needs_reformat:
            assert self.desired_space == self.input_space
        rng = self.dbm.rng

        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_classes))
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W, 'softmax_W')
        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()
        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):
        if state_above is not None:
            # If you implement this case, also add a unit test for it.
            # Or at least add a warning that it is not tested.
            raise NotImplementedError()

        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to "
                             "None so that it may appear after layer_above "
                             "/ state_above in the list.")

        self.input_space.validate(state_below)

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)
        self.desired_space.validate(state_below)

        z = T.dot(state_below, self.W) + self.b
        h_exp = T.nnet.softmax(z)
        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)
        return h_sample

    def mf_update(self, state_below, state_above=None, layer_above=None,
                  double_weights=False, iter_name=None):
        if state_above is not None:
            raise NotImplementedError()
        if double_weights:
            raise NotImplementedError()

        self.input_space.validate(state_below)

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)
        self.desired_space.validate(state_below)

        assert self.W.ndim == 2
        assert state_below.ndim == 2
        b = self.b
        Z = T.dot(state_below, self.W) + b
        rval = T.nnet.softmax(Z)
        return rval

    def downward_message(self, downward_state):
        rval = T.dot(downward_state, self.W.T)
        rval = self.desired_space.format_as(rval, self.input_space)
        return rval

    def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale):
        """
        scale is because the visible layer also goes into the cost.
        it uses the mean over units and examples, so that the scale of
        the cost doesn't change too much with batch size or example size.
        we need to multiply this cost by scale to make sure that it is
        put on the same scale as the reconstruction cost for the visible
        units. ie, scale should be 1/nvis
        """
        Y_hat = Y_hat_unmasked
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        # numerically stable log-softmax
        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        masked = log_prob_of * drop_mask_Y
        assert masked.ndim == 1

        rval = masked.mean() * scale
        return -rval

    def make_state(self, num_examples, numpy_rng):
        """
        Returns a shared variable containing an actual state
        (not a mean field state) for this variable.
        """
        t1 = time.time()

        empty_input = self.output_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)
        default_z = T.zeros_like(h_state) + self.b
        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        h_exp = T.nnet.softmax(default_z)
        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        t2 = time.time()
        f = function([], updates={h_state: h_sample})
        t3 = time.time()
        f()
        t4 = time.time()

        print str(self) + '.make_state took', t4 - t1
        print '\tcompose time:', t2 - t1
        print '\tcompile time:', t3 - t2
        print '\texecute time:', t4 - t3

        h_state.name = 'softmax_sample_shared'
        return h_state

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        return coeff * T.sqr(self.W).sum()

    def expected_energy_term(self, state, average, state_below,
                             average_below):
        self.input_space.validate(state_below)
        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)
        self.desired_space.validate(state_below)

        # Energy function is linear so it doesn't matter if we're averaging
        # or not. Specifically, our terms are -u^T W d - b^T d where u is
        # the upward state of the layer below and d is the downward state
        # of this layer.
        bias_term = T.dot(state, self.b)
        weights_term = (T.dot(state_below, self.W) * state).sum(axis=1)
        rval = -bias_term - weights_term
        assert rval.ndim == 1
        return rval
class ClassBasedOutput(Softmax):
    # TODO cleanup target, class name mess, it's confusing

    def __init__(self, n_clusters=None, classclusterpath=None,
                 clusters_scope=None, **kwargs):
        super(ClassBasedOutput, self).__init__(**kwargs)
        self.n_clusters = n_clusters

        del self.b
        self.b_class = sharedX(np.zeros((self.n_clusters, self.n_classes)),
                               name='softmax_b_class')
        self.b_cluster = sharedX(np.zeros((self.n_clusters,)),
                                 name='softmax_b_clusters')

        npz_data = serial.load("${PYLEARN2_DATA_PATH}/PennTreebankCorpus/" +
                               classclusterpath)
        self.classclusters = sharedX(npz_data['wordwithclusters'],
                                     'classclusters')
        # cluster_targets is an (n_classes,) array that tells, for each
        # word, which cluster it belongs to
        self.cluster_targets = np.random.randint(0, n_clusters,
                                                 size=(self.n_classes,))
        keys = range(n_clusters)
        self.clusters_scope = dict(zip(keys,
                                       np.bincount(self.cluster_targets)))
        #self._group_dot = _group_dot

    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        if self.no_affine:
            desired_dim = self.n_classes
            assert self.input_dim == desired_dim
        else:
            desired_dim = self.input_dim

        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.no_affine:
            self._params = []
        else:
            if self.irange is not None:
                assert self.istdev is None
                assert self.sparse_init is None
                W_cluster = rng.uniform(-self.irange, self.irange,
                                        (self.input_dim, self.n_clusters))
                W_class = rng.uniform(-self.irange, self.irange,
                                      (self.n_clusters, self.input_dim,
                                       self.n_classes))
            elif self.istdev is not None:
                assert self.sparse_init is None
                W_cluster = rng.randn(self.input_dim,
                                      self.n_clusters) * self.istdev
                W_class = rng.randn(self.n_clusters, self.input_dim,
                                    self.n_classes) * self.istdev
            else:
                raise NotImplementedError()

            # set the extra dummy weights to 0
            # (the slice should probably be reversed)
            for key in self.clusters_scope.keys():
                W_class[int(key), :, :self.clusters_scope[key]] = 0.

            self.W_class = sharedX(W_class, 'softmax_W_class')
            self.W_cluster = sharedX(W_cluster, 'softmax_W_cluster')

            self._params = [self.b_class, self.W_class,
                            self.b_cluster, self.W_cluster]

    def get_layer_monitoring_channels(self, state_below=None, state=None,
                                      targets=None):
        if self.no_affine:
            return OrderedDict()

        W_class = self.W_class
        W_cluster = self.W_cluster

        assert W_class.ndim == 3
        assert W_cluster.ndim == 2

        sq_W = T.sqr(W_cluster)
        sq_W_class = T.sqr(W_class)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        row_norms_class = T.sqrt(sq_W_class.sum(axis=1))
        col_norms_class = T.sqrt(sq_W_class.sum(axis=0))

        rval = OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
            ('class_row_norms_min', row_norms_class.min()),
            ('class_row_norms_mean', row_norms_class.mean()),
            ('class_row_norms_max', row_norms_class.max()),
            ('class_col_norms_min', col_norms_class.min()),
            ('class_col_norms_mean', col_norms_class.mean()),
            ('class_col_norms_max', col_norms_class.max()),
        ])

        if (state_below is not None) or (state is not None):
            if state is None:
                for value in get_debug_values(state_below):
                    print 'value is', value
                state = self.fprop(state_below)

            state, cls = state
            mx = state.max(axis=1)

            rval.update(OrderedDict([('mean_max_class', mx.mean()),
                                     ('max_max_class', mx.max()),
                                     ('min_max_class', mx.min())]))

            if targets is not None:
                rval['nll'] = self.cost(Y_hat=(state, cls), Y=targets)
                rval['perplexity'] = 10 ** (rval['nll'] /
                                            np.log(10).astype('float32'))
                rval['entropy'] = rval['nll'] / np.log(2).astype('float32')

        return rval

    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a softmax estimate of Y.
        Returns negative log probability of Y under the Y_hat distribution.
        """
        y_hat, y_cls = Y_hat
        # TODO: change Y to argmax form, and make cls a shared variable
        #CLS = self.classclusters[Y]
        CLS = self.cluster_targets

        assert hasattr(y_hat, 'owner')
        owner = y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            y_hat, = owner.inputs
            owner = y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        assert hasattr(y_cls, 'owner')
        owner = y_cls.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            y_cls, = owner.inputs
            owner = y_cls.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z_cls, = owner.inputs
        assert z_cls.ndim == 2

        # word log probabilities
        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        # Y = OneHotFormatter(self.n_classes).theano_expr(
        #     T.addbroadcast(Y, 0, 1).dimshuffle(0).astype('uint32'))
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1

        # cluster log probabilities
        z_cls = z_cls - z_cls.max(axis=1).dimshuffle(0, 'x')
        log_prob_cls = z_cls - T.log(
            T.exp(z_cls).sum(axis=1).dimshuffle(0, 'x'))
        # CLS = OneHotFormatter(self.n_clusters).theano_expr(
        #     T.addbroadcast(CLS, 1).dimshuffle(0).astype('uint32'))
        log_prob_of_cls = (CLS * log_prob_cls).sum(axis=1)
        assert log_prob_of_cls.ndim == 1

        # p(w | history) = p(c | s) * p(w | c, s)
        log_prob_of = log_prob_of + log_prob_of_cls
        rval = log_prob_of.mean()

        return -rval

    def fprop(self, state_below):
        # TODO: change the model to add a new variable that says which
        # indices of the data are in this batch
        self.input_space.validate(state_below)

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        for value in get_debug_values(state_below):
            print 'getting debug values'
            print value

        self.desired_space.validate(state_below)
        assert state_below.ndim == 2

        if not hasattr(self, 'no_affine'):
            self.no_affine = False

        if self.no_affine:
            raise NotImplementedError()

        assert self.W_class.ndim == 3
        assert self.W_cluster.ndim == 2

        # we get the cluster distribution from h W_cluster + b_cluster
        probcluster = T.dot(state_below, self.W_cluster) + self.b_cluster
        probcluster = T.nnet.softmax(probcluster)

        for value in get_debug_values(probcluster):
            print 'value is', value
            print 'type of state below is', state_below.type
            print state_below.dtype
            print state_below.ndim

        # placeholder: need the predicted clusters for this batch
        self.cluster_targets = range(5)

        Z = T.nnet.GroupDot(self.n_clusters)(state_below, self.W_class,
                                             self.b_class,
                                             self.cluster_targets)
        probclass = T.nnet.softmax(Z)

        for value in get_debug_values(probclass):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return probclass, probcluster

    def get_weights_format(self):
        return ('v', 'h', 'h_c')

    def get_biases(self):
        return self.b_class.get_value(), self.b_cluster.get_value()

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()
        return self.W_cluster.get_value(), self.W_class.get_value()
class Softmax(Layer):

    def __init__(self, n_classes, layer_name, irange=None, istdev=None,
                 sparse_init=None, W_lr_scale=None, b_lr_scale=None,
                 max_row_norm=None):
        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        self.__dict__.update(locals())
        del self.self

        assert isinstance(n_classes, int)

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes,)), name='softmax_b')

    def get_lr_scalers(self):
        rval = OrderedDict()
        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale
        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None
        if self.b_lr_scale is not None:
            assert isinstance(self.b_lr_scale, float)
            rval[self.b] = self.b_lr_scale
        return rval

    def get_monitoring_channels_from_state(self, state, target=None):
        mx = state.max(axis=1)

        rval = OrderedDict([('mean_max_class', mx.mean()),
                            ('max_max_class', mx.max()),
                            ('min_max_class', mx.min())])

        if target is not None:
            y_hat = T.argmax(state, axis=1)
            y = T.argmax(target, axis=1)
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass

        return rval

    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        self.desired_space = VectorSpace(self.input_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_classes) * self.istdev
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W, 'softmax_W')
        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()
        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def fprop(self, state_below):
        self.input_space.validate(state_below)

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        for value in get_debug_values(state_below):
            if value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size " +
                                 str(self.mlp.batch_size) + " but has " +
                                 str(value.shape[0]))

        self.desired_space.validate(state_below)

        assert self.W.ndim == 2
        assert state_below.ndim == 2

        b = self.b

        Z = T.dot(state_below, self.W) + b

        rval = T.nnet.softmax(Z)

        for value in get_debug_values(rval):
            assert value.shape[0] == self.mlp.batch_size

        return rval

    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a softmax estimate of Y.
        Returns negative log probability of Y under the Y_hat distribution.
        """
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1

        rval = log_prob_of.mean()

        return -rval

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        return coeff * T.sqr(self.W).sum()

    def censor_updates(self, updates):
        if self.max_row_norm is not None:
            W = self.W
            if W in updates:
                updated_W = updates[W]
                row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1))
                desired_norms = T.clip(row_norms, 0, self.max_row_norm)
                updates[W] = updated_W * (
                    desired_norms / (1e-7 + row_norms)).dimshuffle(0, 'x')
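
# Illustrative sketch (not part of the original code): the numerically stable
# log-softmax that Softmax.cost rebuilds from the pre-softmax activations z.
def _log_softmax_numpy(z):
    z = z - z.max(axis=1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=1, keepdims=True))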
class HingeLoss(Layer):

    def __init__(self, n_classes, layer_name, irange=None, istdev=None,
                 sparse_init=None):
        self.__dict__.update(locals())
        del self.self

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes,)), name='hingeloss_b')

    def get_monitoring_channels(self):
        W = self.W
        assert W.ndim == 2

        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
        ])

    def get_monitoring_channels_from_state(self, state, target=None):
        mx = state.max(axis=1)

        rval = OrderedDict([('mean_max_class', mx.mean()),
                            ('max_max_class', mx.max()),
                            ('min_max_class', mx.min())])

        if target is not None:
            # assume target is binary one-hot in [0, 1]
            y_hat = self.target_convert(T.argmax(state, axis=1))
            y = self.target_convert(T.argmax(target, axis=1))
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass
            rval['nll'] = self.cost(Y_hat=state, Y=target)

        return rval

    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.irange is not None:
            assert self.istdev is None
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_classes))
        elif self.istdev is not None:
            assert self.sparse_init is None
            W = rng.randn(self.input_dim, self.n_classes) * self.istdev
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W, 'hingeloss_W')
        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()
        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def fprop(self, state_below):
        self.input_space.validate(state_below)

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        for value in get_debug_values(state_below):
            if self.mlp.batch_size is not None and \
                    value.shape[0] != self.mlp.batch_size:
                raise ValueError("state_below should have batch size " +
                                 str(self.mlp.batch_size) + " but has " +
                                 str(value.shape[0]))

        self.desired_space.validate(state_below)

        assert state_below.ndim == 2
        assert self.W.ndim == 2

        b = self.b
        W = self.W

        rval = T.dot(state_below, W) + b

        for value in get_debug_values(rval):
            if self.mlp.batch_size is not None:
                assert value.shape[0] == self.mlp.batch_size

        return rval

    def target_convert(self, Y):
        '''
        converts target [0, 1] to [-1, 1]
        '''
        Y_t = 2. * Y - 1.
        return Y_t

    def hinge_cost(self, W, Y, Y_hat, C=1.):
        #prob = .5 * T.dot(self.W.T, self.W) + \
        #    C * (T.maximum(1 - Y * Y_hat, 0) ** 2.).sum(axis=1)
        prob = (T.maximum(1 - Y * Y_hat, 0) ** 2.).sum(axis=1)
        return prob

    def cost(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a hinge loss estimate of Y.
        """
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert Y_hat.ndim == 2

        Y_t = self.target_convert(Y)
        prob = self.hinge_cost(self.W, Y_t, Y_hat)
        assert prob.ndim == 1

        rval = prob.mean()

        return rval

    def cost_matrix(self, Y, Y_hat):
        """
        Y must be one-hot binary. Y_hat is a hinge loss estimate of Y.
        """
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert Y_hat.ndim == 2

        Y_t = self.target_convert(Y)
        prob = self.hinge_cost(self.W, Y_t, Y_hat)
        return prob

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        return coeff * T.sqr(self.W).sum()

    def get_l1_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W = self.W
        return coeff * abs(W).sum()
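
# Illustrative sketch (not part of the original code): HingeLoss.cost in
# numpy, including the {0, 1} -> {-1, 1} target conversion done by
# target_convert.
def _squared_hinge_numpy(Y01, scores):
    Y_t = 2. * Y01 - 1.
    margin = np.maximum(1. - Y_t * scores, 0.)
    return (margin ** 2.).sum(axis=1).mean()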
class Softmax(HiddenLayer):

    def __init__(self, n_classes, layer_name, irange=None, sparse_init=None,
                 W_lr_scale=None):
        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        self.__dict__.update(locals())
        del self.self

        assert isinstance(n_classes, int)

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes,)), name='softmax_b')

    def get_lr_scalers(self):
        rval = {}

        # Patch old pickle files
        if not hasattr(self, 'W_lr_scale'):
            self.W_lr_scale = None

        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale

        return rval

    def get_total_state_space(self):
        return self.output_space

    def get_monitoring_channels_from_state(self, state):
        mx = state.max(axis=1)
        return {
            'mean_max_class': mx.mean(),
            'max_max_class': mx.max(),
            'min_max_class': mx.min()
        }

    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        self.desired_space = VectorSpace(self.input_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.dbm.rng

        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.n_classes))
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.n_classes))
            for i in xrange(self.n_classes):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0.:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        self.W = sharedX(W, 'softmax_W')
        self._params = [self.b, self.W]

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()
        return self.W.get_value()

    def set_weights(self, weights):
        self.W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def sample(self, state_below=None, state_above=None, layer_above=None,
               theano_rng=None):
        if state_above is not None:
            # If you implement this case, also add a unit test for it.
            # Or at least add a warning that it is not tested.
            raise NotImplementedError()

        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to "
                             "None so that it may appear after layer_above "
                             "/ state_above in the list.")

        self.input_space.validate(state_below)

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        self.desired_space.validate(state_below)

        z = T.dot(state_below, self.W) + self.b
        h_exp = T.nnet.softmax(z)
        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        return h_sample

    def mf_update(self, state_below, state_above=None, layer_above=None,
                  double_weights=False, iter_name=None):
        if state_above is not None:
            raise NotImplementedError()

        if double_weights:
            raise NotImplementedError()

        self.input_space.validate(state_below)

        # patch old pickle files
        if not hasattr(self, 'needs_reformat'):
            self.needs_reformat = self.needs_reshape
            del self.needs_reshape

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        self.desired_space.validate(state_below)

        """
        from pylearn2.utils import serial
        X = serial.load('/u/goodfeli/galatea/dbm/inpaint/expdir/cifar10_N3_interm_2_features.pkl')
        state_below = Verify(X, 'features')(state_below)
        """

        assert self.W.ndim == 2
        assert state_below.ndim == 2

        b = self.b

        Z = T.dot(state_below, self.W) + b

        #Z = Print('Z')(Z)

        rval = T.nnet.softmax(Z)

        return rval

    def downward_message(self, downward_state):
        rval = T.dot(downward_state, self.W.T)
        rval = self.desired_space.format_as(rval, self.input_space)
        return rval

    def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale):
        """
        scale is because the visible layer also goes into the cost.
        it uses the mean over units and examples, so that the scale
        of the cost doesn't change too much with batch size or
        example size.
        we need to multiply this cost by scale to make sure that it
        is put on the same scale as the reconstruction cost for the
        visible units. ie, scale should be 1/nvis
        """
        Y_hat = Y_hat_unmasked
        assert hasattr(Y_hat, 'owner')
        owner = Y_hat.owner
        assert owner is not None
        op = owner.op
        if isinstance(op, Print):
            assert len(owner.inputs) == 1
            Y_hat, = owner.inputs
            owner = Y_hat.owner
            op = owner.op
        assert isinstance(op, T.nnet.Softmax)
        z, = owner.inputs
        assert z.ndim == 2

        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        # we use sum and not mean because this is really one variable per row
        log_prob_of = (Y * log_prob).sum(axis=1)
        masked = log_prob_of * drop_mask_Y
        assert masked.ndim == 1

        rval = masked.mean() * scale

        return -rval

    def make_state(self, num_examples, numpy_rng):
        """
        Returns a shared variable containing an actual state
        (not a mean field state) for this variable.
        """
        t1 = time.time()

        empty_input = self.output_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        h_exp = T.nnet.softmax(default_z)

        h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype)

        p_state = sharedX(self.output_space.get_origin_batch(num_examples))

        t2 = time.time()

        f = function([], updates={h_state: h_sample})

        t3 = time.time()

        f()

        t4 = time.time()

        print str(self) + '.make_state took', t4 - t1
        print '\tcompose time:', t2 - t1
        print '\tcompile time:', t3 - t2
        print '\texecute time:', t4 - t3

        h_state.name = 'softmax_sample_shared'

        return h_state

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        return coeff * T.sqr(self.W).sum()

    def expected_energy_term(self, state, average, state_below,
                             average_below):
        self.input_space.validate(state_below)

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        self.desired_space.validate(state_below)

        # The energy function is linear, so it doesn't matter if we're
        # averaging or not. Specifically, our terms are -u^T W d - b^T d
        # where u is the upward state of the layer below and d is the
        # downward state of this layer.
        bias_term = T.dot(state, self.b)
        weights_term = (T.dot(state_below, self.W) * state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval
class ToyRNNPhone(Model):
    """
    WRITEME
    """
    def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05,
                 non_linearity='sigmoid', use_ground_truth=True):
        allowed_non_linearities = {'sigmoid': T.nnet.sigmoid,
                                   'tanh': T.tanh}
        self.nvis = nvis
        self.nhid = nhid
        self.hidden_transition_model = hidden_transition_model
        self.use_ground_truth = use_ground_truth
        self.alpha = sharedX(1)
        self.alpha_decrease_rate = 0.999

        assert non_linearity in allowed_non_linearities
        self.non_linearity = allowed_non_linearities[non_linearity]

        # Space initialization
        self.input_space = VectorSpace(dim=self.nvis)
        self.hidden_space = VectorSpace(dim=self.nhid)
        self.output_space = VectorSpace(dim=1)
        self.input_source = 'features'
        self.target_source = 'targets'

        # Features-to-hidden matrix
        W_value = numpy.random.uniform(low=-irange, high=irange,
                                       size=(self.nvis, self.nhid))
        self.W = sharedX(W_value, name='W')
        # Hidden biases
        b_value = numpy.zeros(self.nhid)
        self.b = sharedX(b_value, name='b')
        # Hidden-to-out matrix
        U_value = numpy.random.uniform(low=-irange, high=irange,
                                       size=(self.nhid, 1))
        self.U = sharedX(U_value, name='U')
        # Output bias
        c_value = numpy.zeros(1)
        self.c = sharedX(c_value, name='c')

    def fprop_step(self, features, h_tm1, out):
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0),
            self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() +
            self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def fprop_step_prime(self, truth, features, h_tm1, out):
        features = T.set_subtensor(
            features[-1],
            (1 - self.alpha) * features[-1] + self.alpha * truth[-1])
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0),
            self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() +
            self.b)
        out = T.dot(h, self.U) + self.c
        features = T.concatenate([features[1:], out])
        return features, h, out

    def fprop(self, data):
        if self.use_ground_truth:
            self.input_space.validate(data)
            features = data

            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda f, h, o: self.fprop_step(f, h, o)

            ((h, out), updates) = theano.scan(
                fn=fn,
                sequences=[features],
                outputs_info=[dict(initial=init_h, taps=[-1]), init_out])
            return out
        else:
            self.input_space.validate(data)
            features = data

            init_in = features[0]
            init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid)
            init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1)
            init_out = T.unbroadcast(init_out, 0)

            fn = lambda t, f, h, o: self.fprop_step_prime(t, f, h, o)

            ((f, h, out), updates) = theano.scan(
                fn=fn,
                sequences=[features],
                outputs_info=[init_in,
                              dict(initial=init_h, taps=[-1]),
                              init_out])
            return out

    def predict_next(self, features, h_tm1):
        h_tm1 = self.hidden_space.format_as(
            h_tm1.dimshuffle('x', 0),
            self.hidden_transition_model.input_space)
        h = T.nnet.sigmoid(
            T.dot(features, self.W) +
            self.hidden_transition_model.fprop(h_tm1).flatten() +
            self.b)
        out = T.dot(h, self.U) + self.c
        return h, out

    def get_params(self):
        return [self.W, self.b, self.U, self.c] + \
            self.hidden_transition_model.get_params()

    def get_input_source(self):
        return self.input_source

    def get_target_source(self):
        return self.target_source

    def censor_updates(self, updates):
        updates[self.alpha] = self.alpha_decrease_rate * self.alpha

    def get_monitoring_channels(self, data):
        rval = OrderedDict()
        rval['alpha'] = self.alpha
        return rval
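
# Illustrative sketch (not part of the original code): the recurrence that
# ToyRNNPhone.fprop unrolls with theano.scan, in numpy. The hidden transition
# MLP is reduced to a single matrix V here for clarity (an assumption; the
# real model delegates to hidden_transition_model.fprop).
def _toy_rnn_fprop_numpy(X, W, V, b, U, c):
    sigmoid = lambda a: 1. / (1. + numpy.exp(-a))
    h = numpy.zeros(W.shape[1])
    outs = []
    for x in X:  # X: (time, nvis)
        h = sigmoid(numpy.dot(x, W) + numpy.dot(h, V) + b)
        outs.append(numpy.dot(h, U) + c)
    return numpy.asarray(outs)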
class BinaryVectorMaxPool(HiddenLayer):
    """
    A hidden layer that does max-pooling on binary vectors.
    It has two sublayers, the detector layer and the pooling layer.
    The detector layer is its downward state and the pooling layer is
    its upward state.

    TODO: this layer uses (pooled, detector) as its total state,
          which can be confusing when listing all the states in
          the network left to right. Change this and
          pylearn2.expr.probabilistic_max_pooling to use
          (detector, pooled)
    """

    def __init__(self, detector_layer_dim, pool_size, layer_name,
                 irange=None, sparse_init=None, include_prob=1.0,
                 init_bias=0.):
        """
        include_prob: probability of including a weight element in the
            set of weights initialized to U(-irange, irange). If not
            included it is initialized to 0.
        """
        self.__dict__.update(locals())
        del self.self

        self.b = sharedX(np.zeros((self.detector_layer_dim,)) + init_bias,
                         name=layer_name + '_b')

    def set_input_space(self, space):
        """
        Note: this resets parameters!
        """
        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        if not (self.detector_layer_dim % self.pool_size == 0):
            raise ValueError("detector_layer_dim = %d, pool_size = %d. "
                             "Should be divisible but remainder is %d" %
                             (self.detector_layer_dim, self.pool_size,
                              self.detector_layer_dim % self.pool_size))

        self.h_space = VectorSpace(self.detector_layer_dim)
        self.pool_layer_dim = self.detector_layer_dim / self.pool_size
        self.output_space = VectorSpace(self.pool_layer_dim)

        rng = self.dbm.rng

        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.detector_layer_dim)) * \
                (rng.uniform(0., 1.,
                             (self.input_dim, self.detector_layer_dim))
                 < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.detector_layer_dim))
            for i in xrange(self.detector_layer_dim):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    def get_total_state_space(self):
        return CompositeSpace((self.output_space, self.h_space))

    def get_params(self):
        assert self.b.name is not None
        W, = self.transformer.get_params()
        assert W.name is not None
        return self.transformer.get_params().union([self.b])

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    def get_weights(self):
        if self.requires_reformat:
            # This is not really an unimplemented case.
            # We actually don't know how to format the weights in design
            # space. We got the data in topo space and we don't have
            # access to the dataset.
            raise NotImplementedError()
        W, = self.transformer.get_params()
        return W.get_value()

    def set_weights(self, weights):
        W, = self.transformer.get_params()
        W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def get_weights_view_shape(self):
        total = self.detector_layer_dim
        cols = self.pool_size
        if cols == 1:
            # Let the PatchViewer decide how to arrange the units
            # when they're not pooled
            raise NotImplementedError()
        # When they are pooled, make each pooling unit have one row
        rows = total / cols
        return rows, cols

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        W, = self.transformer.get_params()
        W = W.T
        W = W.reshape((self.detector_layer_dim,
                       self.input_space.shape[0],
                       self.input_space.shape[1],
                       self.input_space.nchannels))
        W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))
        return function([], W)()

    def upward_state(self, total_state):
        p, h = total_state
        self.h_space.validate(h)
        self.output_space.validate(p)
        return p

    def downward_state(self, total_state):
        p, h = total_state
        return h

    def get_monitoring_channels_from_state(self, state):
        P, H = state

        rval = {}

        if self.pool_size == 1:
            vars_and_prefixes = [(P, '')]
        else:
            vars_and_prefixes = [(P, 'p_'), (H, 'h_')]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            for key, val in [('max_max', v_max.max()),
                             ('max_mean', v_max.mean()),
                             ('max_min', v_max.min()),
                             ('min_max', v_min.max()),
                             ('min_mean', v_min.mean()),
                             ('min_min', v_min.min()),
                             ('range_max', v_range.max()),
                             ('range_mean', v_range.mean()),
                             ('range_min', v_range.min()),
                             ('mean_max', v_mean.max()),
                             ('mean_mean', v_mean.mean()),
                             ('mean_min', v_mean.min())]:
                rval[prefix + key] = val

        return rval

    def get_l1_act_cost(self, state, target, coeff, eps=None):
        rval = 0.

        P, H = state
        self.output_space.validate(P)
        self.h_space.validate(H)

        if self.pool_size == 1:
            # If the pool size is 1 then pools = detectors
            # and we should not penalize pools and detectors separately
            assert len(state) == 2
            assert isinstance(target, float)
            assert isinstance(coeff, float)
            _, state = state
            state = [state]
            target = [target]
            coeff = [coeff]
            if eps is None:
                eps = [0.]
            else:
                eps = [eps]
        else:
            assert all([len(elem) == 2 for elem in [state, target, coeff]])
            if eps is None:
                eps = [0., 0.]
            if target[1] < target[0]:
                warnings.warn("Do you really want to regularize the "
                              "detector units to be sparser than the "
                              "pooling units?")

        for s, t, c, e in safe_zip(state, target, coeff, eps):
            assert all([isinstance(elem, float) for elem in [t, c, e]])
            if c == 0.:
                continue
            m = s.mean(axis=0)
            assert m.ndim == 1
            rval += T.maximum(abs(m - t) - e, 0.).mean() * c

        return rval

    def sample(self, state_below=None, state_above=None, layer_above=None,
               theano_rng=None):
        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to "
                             "None so that it may appear after layer_above "
                             "/ state_above in the list.")

        if state_above is not None:
            msg = layer_above.downward_message(state_above)
        else:
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        z = self.transformer.lmul(state_below) + self.b
        p, h, p_sample, h_sample = max_pool_channels(z, self.pool_size,
                                                     msg, theano_rng)

        return p_sample, h_sample

    def downward_message(self, downward_state):
        rval = self.transformer.lmul_T(downward_state)

        if self.requires_reformat:
            rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def make_state(self, num_examples, numpy_rng):
        """
        Returns a shared variable containing an actual state
        (not a mean field state) for this variable.
        """
        t1 = time.time()

        empty_input = self.h_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        p_exp, h_exp, p_sample, h_sample = max_pool_channels(
            z=default_z, pool_size=self.pool_size, theano_rng=theano_rng)

        assert h_sample.dtype == default_z.dtype

        p_state = sharedX(self.output_space.get_origin_batch(num_examples))

        t2 = time.time()

        f = function([], updates={p_state: p_sample, h_state: h_sample})

        t3 = time.time()

        f()

        t4 = time.time()

        print str(self) + '.make_state took', t4 - t1
        print '\tcompose time:', t2 - t1
        print '\tcompile time:', t3 - t2
        print '\texecute time:', t4 - t3

        p_state.name = 'p_sample_shared'
        h_state.name = 'h_sample_shared'

        return p_state, h_state

    def expected_energy_term(self, state, average, state_below,
                             average_below):
        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError(
                            "self.dbm.batch_size is %d but got shape of %d" %
                            (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        downward_state = self.downward_state(state)
        self.h_space.validate(downward_state)

        # The energy function is linear, so it doesn't matter if we're
        # averaging or not. Specifically, our terms are -u^T W d - b^T d
        # where u is the upward state of the layer below and d is the
        # downward state of this layer.
        bias_term = T.dot(downward_state, self.b)
        weights_term = (self.transformer.lmul(state_below) *
                        downward_state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval

    def mf_update(self, state_below, state_above, layer_above=None,
                  double_weights=False, iter_name=None):
        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError(
                            "self.dbm.batch_size is %d but got shape of %d" %
                            (self.dbm.batch_size, sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
            msg.name = ('msg_from_' + layer_above.layer_name + '_to_' +
                        self.layer_name + '[' + iter_name + ']')
        else:
            msg = None

        if double_weights:
            state_below = 2. * state_below
            state_below.name = self.layer_name + '_' + iter_name + '_2state'

        z = self.transformer.lmul(state_below) + self.b
        if self.layer_name is not None and iter_name is not None:
            z.name = self.layer_name + '_' + iter_name + '_z'

        p, h = max_pool_channels(z, self.pool_size, msg)

        p.name = self.layer_name + '_p_' + iter_name
        h.name = self.layer_name + '_h_' + iter_name

        return p, h
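
# Illustrative sketch (not part of the original code): what max_pool_channels
# computes in the mean-field case, in numpy. Detector units compete within
# non-overlapping pools of size pool_size, together with an implicit "off"
# state whose pre-activation is 0 (this is the standard probabilistic
# max-pooling formulation; the shift by m is for numerical stability).
def _max_pool_channels_numpy(z, pool_size):
    b, d = z.shape
    zp = z.reshape(b, d // pool_size, pool_size)
    m = np.maximum(zp.max(axis=2, keepdims=True), 0.)  # include "off" state
    e = np.exp(zp - m)
    off = np.exp(-m[:, :, 0])
    denom = e.sum(axis=2) + off
    h = e / denom[:, :, None]      # detector expectations
    p = 1. - off / denom           # pooled expectations
    return p, h.reshape(b, d)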
class L2SquareHinge(Layer):
    """
    A layer that can apply an affine transformation
    and use an l2-regularized squared hinge loss.

    Parameters
    ----------
    n_classes : int
        Number of classes for the targets.
    layer_name : string
        Name of the layer.
    irange : float
        If specified, initialize each weight randomly in
        U(-irange, irange).
    istdev : float
        If specified, initialize each weight randomly from
        N(0, istdev).
    sparse_init : int
        If specified, initialize sparse_init weights for each
        unit from N(0, 1).
    W_lr_scale : float
        Scale for weight learning rate.
    b_lr_scale : float
        Scale for bias learning rate.
    max_row_norm : float
        Maximum norm for a row of the weight matrix.
    no_affine : boolean
        If True, no affine transformation is applied; the inputs
        are used directly.
    max_col_norm : float
        Maximum norm for a column of the weight matrix.
    init_bias_target_marginals : dataset
        Take the probability distribution of the targets into account
        to intelligently initialize biases.
    binary_target_dim : int, optional
        If your targets are class labels (i.e. a binary vector) then
        set the number of targets here so that an IndexSpace of the
        proper dimension can be used as the target space. This allows
        the layer to compute the cost much more quickly than if it
        needs to convert the targets into a VectorSpace.
    """

    def __init__(self, n_classes, layer_name, C=0.1, irange=None,
                 istdev=None, sparse_init=None, W_lr_scale=None,
                 b_lr_scale=None, max_row_norm=None, no_affine=False,
                 max_col_norm=None, init_bias_target_marginals=None,
                 binary_target_dim=None):
        super(L2SquareHinge, self).__init__()

        if isinstance(W_lr_scale, str):
            W_lr_scale = float(W_lr_scale)

        self.__dict__.update(locals())
        del self.self
        del self.init_bias_target_marginals

        assert isinstance(n_classes, py_integer_types)

        if binary_target_dim is not None:
            assert isinstance(binary_target_dim, py_integer_types)
            self._has_binary_target = True
            self._target_space = IndexSpace(dim=binary_target_dim,
                                            max_labels=n_classes)
        else:
            self._has_binary_target = False

        self.output_space = VectorSpace(n_classes)
        self.b = sharedX(np.zeros((n_classes,)), name='hinge_b')

        if init_bias_target_marginals:
            y = init_bias_target_marginals.y
            if init_bias_target_marginals.y_labels is None:
                marginals = y.mean(axis=0)
            else:
                # compute class frequencies
                if np.max(y.shape) != np.prod(y.shape):
                    raise AssertionError("Use of "
                                         "`init_bias_target_marginals` "
                                         "requires that each example has "
                                         "a single label.")
                marginals = np.bincount(y.flat) / float(y.shape[0])

            assert marginals.ndim == 1
            b = pseudoinverse_softmax_numpy(marginals).astype(self.b.dtype)
            assert b.ndim == 1
            assert b.dtype == self.b.dtype
            self.b.set_value(b)
        else:
            assert init_bias_target_marginals is None

    @wraps(Layer.get_lr_scalers)
    def get_lr_scalers(self):
        rval = OrderedDict()
        if self.W_lr_scale is not None:
            assert isinstance(self.W_lr_scale, float)
            rval[self.W] = self.W_lr_scale
        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None
        if self.b_lr_scale is not None:
            assert isinstance(self.b_lr_scale, float)
            rval[self.b] = self.b_lr_scale
        return rval

    @wraps(Layer.get_monitoring_channels)
    def get_monitoring_channels(self):
        warnings.warn("Layer.get_monitoring_channels is " +
                      "deprecated. Use get_layer_monitoring_channels " +
                      "instead. Layer.get_monitoring_channels " +
                      "will be removed on or after september 24th 2014",
                      stacklevel=2)

        W = self.W
        assert W.ndim == 2

        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
        ])

    @wraps(Layer.get_monitoring_channels_from_state)
    def get_monitoring_channels_from_state(self, state, target=None):
        warnings.warn("Layer.get_monitoring_channels_from_state is " +
                      "deprecated. Use get_layer_monitoring_channels " +
                      "instead. Layer.get_monitoring_channels_from_state " +
                      "will be removed on or after september 24th 2014",
                      stacklevel=2)

        # channels that do not require state information
        W = self.W
        assert W.ndim == 2

        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        rval = OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
        ])

        mx = state.max(axis=1)

        rval.update(OrderedDict([('mean_max_class', mx.mean()),
                                 ('max_max_class', mx.max()),
                                 ('min_max_class', mx.min())]))

        if target is not None:
            y_hat = T.argmax(state, axis=1)
            y = T.argmax(target, axis=1)
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass
            rval['nll'] = self.cost(Y_hat=state, Y=target)

        return rval

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, state_below=None, state=None,
                                      targets=None):
        # channels that do not require state information
        W = self.W
        assert W.ndim == 2

        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        rval = OrderedDict([
            ('row_norms_min', row_norms.min()),
            ('row_norms_mean', row_norms.mean()),
            ('row_norms_max', row_norms.max()),
            ('col_norms_min', col_norms.min()),
            ('col_norms_mean', col_norms.mean()),
            ('col_norms_max', col_norms.max()),
        ])

        if (state_below is not None) or (state is not None):
            if state is None:
                state = self.fprop(state_below)

            mx = state.max(axis=1)

            rval.update(OrderedDict([('mean_max_class', mx.mean()),
                                     ('max_max_class', mx.max()),
                                     ('min_max_class', mx.min())]))

            if targets is not None:
                y_hat = T.argmax(state, axis=1)
                y = T.argmax(targets, axis=1)
                misclass = T.neq(y, y_hat).mean()
                misclass = T.cast(misclass, config.floatX)
                rval['misclass'] = misclass
                rval['nll'] = self.cost(Y_hat=state, Y=targets)

        return rval

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got " + str(space) +
                            " of type " + str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        if self.no_affine:
            self._params = []
        else:
            if self.irange is not None:
                assert self.istdev is None
                assert self.sparse_init is None
                W = rng.uniform(-self.irange, self.irange,
                                (self.input_dim, self.n_classes))
            elif self.istdev is not None:
                assert self.sparse_init is None
                W = rng.randn(self.input_dim, self.n_classes) * self.istdev
            else:
                assert self.sparse_init is not None
                W = np.zeros((self.input_dim, self.n_classes))
                for i in xrange(self.n_classes):
                    for j in xrange(self.sparse_init):
                        idx = rng.randint(0, self.input_dim)
                        while W[idx, i] != 0.:
                            idx = rng.randint(0, self.input_dim)
                        W[idx, i] = rng.randn()

            self.W = sharedX(W, 'hinge_W')

            self._params = [self.b, self.W]

    @wraps(Layer.get_weights_topo)
    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()
        desired = self.W.get_value().T
        ipt = self.desired_space.np_format_as(desired, self.input_space)
        rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes,
                                         ('b', 0, 1, 'c'))
        return rval

    @wraps(Layer.get_weights)
    def get_weights(self):
        if not isinstance(self.input_space, VectorSpace):
            raise NotImplementedError()
        return self.W.get_value()

    @wraps(Layer.set_weights)
    def set_weights(self, weights):
        self.W.set_value(weights)

    @wraps(Layer.set_biases)
    def set_biases(self, biases):
        self.b.set_value(biases)

    @wraps(Layer.get_biases)
    def get_biases(self):
        return self.b.get_value()

    @wraps(Layer.get_weights_format)
    def get_weights_format(self):
        return ('v', 'h')

    @wraps(Layer.fprop)
    def fprop(self, state_below):
        # precondition
        self.input_space.validate(state_below)

        if self.needs_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        self.desired_space.validate(state_below)
        assert state_below.ndim == 2
        assert self.W.ndim == 2

        # linear prediction
        rval = T.dot(state_below, self.W) + self.b
        return rval

    def hinge_cost(self, Y, Y_hat):
        # Targets arrive as one-hot {0, 1}; convert them to {-1, 1}
        # (cf. HingeLoss.target_convert above) before taking the margin.
        Y_t = 2. * Y - 1.
        prob = (self.C * self.W.norm(2) +
                T.maximum(0, 1 - Y_t * Y_hat) ** 2.).sum(axis=1)
        return prob

    @wraps(Layer.cost)
    def cost(self, Y, Y_hat):
        return self.hinge_cost(Y, Y_hat).mean()

    # @wraps(Layer.cost_matrix)
    # def cost_matrix(self, Y, Y_hat):
    #     cost = self._cost(Y, Y_hat)
    #     if self._has_binary_target:
    #         flat_Y = Y.flatten()
    #         flat_matrix = T.alloc(0, (Y.shape[0] * cost.shape[1]))
    #         flat_indices = flat_Y + T.extra_ops.repeat(
    #             T.arange(Y.shape[0]) * cost.shape[1], Y.shape[1])
    #         cost = T.set_subtensor(flat_matrix[flat_indices], flat_Y)
    #     return cost

    @wraps(Layer.get_weight_decay)
    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        return coeff * T.sqr(self.W).sum()

    @wraps(Layer.get_l1_weight_decay)
    def get_l1_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W = self.W
        return coeff * abs(W).sum()

    @wraps(Layer._modify_updates)
    def _modify_updates(self, updates):
        if self.no_affine:
            return
        if self.max_row_norm is not None:
            W = self.W
            if W in updates:
                updated_W = updates[W]
                row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1))
                desired_norms = T.clip(row_norms, 0, self.max_row_norm)
                scales = desired_norms / (1e-7 + row_norms)
                updates[W] = updated_W * scales.dimshuffle(0, 'x')
        if self.max_col_norm is not None:
            assert self.max_row_norm is None
            W = self.W
            if W in updates:
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))
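
# Illustrative sketch (not part of the original code): the max-column-norm
# projection applied symbolically in _modify_updates, in numpy.
def _clip_col_norms_numpy(W, max_col_norm):
    col_norms = np.sqrt((W ** 2.).sum(axis=0))
    desired = np.clip(col_norms, 0., max_col_norm)
    return W * (desired / (1e-7 + col_norms))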