class SingleSoftmax(Initializable):
    """Classify per-patch hidden states with one shared ReLU -> Softmax MLP.

    Parameters
    ----------
    hidden_dim : int
        Dimensionality of the hidden states ``cs`` fed to :meth:`cost`.
    n_classes : int
        Number of target classes.
    """

    def __init__(self, hidden_dim, n_classes, **kwargs):
        super(SingleSoftmax, self).__init__(**kwargs)

        self.hidden_dim = hidden_dim
        self.n_classes = n_classes

        # ``//`` instead of ``/`` so the hidden-layer width stays an int
        # even under true division (Python 3 or __future__.division);
        # identical to the old behavior on Python 2 ints.
        self.mlp = MLP(activations=[Rectifier(), Softmax()],
                       dims=[hidden_dim, hidden_dim // 2, self.n_classes],
                       weights_init=Orthogonal(),
                       biases_init=Constant(0))
        self.softmax = Softmax()

        self.children = [self.mlp, self.softmax]

    # some day: @application(...) def feedback(self, h)

    @application(inputs=['cs', 'y'], outputs=['cost'])
    def cost(self, cs, y, n_patches):
        """Cross-entropy of the prediction made from the final patch.

        cs -- assumed (batch, n_patches, hidden_dim); TODO confirm with caller
        y -- integer targets, flattened before the cross-entropy
        """
        energies = [self.mlp.apply(cs[:, t, :])
                    for t in xrange(n_patches)]
        cross_entropies = [
            self.softmax.categorical_cross_entropy(y.flatten(), energy)
            for energy in energies]
        error_rates = [
            T.neq(y, energy.argmax(axis=1)).mean(axis=0)
            for energy in energies]
        # train on final prediction
        cost = util.named(cross_entropies[-1], "cost")
        # monitor final prediction
        self.add_auxiliary_variable(cross_entropies[-1], name="cross_entropy")
        self.add_auxiliary_variable(error_rates[-1], name="error_rate")
        return cost
class SingleSoftmax(Initializable):
    """Single shared softmax classifier applied to every patch's features.

    Parameters
    ----------
    hidden_dim : int
        Width of the incoming feature vectors.
    n_classes : int
        Number of output classes.
    """

    def __init__(self, hidden_dim, n_classes, **kwargs):
        super(SingleSoftmax, self).__init__(**kwargs)

        self.hidden_dim = hidden_dim
        self.n_classes = n_classes

        # Floor division keeps the intermediate layer size integral under
        # true division (Python 3 / __future__.division); unchanged on Py2.
        self.mlp = MLP(activations=[Rectifier(), Softmax()],
                       dims=[hidden_dim, hidden_dim // 2, self.n_classes],
                       weights_init=Orthogonal(),
                       biases_init=Constant(0))
        self.softmax = Softmax()

        self.children = [self.mlp, self.softmax]

    # some day: @application(...) def feedback(self, h)

    @application(inputs=['cs', 'y'], outputs=['cost'])
    def cost(self, cs, y, n_patches):
        """Return the cross-entropy of the last patch's prediction.

        cs -- assumed (batch, n_patches, hidden_dim); TODO confirm
        y -- integer class targets
        """
        energies = [self.mlp.apply(cs[:, t, :])
                    for t in xrange(n_patches)]
        cross_entropies = [
            self.softmax.categorical_cross_entropy(y.flatten(), energy)
            for energy in energies]
        error_rates = [
            T.neq(y, energy.argmax(axis=1)).mean(axis=0)
            for energy in energies]
        # train on final prediction
        cost = util.named(cross_entropies[-1], "cost")
        # monitor final prediction
        self.add_auxiliary_variable(cross_entropies[-1], name="cross_entropy")
        self.add_auxiliary_variable(error_rates[-1], name="error_rate")
        return cost
class Emitter(Initializable):
    """One MLP classifier per output position over shared patch features.

    Parameters
    ----------
    hidden_dim : int
        Dimensionality of the per-patch feature vectors ``cs``.
    n_classes : sequence of int
        Output vocabulary size for each position; the last entry is the
        length head (targets carry the sequence length in ``y[:, -1]``).
    """

    def __init__(self, hidden_dim, n_classes, **kwargs):
        super(Emitter, self).__init__(**kwargs)

        self.hidden_dim = hidden_dim
        self.n_classes = n_classes

        # TODO: use TensorLinear or some such
        self.emitters = [
            MLP(activations=[Rectifier(), Identity()],
                # ``//`` keeps the layer width an int under true division
                dims=[hidden_dim, hidden_dim // 2, n],
                name="mlp_%i" % i,
                weights_init=Orthogonal(),
                biases_init=Constant(0))
            for i, n in enumerate(self.n_classes)]
        self.softmax = Softmax()

        self.children = self.emitters + [self.softmax]

    # some day: @application(...) def feedback(self, h)

    @application(inputs=['cs', 'y'], outputs=['cost'])
    def cost(self, cs, y, n_patches):
        """Masked mean cross-entropy of the prediction from the last patch.

        cs -- assumed (batch, n_patches, hidden_dim); TODO confirm
        y -- (batch, n_positions) int targets; y[:, -1] is the digit count.
        """
        max_length = len(self.n_classes) - 1
        # Lower-triangular lookup: row k enables only the first k+1 positions,
        # so predictions for digits beyond the true length are not penalized.
        _length_masks = theano.shared(
            np.tril(np.ones((max_length, max_length), dtype='int8')),
            name='shared_length_masks')
        lengths = y[:, -1]
        length_masks = _length_masks[lengths]

        mean_cross_entropies = []
        error_rates = []
        for t in xrange(n_patches):
            energies = [emitter.apply(cs[:, t, :])
                        for emitter in self.emitters]
            mean_cross_entropies.append(
                sum(self.softmax.categorical_cross_entropy(y[:, i], energy)
                    # to avoid punishing predictions of nonexistent digits:
                    * (length_masks[:, i] if i < max_length else 1)
                    for i, energy in enumerate(energies)).mean())
            # FIXME: do proper logprob-minimizing prediction of length
            error_rates.append(
                T.stack(*[T.neq(y[:, i], energy.argmax(axis=1))
                          # to avoid punishing predictions of nonexistent digits:
                          * (length_masks[:, i] if i < max_length else 1)
                          for i, energy in enumerate(energies)])
                .any(axis=0).mean())
        self.add_auxiliary_variable(mean_cross_entropies[-1],
                                    name="cross_entropy")
        self.add_auxiliary_variable(error_rates[-1], name="error_rate")
        # minimize the mean cross entropy over time and over batch
        cost = mean_cross_entropies[-1]
        return cost
class Emitter(Initializable):
    """Per-position MLP heads (optionally batch-normalized) on shared features.

    Parameters
    ----------
    hidden_dim : int
        Dimensionality of the per-patch feature vectors ``cs``.
    n_classes : sequence of int
        Output vocabulary size per position; the last entry is the length
        head (targets carry the sequence length in ``y[:, -1]``).
    batch_normalize : bool
        Forwarded to ``masonry.construct_mlp``.
    """

    def __init__(self, hidden_dim, n_classes, batch_normalize, **kwargs):
        super(Emitter, self).__init__(**kwargs)

        self.hidden_dim = hidden_dim
        self.n_classes = n_classes

        # TODO: use TensorLinear or some such
        self.emitters = [
            masonry.construct_mlp(
                activations=[None, Identity()],
                input_dim=hidden_dim,
                # ``//`` keeps the hidden width an int under true division
                hidden_dims=[hidden_dim // 2, n],
                name="mlp_%i" % i,
                batch_normalize=batch_normalize,
                initargs=dict(weights_init=Orthogonal(),
                              biases_init=Constant(0)))
            for i, n in enumerate(self.n_classes)]
        self.softmax = Softmax()

        self.children = self.emitters + [self.softmax]

    # some day: @application(...) def feedback(self, h)

    @application(inputs=['cs', 'y'], outputs=['cost'])
    def cost(self, cs, y, n_patches):
        """Masked mean cross-entropy of the prediction from the last patch.

        cs -- assumed (batch, n_patches, hidden_dim); TODO confirm
        y -- (batch, n_positions) int targets; y[:, -1] is the digit count.
        """
        max_length = len(self.n_classes) - 1
        # Lower-triangular lookup: row k enables only the first k+1 positions.
        _length_masks = theano.shared(
            np.tril(np.ones((max_length, max_length), dtype='int8')),
            name='shared_length_masks')
        lengths = y[:, -1]
        length_masks = _length_masks[lengths]

        mean_cross_entropies = []
        error_rates = []
        for t in xrange(n_patches):
            energies = [emitter.apply(cs[:, t, :])
                        for emitter in self.emitters]
            mean_cross_entropies.append(
                sum(self.softmax.categorical_cross_entropy(y[:, i], energy)
                    # to avoid punishing predictions of nonexistent digits:
                    * (length_masks[:, i] if i < max_length else 1)
                    for i, energy in enumerate(energies)).mean())
            # FIXME: do proper logprob-minimizing prediction of length
            error_rates.append(
                T.stack(*[T.neq(y[:, i], energy.argmax(axis=1))
                          # to avoid punishing predictions of nonexistent digits:
                          * (length_masks[:, i] if i < max_length else 1)
                          for i, energy in enumerate(energies)])
                .any(axis=0).mean())
        self.add_auxiliary_variable(mean_cross_entropies[-1],
                                    name="cross_entropy")
        self.add_auxiliary_variable(error_rates[-1], name="error_rate")
        # minimize the mean cross entropy over time and over batch
        cost = mean_cross_entropies[-1]
        return cost
class Seq2Seq(Initializable):
    """ seq2seq model

    Encodes a word sequence into a single vector and decodes it back,
    optionally penalizing the distance between the encoded vector and the
    embedding of a "key" word (e.g. the word being defined).

    Parameters
    ----------
    emb_dim: int
        The dimension of word embeddings (including for def model if standalone)
    dim : int
        The dimension of the RNNs states (including for def model if standalone)
    num_input_words : int
        The size of the LM's input vocabulary.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    """

    def __init__(self, emb_dim, dim, num_input_words, num_output_words,
                 vocab, proximity_coef=0, proximity_distance='l2',
                 encoder='lstm', decoder='lstm', shared_rnn=False,
                 translate_layer=None, word_dropout=0., tied_in_out=False,
                 vocab_keys=None, seed=0, reconstruction_coef=1.,
                 provide_targets=False, **kwargs):
        """
        translate_layer: either a string containing the activation function
        to use, or a list containing the list of activations for a MLP
        """
        # 0 means "use the default": emb_dim falls back to the RNN dim,
        # vocabulary sizes fall back to the full vocabulary.
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if num_output_words == 0:
            num_output_words = vocab.size()

        self._word_dropout = word_dropout
        self._tied_in_out = tied_in_out

        # --- argument validation -----------------------------------------
        if not encoder:
            if proximity_coef:
                raise ValueError("Err: meaningless penalty term (no encoder)")
            if not vocab_keys:
                raise ValueError("Err: specify a key vocabulary (no encoder)")
        if tied_in_out and num_input_words != num_output_words:
            raise ValueError("Can't tie in and out embeddings. Different "
                             "vocabulary size")
        if shared_rnn and (encoder != 'lstm' or decoder != 'lstm'):
            raise ValueError("can't share RNN because either encoder or decoder"
                             "is not an RNN")
        if shared_rnn and decoder == 'lstm_c':
            raise ValueError("can't share RNN because the decoder takes different"
                             "inputs")
        if word_dropout < 0 or word_dropout > 1:
            raise ValueError("invalid value for word dropout",
                             str(word_dropout))
        if proximity_distance not in ['l1', 'l2', 'cos']:
            raise ValueError(
                "unrecognized distance: {}".format(proximity_distance))
        if proximity_coef and emb_dim != dim and not translate_layer:
            raise ValueError(
                """if proximity penalisation, emb_dim should equal dim or there should be a translate layer""")
        if encoder not in [None, 'lstm', 'bilstm', 'mean', 'weighted_mean',
                           'max_bilstm', 'bilstm_sum', 'max_bilstm_sum']:
            raise ValueError('encoder not recognized')
        if decoder not in ['skip-gram', 'lstm', 'lstm_c']:
            raise ValueError('decoder not recognized')

        self._proximity_distance = proximity_distance
        self._decoder = decoder
        self._encoder = encoder
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._proximity_coef = proximity_coef
        self._reconstruction_coef = reconstruction_coef
        self._provide_targets = provide_targets

        self._word_to_id = WordToIdOp(self._vocab)
        if vocab_keys:
            self._key_to_id = WordToIdOp(vocab_keys)

        children = []

        # --- embedding lookups -------------------------------------------
        if encoder or (not encoder and decoder in ['lstm', 'lstm_c']):
            self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                            name='main_lookup')
            children.append(self._main_lookup)
        if provide_targets:
            # this is useful to simulate Hill's baseline without pretrained
            # embeddings in the encoder, only as targets for the encoder.
            self._target_lookup = LookupTable(self._num_input_words, emb_dim,
                                              name='target_lookup')
            children.append(self._target_lookup)

        # --- encoder bricks ----------------------------------------------
        if not encoder:
            self._key_lookup = LookupTable(vocab_keys.size(), emb_dim,
                                           name='key_lookup')
            children.append(self._key_lookup)
        elif encoder == 'lstm':
            self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
            self._encoder_rnn = LSTM(dim, name='encoder_rnn')
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder in ['bilstm', 'max_bilstm']:
            # dim is the dim of the concatenated vector
            self._encoder_fork = Linear(emb_dim, 2 * dim, name='encoder_fork')
            self._encoder_rnn = Bidirectional(LSTM(dim / 2, name='encoder_rnn'))
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder in ['bilstm_sum', 'max_bilstm_sum']:
            self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
            self._encoder_rnn = BidirectionalSum(LSTM(dim, name='encoder_rnn'))
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder == 'mean':
            # plain mean of the input embeddings: no parameters needed
            pass
        elif encoder == 'weighted_mean':
            self._encoder_w = MLP([Logistic()], [dim, 1],
                                  name="encoder_weights")
            children.extend([self._encoder_w])
        else:
            raise NotImplementedError()

        # --- decoder bricks ----------------------------------------------
        if decoder in ['lstm', 'lstm_c']:
            dim_after_translate = emb_dim
            if shared_rnn:
                # reuse the encoder's fork/RNN (already registered as children)
                self._decoder_fork = self._encoder_fork
                self._decoder_rnn = self._encoder_rnn
            else:
                if decoder == 'lstm_c':
                    # 'lstm_c' concatenates the encoded vector to every input
                    dim_2 = dim + emb_dim
                else:
                    dim_2 = dim
                self._decoder_fork = Linear(dim_2, 4 * dim, name='decoder_fork')
                self._decoder_rnn = LSTM(dim, name='decoder_rnn')
                children.extend([self._decoder_fork, self._decoder_rnn])
        elif decoder == 'skip-gram':
            dim_after_translate = emb_dim

        # optional MLP between encoder output and decoder input
        self._translate_layer = None
        activations = {'sigmoid': Logistic(), 'tanh': Tanh(), 'linear': None}
        if translate_layer:
            if type(translate_layer) == str:
                translate_layer = [translate_layer]
            assert (type(translate_layer) == list)
            activations_translate = [activations[a] for a in translate_layer]
            dims_translate = [dim, ] * len(translate_layer) + \
                [dim_after_translate]
            self._translate_layer = MLP(activations_translate, dims_translate,
                                        name="translate_layer")
            children.append(self._translate_layer)

        # output projection + softmax
        if not self._tied_in_out:
            self._pre_softmax = Linear(emb_dim, self._num_output_words)
            children.append(self._pre_softmax)
        if decoder in ['lstm', 'lstm_c']:
            self._softmax = NDimensionalSoftmax()
        elif decoder in ['skip-gram']:
            self._softmax = Softmax()
        children.append(self._softmax)

        super(Seq2Seq, self).__init__(children=children, **kwargs)

    def _allocate(self):
        pass

    def _initialize(self):
        pass

    def get_embeddings_entries(self):
        # words in the same order as the lookup-table rows
        return self._vocab.words

    def set_def_embeddings(self, embeddings, lookup='main'):
        """Overwrite the weights of the chosen lookup table in place."""
        if lookup == 'main':
            self._main_lookup.parameters[0].set_value(
                embeddings.astype(floatX))
        elif lookup == 'target':
            self._target_lookup.parameters[0].set_value(
                embeddings.astype(floatX))
        else:
            raise ValueError('Requested embedding not understood')

    def get_def_embeddings_params(self, lookup='main'):
        """Return the shared variable holding the chosen embedding matrix."""
        if lookup == 'main':
            return self._main_lookup.parameters[0]
        elif lookup == 'key':
            return self._key_lookup.parameters[0]
        elif lookup == 'target':
            return self._target_lookup.parameters[0]
        else:
            raise ValueError('Requested embedding not understood')

    def add_perplexity_measure(self, application_call, minus_logs, mask, name):
        """Register a masked perplexity monitor; return the mean masked CE."""
        sum_ce = (minus_logs * mask).sum()
        perplexity = T.exp(sum_ce / mask.sum())
        perplexity.tag.aggregation_scheme = Perplexity(sum_ce, mask.sum())
        application_call.add_auxiliary_variable(perplexity, name=name)
        return sum_ce / mask.sum()

    @application
    def apply(self, application_call, words, mask, keys=None,
              n_identical_keys=None, train_phase=True):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            step, B is the batch size. Note that this order of the axis is
            different from what all RNN bricks consume, hence and the axis
            should be transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.
        keys
            An integer matrix of shape (B). It contains the words that are
            defined in the corresponding rows in words.
        """
        # NOTE(review): truth-testing `keys` here relies on it being None /
        # non-tensor at graph-build time -- confirm callers never pass a
        # symbolic variable where this test would be ambiguous.
        if not keys and self._proximity_coef != 0:
            raise ValueError(
                "Err: should provide keys when using penalty term")
        if not self._encoder and not keys:
            raise ValueError("Err: should provide keys (no encoder)")

        word_ids = self._word_to_id(words)
        if keys:
            key_ids = self._word_to_id(keys)

        # dropout
        unk = self._vocab.unk
        if self._word_dropout > 0 and train_phase:
            dropout_mask = T.ones_like(word_ids, dtype=int)
            dropout_mask = get_dropout_mask(dropout_mask, self._word_dropout)
            # this gives a matrix of 0 (dropped word) and ones (kept words)
            # replace 0s by unk token and 1s by word ids
            word_ids_dropped = (T.eq(dropout_mask, 1) * word_ids +
                                T.eq(dropout_mask, 0) * unk)
            word_ids_in = word_ids_dropped
        else:
            word_ids_in = word_ids

        # shortlisting: ids beyond the vocabulary cutoffs map to <unk>
        # input_word_ids uses word dropout
        input_word_ids = (T.lt(word_ids_in, self._num_input_words) *
                          word_ids_in +
                          T.ge(word_ids_in, self._num_input_words) * unk)
        output_word_ids = (T.lt(word_ids, self._num_output_words) * word_ids +
                           T.ge(word_ids, self._num_output_words) * unk)

        if self._encoder or self._decoder != 'skip-gram':
            input_embeddings = self._main_lookup.apply(input_word_ids)

        # Encoder: reduce the (B, T, emb_dim) embeddings to one vector per row
        if self._encoder == 'lstm' or 'bilstm' in self._encoder:
            encoder_rnn_states = self._encoder_rnn.apply(
                T.transpose(self._encoder_fork.apply(input_embeddings),
                            (1, 0, 2)),
                mask=mask.T)[0]
            if self._encoder in ['lstm', 'bilstm', 'bilstm_sum']:
                # last hidden state
                gen_embeddings = encoder_rnn_states[-1]
            elif self._encoder in ['max_bilstm', 'max_bilstm_sum']:
                mask_bc = T.addbroadcast(mask.dimshuffle(0, 1, 'x'), 2)
                # (bs,L,dim); padded positions pushed to -10^8 before the max
                gen_embeddings = (input_embeddings * mask_bc +
                                  (1 - mask_bc) * -10**8).max(axis=1)
            else:
                raise ValueError("encoder {} apply not specific".format(
                    self._encoder))
        elif self._encoder == 'mean':
            mask_bc = T.addbroadcast(mask.dimshuffle(0, 1, 'x'), 2)
            gen_embeddings = (input_embeddings * mask_bc).mean(axis=1)
        elif self._encoder == 'weighted_mean':
            mask_bc = T.addbroadcast(mask.dimshuffle(0, 1, 'x'), 2)
            weights = self._encoder_w.apply(input_embeddings)
            weights = T.addbroadcast(weights, 2)
            weights = weights * mask_bc
            gen_embeddings = (input_embeddings * weights).mean(axis=1)
        elif not self._encoder:
            # no encoder: look the key word up directly
            gen_embeddings = self._key_lookup.apply(key_ids)
        else:
            raise NotImplementedError()

        # Optional translation layer
        if self._translate_layer:
            in_decoder = self._translate_layer.apply(gen_embeddings)
        else:
            in_decoder = gen_embeddings  # (bs, dim)

        application_call.add_auxiliary_variable(in_decoder.copy(),
                                                name="embeddings")

        # Decoder
        if self._decoder in ['lstm', 'lstm_c']:
            if self._decoder == 'lstm_c':
                # concatenate the encoded vector to every timestep's input
                tiled_in_decoder = T.tile(in_decoder.dimshuffle(0, 'x', 1),
                                          (input_embeddings.shape[1], 1))
                input_embeddings = T.concatenate(
                    [input_embeddings, tiled_in_decoder], axis=2)
            decoded = self._decoder_rnn.apply(
                inputs=T.transpose(self._decoder_fork.apply(input_embeddings),
                                   (1, 0, 2)),
                mask=mask.T,
                states=in_decoder)[0]  # size (L, bs, dim)
            n_dim_decoded = 3
        elif self._decoder == 'skip-gram':
            decoded = in_decoder  # size (bs, dim)
            n_dim_decoded = 2
        else:
            raise NotImplementedError()

        # we ignore the <bos> token
        targets = output_word_ids.T[1:]  # (L-1, bs)
        targets_mask = mask.T[1:]  # (L-1,bs)

        # Compute log probabilities
        if n_dim_decoded == 2:
            # Case where we have only one distrib for all timesteps: skip-gram
            if self._tied_in_out:
                W_out = self.get_def_embeddings_params().transpose()  # (dim, V)
                logits = T.dot(decoded, W_out)  # (bs, dim) x (dim,V) = (bs,V)
            else:
                logits = self._pre_softmax.apply(decoded)  # (bs, V)
            size_batch, length_sentence = output_word_ids.shape
            normalized_logits = self._softmax.log_probabilities(
                logits)  # (bs, V)
            # flat indexing trick: row offsets + target ids pick out the
            # log-probability of every target token
            indices = (targets.T + T.addbroadcast(
                (T.arange(size_batch) * logits.shape[1]).dimshuffle(
                    0, 'x'), 1)).flatten()  # (bs*L)
            minus_logs = -normalized_logits.flatten()[indices].reshape(
                (size_batch, length_sentence - 1)).T  # (L-1, bs)
        elif n_dim_decoded == 3:
            # Case where decoding is time dependent: recurrent decoders
            if self._tied_in_out:
                raise NotImplementedError()
                # TODO: implement... seems annoying because we need to replace
                # in the already implemented block code
            else:
                logits = self._pre_softmax.apply(decoded[:-1])  # (L-1, bs, V)
                minus_logs = self._softmax.categorical_cross_entropy(
                    targets, logits, extra_ndim=1)

        avg_CE = self.add_perplexity_measure(application_call, minus_logs,
                                             targets_mask, "perplexity")
        costs = self._reconstruction_coef * avg_CE

        # Optional proximity penalty: pull the encoded vector towards the
        # embedding of the key word.
        if self._proximity_coef > 0:
            if not self._encoder:
                key_ids = self._key_to_id(keys)
            else:
                key_ids = self._word_to_id(keys)
            # shortlist: if we don't use all the input embeddings, we need to
            # shortlist so that there isn't a key error
            key_ids = (T.lt(key_ids, self._num_input_words) * key_ids +
                       T.ge(key_ids, self._num_input_words) * unk)
            if self._provide_targets:
                key_embeddings = self._target_lookup.apply(
                    key_ids)  # (bs, emb_dim)
            else:
                key_embeddings = self._main_lookup.apply(
                    key_ids)  # (bs, emb_dim)
            # don't penalize on UNK:
            mask = T.neq(key_ids, unk) * T.lt(key_ids, self._num_input_words)
            # average over dimension, and then manual averaging using the mask
            eps = T.constant(10**-6)
            if self._proximity_distance in ['l1', 'l2']:
                if self._proximity_distance == 'l1':
                    diff_embeddings = T.abs_(key_embeddings - in_decoder)
                else:
                    diff_embeddings = (key_embeddings - in_decoder)**2
                mask = mask.reshape((-1, 1))
                sum_proximity_term = T.sum(
                    T.mean(diff_embeddings * mask, axis=1))
                proximity_term = sum_proximity_term / (T.sum(mask) + eps)
            elif self._proximity_distance == 'cos':
                # numerator
                # TODO: debug
                mask = mask.reshape((-1, 1))  # (bs, 1)
                masked_keys = key_embeddings * mask
                masked_gen = in_decoder * mask
                dot_product_vector = T.sum(masked_keys * masked_gen,
                                           axis=1)  # (bs)
                # denominator
                product_sqr_norms = T.sum((masked_keys)**2, axis=1) * T.sum(
                    (masked_gen)**2, axis=1)
                denominator = T.sqrt(product_sqr_norms + eps)  # (bs)
                # negative cosine similarity, mask-averaged
                proximity_term = -T.sum(dot_product_vector / denominator) / (
                    T.sum(mask) + eps)
            application_call.add_auxiliary_variable(proximity_term.copy(),
                                                    name="proximity_term")
            costs = costs + self._proximity_coef * proximity_term
        return costs
# Build the classifier graph: a 784-1200-1200-200 rectifier MLP followed by
# a linear 10-way readout, with NLL / misclassification monitors for both
# the training and the test outputs.
rect = Rectifier()
mlp = MLP(dims=[784, 1200, 1200, 200], activations=[rect, rect, rect],
          seed=10)
mlp.weights_init = Uniform(0.0, 0.01)
mlp.biases_init = Constant(0.0)
mlp.initialize()

lin = Linear(200, 10, use_bias=True)
lin.weights_init = Uniform(0.0, 0.01)
lin.biases_init = Constant(0.0)
lin.initialize()

# Two separate applications of the same bricks (train/test kept symmetric).
train_out = lin.apply(mlp.apply(flat_x))
test_out = lin.apply(mlp.apply(flat_x))

sm = Softmax(name='softmax')


def _named(variable, label):
    # Attach a monitoring name and hand the variable back.
    variable.name = label
    return variable


loss = _named(sm.categorical_cross_entropy(flat_y, train_out).mean(), 'nll')
misclass = _named(MisclassificationRate().apply(flat_y, train_out),
                  'misclass')
test_loss = _named(sm.categorical_cross_entropy(flat_y, test_out).mean(),
                   'nll')
test_misclass = _named(MisclassificationRate().apply(flat_y, test_out),
                       'misclass')

model = Model(loss)

######################
# Data
######################
import numpy
################### from blocks.bricks import Softmax from blocks.bricks.cost import MisclassificationRate W2 = theano.shared( numpy.random.normal(size=(n_out, num_protos)).astype('float32')) b = theano.shared(numpy.zeros((num_protos, )).astype('float32')) y = tensor.ivector('y') h = tensor.dot(h3, W2) + b h = tensor.switch(h < 0, -h, h) sm = Softmax() pred = sm.apply(h) misclass = MisclassificationRate().apply(y, pred) c = sm.categorical_cross_entropy(y, h).mean() s_params = [W2, b] s_grad = theano.grad(c, s_params) s_updates = [p - numpy.float32(0.05) * g for p, g in zip(s_params, s_grad)] s_f = theano.function([h3, y], [c, misclass], updates=zip(s_params, s_updates)) s_pred = theano.function([h3], pred) for j in range(200): for i in range(n_batches): if i == 0: print s_f(data[i * batch_size:(i + 1) * batch_size, :], labels[i * batch_size:(i + 1) * batch_size]) else: s_f(data[i * batch_size:(i + 1) * batch_size, :], labels[i * batch_size:(i + 1) * batch_size])
class LookUpTrain(Initializable, Feedforward):
    """Window-based tagger: lookup-table window encoder + softmax classifier.

    Parameters
    ----------
    dwin : int
        Context window width (in words).
    n_mot : int
        Vocabulary size (presumably "mot" = word; confirm with ``Window``).
    vect_size : int
        Word-embedding dimension.
    n_hidden : int or list of int
        Hidden layer width(s); a bare int means one hidden layer.
    n_out : int
        Number of output classes.
    """

    @lazy(allocation=['dwin', 'n_mot', 'vect_size', 'n_hidden'])
    def __init__(self, dwin, n_mot, vect_size, n_hidden, n_out=2, **kwargs):
        self.dwin = dwin
        self.n_mot = n_mot
        self.vect_size = vect_size
        if isinstance(n_hidden, int):
            # accept a single width as shorthand for one hidden layer
            self.n_hidden = [n_hidden]
        else:
            self.n_hidden = n_hidden
        self.n_out = n_out
        self.window = Window(self.dwin, self.n_mot, self.vect_size,
                             self.n_hidden, self.n_out,
                             weights_init=IsotropicGaussian(0.001))
        super(LookUpTrain, self).__init__(**kwargs)
        self.softmax = Softmax()
        self.error = MisclassificationRate()
        self.children = [self.window, self.softmax, self.error]

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        """Forward a batch of word-id windows through the network."""
        return self.window.apply(input_)

    @application(inputs=['x', 'y'], outputs=['output'])
    def cost(self, x, y):
        """Categorical cross-entropy of the network output against ``y``."""
        return self.softmax.categorical_cross_entropy(y, self.apply(x))

    @application(inputs=['x', 'y'], outputs=['output'])
    def errors(self, x, y):
        """Misclassification rate of the network output against ``y``."""
        return self.error.apply(y, self.apply(x))

    @application(inputs=['x'], outputs=['output'])
    def predict(self, x):
        """Most likely class per example."""
        return T.argmax(self.apply(x), axis=1)

    @application(inputs=['x'], outputs=['output'])
    def predict_confidency(self, x):
        """Score of the most likely class per example."""
        return T.max(self.apply(x), axis=1)

    def update_lookup_weights(self):
        self.window.update_lookup_weights()

    @application(inputs=['input_', 'input_corrupt'], outputs=['output'])
    def score(self, input_, input_corrupt):
        """Ranking-style score of a clean window against a corrupted one.

        Removed: two unreachable statements that followed this return (an
        alternative hinge formulation and a commented-out variant) -- dead
        code, never executed.
        """
        # modify the input_ with an incorrect central word ?
        # NOTE(review): `1 - -self.apply(...)` is `1 + apply(...)`; the
        # double negative looks like a typo for `1 - self.apply(...)` --
        # confirm intent before changing (kept as-is to preserve behavior).
        return (1 - -self.apply(input_)).norm(2) + (
            self.apply(input_corrupt)).norm(2)

    def _initialize(self):
        self.window.initialize()

    @application(inputs=['input_'], outputs=['output'])
    def embedding(self, input_):
        """Embedding of a batch of windows, as produced by ``Window``."""
        return self.window.embedding(input_)

    def _allocate(self):
        self.window.allocate()

    def load(self, repo, filename):
        """Load pickled parameter values from ``repo/filename`` in place."""
        params = getParams(self, T.itensor3())
        with closing(open(os.path.join(repo, filename), 'rb')) as f:
            params_value = pickle.load(f)
        for p, p_value in zip(params, params_value):
            p.set_value(p_value.get_value())

    def get_Params(self):
        return self.window.get_Params()

    def save(self, repo, filename):
        """Pickle the parameters to ``repo/filename_<k>``, never overwriting.

        Appends the first free integer suffix so existing checkpoints are
        preserved.
        """
        params = getParams(self, T.itensor3())
        index = 0
        while os.path.isfile(os.path.join(repo,
                                          filename + "_" + str(index))):
            index += 1
        filename = filename + "_" + str(index)
        with closing(open(os.path.join(repo, filename), 'wb')) as f:
            pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
# Classifier graph: rectifier MLP (784 -> 1200 -> 1200 -> 200) + linear
# 10-way readout; monitored with NLL and misclassification rate.
rect = Rectifier()
mlp = MLP(dims=[784, 1200, 1200, 200],
          activations=[rect, rect, rect],
          seed=10)
mlp.weights_init, mlp.biases_init = Uniform(0.0, 0.01), Constant(0.0)
mlp.initialize()

lin = Linear(200, 10, use_bias=True)
lin.weights_init, lin.biases_init = Uniform(0.0, 0.01), Constant(0.0)
lin.initialize()

# Separate train/test applications of the same bricks.
train_out = lin.apply(mlp.apply(flat_x))
test_out = lin.apply(mlp.apply(flat_x))

sm = Softmax(name='softmax')

loss = sm.categorical_cross_entropy(flat_y, train_out).mean()
loss.name = 'nll'
misclass = MisclassificationRate().apply(flat_y, train_out)
misclass.name = 'misclass'

test_loss = sm.categorical_cross_entropy(flat_y, test_out).mean()
test_loss.name = 'nll'
test_misclass = MisclassificationRate().apply(flat_y, test_out)
test_misclass.name = 'misclass'

model = Model(loss)

######################
# Data
######################
import numpy
#### Softmax ################### from blocks.bricks import Softmax from blocks.bricks.cost import MisclassificationRate W2 = theano.shared(numpy.random.normal(size=(n_out, num_protos)).astype('float32')) b = theano.shared(numpy.zeros((num_protos,)).astype('float32')) y = tensor.ivector('y') h = tensor.dot(h3, W2) + b h = tensor.switch(h < 0, -h , h) sm = Softmax() pred = sm.apply(h) misclass = MisclassificationRate().apply(y, pred) c = sm.categorical_cross_entropy(y, h).mean() s_params = [W2, b] s_grad = theano.grad(c, s_params) s_updates = [p - numpy.float32(0.05)*g for p, g in zip(s_params, s_grad)] s_f = theano.function([h3, y], [c, misclass], updates=zip(s_params, s_updates)) s_pred = theano.function([h3], pred) for j in range(200): for i in range(n_batches): if i == 0: print s_f(data[i*batch_size:(i+1)*batch_size, :], labels[i*batch_size:(i+1)*batch_size]) else: s_f(data[i*batch_size:(i+1)*batch_size, :], labels[i*batch_size:(i+1)*batch_size]) print b.get_value()