def apply(self, input_lb, input_un, target):
    """Build the ladder-network training graph.

    Concatenates labeled and unlabeled inputs, runs a clean and a noisy
    encoder pass plus the denoising decoder, and attaches classification
    costs, the total cost, and the error rate to ``self``.
    """
    batch_size = input_lb.shape[0]
    # Labeled examples occupy the first ``batch_size`` rows of every
    # concatenated tensor; this slices them back out.
    get_labeled = lambda x: x[:batch_size] if x is not None else x
    input = T.concatenate([input_lb, input_un], axis=0)
    self.layer_dims = {0: self.input_dim}
    self.lr = self.shared(self.default_lr, 'learning_rate', role=None)
    top = len(self.layers) - 1
    # Two encoder passes: noiseless (for targets/eval) and corrupted.
    clean = self.encoder(input, noise_std=[0])
    corr = self.encoder(input, noise_std=self.noise_std)
    ests, costs = self.decoder(clean, corr, batch_size)
    # Costs
    y = target.flatten()
    costs.class_clean = CategoricalCrossEntropy().apply(
        y, get_labeled(clean.h[top]))
    costs.class_clean.name = 'CE_clean'
    costs.class_corr = CategoricalCrossEntropy().apply(
        y, get_labeled(corr.h[top]))
    costs.class_corr.name = 'CE_corr'
    # Total cost = supervised CE on the corrupted pass plus the weighted
    # per-layer denoising costs.
    costs.total = costs.class_corr * 1.0
    for i in range(len(self.layers)):
        costs.total += costs.denois[i] * self.denoising_cost_x[i]
    costs.total.name = 'Total_cost'
    self.costs = costs
    # Classification error (reported in percent, hence the * 100)
    mr = MisclassificationRate()
    self.error = mr.apply(y, get_labeled(clean.h[top])) * np.float32(100.)
    self.error.name = 'Error_rate'
def main(save_to, num_epochs):
    """Train a 784-100-10 MLP on MNIST with L2 regularization and run the
    Blocks main loop, checkpointing to ``save_to``."""
    network = MLP([Tanh(), Softmax()], [784, 100, 10],
                  weights_init=IsotropicGaussian(0.01),
                  biases_init=Constant(0))
    network.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = network.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)
    graph = ComputationGraph([cost])
    weight_hidden, weight_output = VariableFilter(roles=[WEIGHT])(graph.variables)
    # L2 penalty on both weight matrices.
    cost = cost + .00005 * (weight_hidden ** 2).sum() \
                + .00005 * (weight_output ** 2).sum()
    cost.name = 'final_cost'
    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))
    algorithm = GradientDescent(cost=cost, parameters=graph.parameters,
                                step_rule=Scale(learning_rate=0.1))
    test_stream = Flatten(
        DataStream.default_stream(
            mnist_test,
            iteration_scheme=SequentialScheme(mnist_test.num_examples, 500)),
        which_sources=('features',))
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate], test_stream, prefix="test"),
        TrainingDataMonitoring(
            [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train",
            after_epoch=True),
        Checkpoint(save_to),
        Printing(),
    ]
    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[['test_final_cost',
                       'test_misclassificationrate_apply_error_rate'],
                      ['train_total_gradient_norm']]))
    train_stream = Flatten(
        DataStream.default_stream(
            mnist_train,
            iteration_scheme=SequentialScheme(mnist_train.num_examples, 50)),
        which_sources=('features',))
    MainLoop(algorithm, train_stream,
             model=Model(cost), extensions=extensions).run()
def setup_model(configs): tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5) # shape: T x B x C x X x Y input_ = tensor5("features") tensor3 = theano.tensor.TensorType(config.floatX, (False,) * 3) locs = tensor3("locs") # shape: B x Classes target = T.ivector("targets") model = LSTMAttention(configs, weights_init=Glorot(), biases_init=Constant(0)) model.initialize() (h, c, location, scale, alpha, patch, downn_sampled_input, conved_part_1, conved_part_2, pre_lstm) = model.apply( input_, locs ) model.location = location model.scale = scale model.alpha = location model.patch = patch classifier = MLP( [Rectifier(), Softmax()], configs["classifier_dims"], weights_init=Glorot(), biases_init=Constant(0) ) classifier.initialize() probabilities = classifier.apply(h[-1]) cost = CategoricalCrossEntropy().apply(target, probabilities) cost.name = "CE" error_rate = MisclassificationRate().apply(target, probabilities) error_rate.name = "ER" model.cost = cost model.error_rate = error_rate model.probabilities = probabilities if configs["load_pretrained"]: blocks_model = Model(model.cost) all_params = blocks_model.parameters with open("VGG_CNN_params.npz") as f: loaded = np.load(f) all_conv_params = loaded.keys() for param in all_params: if param.name in loaded.keys(): assert param.get_value().shape == loaded[param.name].shape param.set_value(loaded[param.name]) all_conv_params.pop(all_conv_params.index(param.name)) print "the following parameters did not match: " + str(all_conv_params) if configs["test_model"]: print "TESTING THE MODEL: CHECK THE INPUT SIZE!" cg = ComputationGraph(model.cost) f = theano.function(cg.inputs, [model.cost], on_unused_input="ignore", allow_input_downcast=True) data = configs["get_streams"](configs["batch_size"])[0].get_epoch_iterator().next() f(data[1], data[0], data[2]) print "Test passed! ;)" model.monitorings = [cost, error_rate] return model
def main(save_to, num_epochs, bokeh=False):
    """Train a 784-100-10 MLP on MNIST with L2 regularization.

    Written against an older Blocks/Fuel API (``params=`` on
    GradientDescent, ``MNIST("train")`` constructor, plain ``DataStream``).
    When ``bokeh`` is true a live Plot extension is added.
    """
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    # Symbolic inputs: image rows and integer class targets.
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)
    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    # L2 penalty on both weight matrices.
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'
    mnist_train = MNIST("train")
    mnist_test = MNIST("test")
    # NOTE(review): ``params=`` is the pre-rename keyword; newer Blocks
    # uses ``parameters=`` -- confirm against the pinned version.
    algorithm = GradientDescent(
        cost=cost, params=cg.parameters, step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      DataStream(mnist_test,
                                 iteration_scheme=SequentialScheme(
                                     mnist_test.num_examples, 500)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]
    if bokeh:
        extensions.append(Plot(
            'MNIST example', channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm']]))
    main_loop = MainLoop(
        algorithm,
        DataStream(mnist_train,
                   iteration_scheme=SequentialScheme(
                       mnist_train.num_examples, 50)),
        model=Model(cost),
        extensions=extensions)
    main_loop.run()
def apply(self, input_, target):
    """Build an MLP over the flattened input and stash the probability
    and cost variables in ``self.outputs``."""
    classifier = MLP(self.non_lins, self.dims,
                     weights_init=IsotropicGaussian(0.01),
                     biases_init=Constant(0),
                     name=self.name)
    classifier.initialize()
    probs = classifier.apply(T.flatten(input_, outdim=2))
    probs.name = 'probs'
    cost = CategoricalCrossEntropy().apply(target.flatten(), probs)
    cost.name = "CE"
    # Expose the graph endpoints for callers.
    self.outputs = {'probs': probs, 'cost': cost}
def main(save_to, num_epochs):
    """Train a 784-100-10 MLP on MNIST.

    Written against an early Blocks API: ``WEIGHTS`` role,
    ``SteepestDescent``, ``SerializeMainLoop``, ``after_every_epoch`` and
    the positional ``MainLoop(model, stream, algorithm, ...)`` signature.
    """
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)
    cg = ComputationGraph([cost])
    # NOTE(review): modern Blocks exposes WEIGHT (singular); WEIGHTS only
    # exists in old releases -- confirm against the pinned version.
    W1, W2 = VariableFilter(roles=[WEIGHTS])(cg.variables)
    # L2 penalty on both weight matrices.
    cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum()
    cost.name = 'final_cost'
    mnist_train = MNIST("train")
    mnist_test = MNIST("test")
    # NOTE(review): no ``parameters=`` argument here -- the old API derived
    # the parameter list from the cost graph itself.
    algorithm = GradientDescent(cost=cost,
                                step_rule=SteepestDescent(learning_rate=0.1))
    main_loop = MainLoop(
        mlp,
        DataStream(mnist_train,
                   iteration_scheme=SequentialScheme(mnist_train.num_examples,
                                                     50)),
        algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=num_epochs),
            DataStreamMonitoring([cost, error_rate],
                                 DataStream(mnist_test,
                                            iteration_scheme=SequentialScheme(
                                                mnist_test.num_examples, 500)),
                                 prefix="test"),
            TrainingDataMonitoring([
                cost, error_rate,
                aggregation.mean(algorithm.total_gradient_norm)
            ],
                prefix="train",
                after_every_epoch=True),
            SerializeMainLoop(save_to),
            Plot('MNIST example',
                 channels=[[
                     'test_final_cost',
                     'test_misclassificationrate_apply_error_rate'
                 ], ['train_total_gradient_norm']]),
            Printing()
        ])
    main_loop.run()
def main(save_to, num_epochs, batch_size):
    """Train a deep MLP (3072-4096-1024-512-10) on CIFAR-10 with Adam and
    L2 weight decay, logging to a fixed file and checkpointing."""
    network = MLP([Tanh(), Tanh(), Tanh(), Softmax()],
                  [3072, 4096, 1024, 512, 10],
                  weights_init=IsotropicGaussian(0.01),
                  biases_init=Constant(0))
    network.initialize()
    x = tt.tensor4('features', dtype='float32')
    y = tt.vector('label', dtype='int32')
    # Flatten each image to a 3072-vector before the MLP.
    predictions = network.apply(x.reshape((-1, 3072)))
    cost = CategoricalCrossEntropy().apply(y, predictions)
    error_rate = MisclassificationRate().apply(y, predictions)
    graph = ComputationGraph([cost])
    all_weights = VariableFilter(roles=[WEIGHT])(graph.variables)
    # L2 weight decay over every weight matrix.
    cost = cost + .00005 * sum((w ** 2).sum() for w in all_weights)
    cost.name = 'final_cost'
    train_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10',
                                   is_train=True)
    valid_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10',
                                   is_train=False)
    train_stream = train_dataset.get_stream(batch_size)
    valid_stream = valid_dataset.get_stream(batch_size)
    algorithm = GradientDescent(cost=cost,
                                parameters=graph.parameters,
                                step_rule=Adam(learning_rate=0.001))
    extensions = [
        Timing(),
        LogExtension('/home/belohlavek/ALI/mlp.log'),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate], valid_stream, prefix="test"),
        TrainingDataMonitoring(
            [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train",
            after_epoch=True),
        Checkpoint(save_to),
        Printing(),
    ]
    MainLoop(algorithm, train_stream,
             model=Model(cost), extensions=extensions).run()
def build_model(images, labels):
    """Assemble conv features + batch-normalized MLP head; return the
    L2-regularized cross-entropy cost."""
    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3,3), 16, (160, 160))
    bottom_conv_sequence._push_allocation_config()
    # Flatten layer
    flattener = Flattener()
    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    #top_mlp = MLP([Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')], [conv_out_dim, 1024, 10], weights_init=IsotropicGaussian(), biases_init=Constant(0))
    top_mlp = BatchNormalizedMLP(
        [Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')],
        [conv_out_dim, 1024, 10],
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0))
    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([bottom_conv_sequence.apply,
                                  flattener.apply,
                                  top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()
    prediction = ss_seq.apply(images)
    cost_noreg = CategoricalCrossEntropy().apply(labels.flatten(), prediction)
    # add regularization
    # Look the two Linear weight matrices up by their brick path; the path
    # prefix must match the (default) BatchNormalizedMLP brick name.
    selector = Selector([top_mlp])
    Ws = selector.get_parameters('W')
    mlp_brick_name = 'batchnormalizedmlp'
    W0 = Ws['/%s/linear_0.W' % mlp_brick_name]
    W1 = Ws['/%s/linear_1.W' % mlp_brick_name]
    cost = cost_noreg + .01 * (W0 ** 2).mean() + .01 * (W1 ** 2).mean()
    return cost
def __init__(self, config):
    """Build a small conv net over 3x32x32 inputs.

    Layer widths come from ``config`` (keys 'n_l1'..'n_l5').  Attaches
    ``self.pred``, ``self.cost`` and ``self.accur`` Theano variables.
    """
    self.X = T.tensor4("features")
    c = config
    seq = BrickSequence(
        input_dim=(3, 32, 32),
        bricks=[
            conv3(c['n_l1']),
            conv3(c['n_l2']),
            max_pool(),
            conv3(c['n_l3']),
            conv3(c['n_l4']),
            max_pool(),
            #conv3(10),
            #conv3(10),
            Flattener(),
            linear(c['n_l5']),
            Softmax()
        ])
    seq.initialize()
    self.pred = seq.apply(self.X)
    self.Y = T.imatrix("targets")
    self.cost = CategoricalCrossEntropy().apply(self.Y.flatten(), self.pred)
    self.cost.name = "cost"
    # Accuracy is reported as 1 - misclassification rate.
    self.accur = 1.0 - MisclassificationRate().apply(
        self.Y.flatten(), self.pred)
    self.accur.name = "accur"
def build_model(images, labels):
    """Assemble a convolutional feature extractor with a 3-layer MLP head
    and return the categorical cross-entropy training cost."""
    conv_layers = convolutional_sequence((3, 3), 64, (150, 150))
    conv_layers._push_allocation_config()
    to_vector = Flattener()
    # Width of the flattened conv output feeds the first MLP layer.
    flat_dim = numpy.prod(conv_layers.get_dim('output'))
    head = MLP([LeakyRectifier(name='non_linear_9'),
                LeakyRectifier(name='non_linear_10'),
                Softmax(name='non_linear_11')],
               [flat_dim, 2048, 612, 10],
               weights_init=IsotropicGaussian(),
               biases_init=Constant(1))
    network = FeedforwardSequence([conv_layers.apply,
                                   to_vector.apply,
                                   head.apply])
    network.push_initialization_config()
    network.initialize()
    prediction = network.apply(images)
    return CategoricalCrossEntropy().apply(labels.flatten(), prediction)
def setup_model():
    """Wire an LSTMAttention feature extractor into an MLP classifier and
    return the (cost, error_rate) Theano variables."""
    # shape: T x B x F
    sequences = T.tensor3('features')
    # shape: B
    labels = T.lvector('targets')
    attention_model = LSTMAttention(input_dim=10000,
                                    dim=500,
                                    mlp_hidden_dims=[2000, 500, 4],
                                    batch_size=100,
                                    image_shape=(100, 100),
                                    patch_shape=(28, 28),
                                    weights_init=IsotropicGaussian(0.01),
                                    biases_init=Constant(0))
    attention_model.initialize()
    h, c = attention_model.apply(sequences)
    classifier = MLP([Rectifier(), Softmax()], [500, 100, 10],
                     weights_init=IsotropicGaussian(0.01),
                     biases_init=Constant(0))
    classifier.initialize()
    # Classify from the final hidden state only.
    probabilities = classifier.apply(h[-1])
    cost = CategoricalCrossEntropy().apply(labels, probabilities)
    error_rate = MisclassificationRate().apply(labels, probabilities)
    return cost, error_rate
def __init__(self, split_idx, domain_classifier, **kwargs):
    """Unsupervised domain cost: gradient reversal feeding the given
    domain classifier, with a batchwise split and a cross-entropy brick
    registered as children (original order preserved)."""
    super(DomainUnsupervisedCost, self).__init__(**kwargs)
    splitter = BatchwiseSplit(split_idx)
    reverser = GradientReversal()
    xent = CategoricalCrossEntropy()
    self.children = [reverser, domain_classifier, splitter, xent]
def main(save_to, num_epochs, batch_size):
    """Train a deep MLP (3072-4096-1024-512-10) on CIFAR-10 with Adam and
    L2 weight decay, logging to a fixed file and checkpointing."""
    mlp = MLP([Tanh(), Tanh(), Tanh(), Softmax()],
              [3072, 4096, 1024, 512, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    # Images arrive as 4-D tensors and are flattened to 3072-vectors.
    x = tt.tensor4('features', dtype='float32')
    y = tt.vector('label', dtype='int32')
    probs = mlp.apply(x.reshape((-1,3072)))
    cost = CategoricalCrossEntropy().apply(y, probs)
    error_rate = MisclassificationRate().apply(y, probs)
    cg = ComputationGraph([cost])
    ws = VariableFilter(roles=[WEIGHT])(cg.variables)
    # L2 penalty over all weight matrices.
    cost = cost + .00005 * sum(([(w**2).sum() for w in ws]))
    cost.name = 'final_cost'
    train_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10',
                                   is_train=True)
    valid_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10',
                                   is_train=False)
    train_stream = train_dataset.get_stream(batch_size)
    valid_stream = valid_dataset.get_stream(batch_size)
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Adam(learning_rate=0.001))
    extensions = [Timing(),
                  LogExtension('/home/belohlavek/ALI/mlp.log'),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring([cost, error_rate],
                                       valid_stream,
                                       prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]
    main_loop = MainLoop(algorithm,
                         train_stream,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def setup_model():
    """Build an LSTM-attention classifier and attach the cost plus
    monitoring channels (mean attention location/scale at the first,
    middle and final timesteps).  Returns the model."""
    # shape: T x B x F
    input_ = T.tensor3('features')
    # shape: B
    target = T.lvector('targets')
    model = LSTMAttention(dim=256,
                          mlp_hidden_dims=[256, 4],
                          batch_size=100,
                          image_shape=(64, 64),
                          patch_shape=(16, 16),
                          weights_init=Glorot(),
                          biases_init=Constant(0))
    model.initialize()
    h, c, location, scale = model.apply(input_)
    classifier = MLP([Rectifier(), Softmax()], [256 * 2, 200, 10],
                     weights_init=Glorot(),
                     biases_init=Constant(0))
    model.h = h
    model.c = c
    model.location = location
    model.scale = scale
    classifier.initialize()
    # Classify from the concatenation of the final hidden and cell states.
    probabilities = classifier.apply(T.concatenate([h[-1], c[-1]], axis=1))
    cost = CategoricalCrossEntropy().apply(target, probabilities)
    error_rate = MisclassificationRate().apply(target, probabilities)
    model.cost = cost
    # Track the mean x-coordinate of the attention window at steps 0, 10
    # and the last step (names say "20"; the index used is -1).
    location_x_0_avg = T.mean(location[0, :, 0])
    location_x_0_avg.name = 'location_x_0_avg'
    location_x_10_avg = T.mean(location[10, :, 0])
    location_x_10_avg.name = 'location_x_10_avg'
    location_x_20_avg = T.mean(location[-1, :, 0])
    location_x_20_avg.name = 'location_x_20_avg'
    scale_x_0_avg = T.mean(scale[0, :, 0])
    scale_x_0_avg.name = 'scale_x_0_avg'
    scale_x_10_avg = T.mean(scale[10, :, 0])
    scale_x_10_avg.name = 'scale_x_10_avg'
    scale_x_20_avg = T.mean(scale[-1, :, 0])
    scale_x_20_avg.name = 'scale_x_20_avg'
    monitorings = [
        error_rate, location_x_0_avg, location_x_10_avg, location_x_20_avg,
        scale_x_0_avg, scale_x_10_avg, scale_x_20_avg
    ]
    model.monitorings = monitorings
    return model
def test_dataset_evaluators():
    """DatasetEvaluator must aggregate the same quantity identically under
    two registrations and match the brick-based cost channel."""
    X = theano.tensor.vector('X')
    Y = theano.tensor.vector('Y')
    data = [
        numpy.arange(1, 7, dtype=theano.config.floatX).reshape(3, 2),
        numpy.arange(11, 17, dtype=theano.config.floatX).reshape(3, 2)
    ]
    data_stream = IterableDataset(dict(X=data[0],
                                       Y=data[1])).get_example_stream()
    validator = DatasetEvaluator([
        CrossEntropy(requires=[X, Y], name="monitored_cross_entropy0"),
        # to test two same quantities and make sure that state will be reset
        CrossEntropy(requires=[X, Y], name="monitored_cross_entropy1"),
        CategoricalCrossEntropy().apply(X, Y),
    ])
    values = validator.evaluate(data_stream)
    # The monitored channel and the brick's auto-named channel must agree.
    numpy.testing.assert_allclose(values['monitored_cross_entropy1'],
                                  values['categoricalcrossentropy_apply_cost'])
def test_softmax_vector():
    """Softmax().apply + CategoricalCrossEntropy must agree with the fused,
    numerically stable Softmax.categorical_cross_entropy (integer labels)."""
    logits = tensor.matrix('x')
    labels = tensor.lvector('y')
    plain_cost = CategoricalCrossEntropy().apply(labels,
                                                 Softmax().apply(logits))
    fused_cost = Softmax().categorical_cross_entropy(labels, logits)
    f_plain = function([logits, labels], plain_cost)
    f_fused = function([logits, labels], fused_cost)
    n_examples = 100
    n_classes = 10
    rng = numpy.random.RandomState(1)
    logits_val = rng.randn(n_examples, n_classes).astype(theano.config.floatX)
    labels_val = rng.randint(low=0, high=n_classes, size=(n_examples))
    assert_allclose(f_plain(logits_val, labels_val),
                    f_fused(logits_val, labels_val))
def test_softmax_matrix():
    """Same check as the vector case, but with distributional targets given
    as a matrix of per-class probabilities."""
    logits = tensor.matrix('x')
    targets = tensor.matrix('y')
    plain_cost = CategoricalCrossEntropy().apply(targets,
                                                 Softmax().apply(logits))
    fused_cost = Softmax().categorical_cross_entropy(targets, logits)
    f_plain = function([logits, targets], plain_cost)
    f_fused = function([logits, targets], fused_cost)
    n_examples = 2
    n_classes = 2
    rng = numpy.random.RandomState(1)
    logits_val = rng.randn(n_examples, n_classes).astype(theano.config.floatX)
    raw = rng.uniform(size=(n_examples,
                            n_classes)).astype(theano.config.floatX)
    # Normalize each row so the targets form proper distributions.
    targets_val = raw / numpy.expand_dims(raw.sum(axis=1), axis=1)
    assert_allclose(f_plain(logits_val, targets_val),
                    f_fused(logits_val, targets_val),
                    rtol=1e-5)
def create_model(self, symbols_num = 500):
    """Build the attention-sum reader training graph.

    Returns (cost, accuracy, attention, y_hat, context, candidates,
    candidates_mask, y, context_mask, question, question_mask) as Theano
    variables.
    """
    # Hyperparameters
    # The dimension of the hidden state of the GRUs in each direction.
    hidden_states = self.args.encoder_hidden_dims
    # Dimension of the word-embedding space
    embedding_dims = self.args.source_embeddings_dim

    ###################
    # Declaration of the Theano variables that come from the data stream
    ###################

    # The context document.
    context_bt = tt.lmatrix('context')
    # Context document mask used to distinguish real symbols from the
    # sequence and padding symbols that are at the end
    context_mask_bt = tt.matrix('context_mask')

    # The question
    question_bt = tt.lmatrix('question')
    question_mask_bt = tt.matrix('question_mask')

    # The correct answer
    y = tt.lmatrix('answer')
    y = y[:,0] # originally answers are in a 2d matrix, here we convert it to a vector

    # The candidates among which the answer is selected
    candidates_bi = tt.lmatrix("candidates")
    candidates_bi_mask = tt.matrix("candidates_mask")

    ###################
    # Network's components
    ###################

    # Lookup table with randomly initialized word embeddings
    lookup = LookupTable(symbols_num, embedding_dims,
                         weights_init=Uniform(width=0.2))

    # bidirectional encoder that translates context
    context_encoder = self.create_bidi_encoder("context_encoder",
                                               embedding_dims, hidden_states)

    # bidirectional encoder for question
    question_encoder = self.create_bidi_encoder("question_encoder",
                                                embedding_dims, hidden_states)

    # Initialize the components (where not done upon creation)
    lookup.initialize()

    ###################
    # Wiring the components together
    #
    # Where present, the 3 letters at the end of the variable name identify
    # its dimensions:
    # b ... position of the example within the batch
    # t ... position of the word within the document/question
    # f ... features of the embedding vector
    ###################

    ### Read the context document
    # Map token indices to word embeddings
    context_embedding_tbf = lookup.apply(context_bt.T)

    # Read the embedded context document using the bidirectional GRU and
    # produce the contextual embedding of each word
    memory_encoded_btf = context_encoder.apply(
        context_embedding_tbf, context_mask_bt.T).dimshuffle(1,0,2)
    memory_encoded_btf.name = "memory_encoded_btf"

    ### Correspondingly, read the query
    x_embedded_tbf = lookup.apply(question_bt.T)
    x_encoded_btf = question_encoder.apply(
        x_embedded_tbf, question_mask_bt.T).dimshuffle(1,0,2)

    # The query encoding is a concatenation of the final states of the
    # forward and backward GRU encoder
    x_forward_encoded_bf = x_encoded_btf[:,-1,0:hidden_states]
    x_backward_encoded_bf = x_encoded_btf[:,0,hidden_states:hidden_states*2]
    query_representation_bf = tt.concatenate(
        [x_forward_encoded_bf,x_backward_encoded_bf],axis=1)

    # Compute the attention on each word in the context as a dot product of
    # its contextual embedding and the query
    mem_attention_presoft_bt = tt.batched_dot(
        query_representation_bf, memory_encoded_btf.dimshuffle(0,2,1))

    # TODO is this pre-masking necessary?
    mem_attention_presoft_masked_bt = tt.mul(mem_attention_presoft_bt,
                                             context_mask_bt)

    # Normalize the attention using softmax
    mem_attention_bt = SoftmaxWithMask(name="memory_query_softmax").apply(
        mem_attention_presoft_masked_bt,context_mask_bt)

    if self.args.weighted_att:
        # compute weighted attention over original word vectors
        att_weighted_responses_bf = theano.tensor.batched_dot(
            mem_attention_bt, context_embedding_tbf.dimshuffle(1,0,2))

        # compare desired response to all candidate responses
        # select relevant candidate answer words
        candidates_embeddings_bfi = lookup.apply(candidates_bi).dimshuffle(0,2,1)

        # convert it to output symbol probabilities
        y_hat_presoft = tt.batched_dot(att_weighted_responses_bf,
                                       candidates_embeddings_bfi)
        y_hat = SoftmaxWithMask(name="output_softmax").apply(
            y_hat_presoft,candidates_bi_mask)
    else:
        # Sum the attention of each candidate word across the whole context
        # document, this is the key innovation of the model
        # TODO: Get rid of sentence-by-sentence processing?
        # TODO: Rewrite into matrix notation instead of scans?
        def sum_prob_of_word(word_ix, sentence_ixs, sentence_attention_probs):
            # Total attention mass over every occurrence of ``word_ix``.
            word_ixs_in_sentence = tt.eq(sentence_ixs,word_ix).nonzero()[0]
            return sentence_attention_probs[word_ixs_in_sentence].sum()

        def sum_probs_single_sentence(candidate_indices_i, sentence_ixs_t,
                                      sentence_attention_probs_t):
            result, updates = theano.scan(
                fn=sum_prob_of_word,
                sequences=[candidate_indices_i],
                non_sequences=[sentence_ixs_t, sentence_attention_probs_t])
            return result

        def sum_probs_batch(candidate_indices_bt,sentence_ixs_bt,
                            sentence_attention_probs_bt):
            result, updates = theano.scan(
                fn=sum_probs_single_sentence,
                sequences=[candidate_indices_bt, sentence_ixs_bt,
                           sentence_attention_probs_bt],
                non_sequences=None)
            return result

        # Sum the attention of each candidate word across the whole context
        # document
        y_hat = sum_probs_batch(candidates_bi, context_bt, mem_attention_bt)
    y_hat.name = "y_hat"

    # We use the convention that ground truth is always at index 0, so the
    # following are the target answers
    y = y.zeros_like()

    # We use Cross Entropy as the training objective
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost.name = "cost"

    predicted_response_index = tt.argmax(y_hat,axis=1)
    accuracy = tt.eq(y,predicted_response_index).mean()
    accuracy.name = "accuracy"

    return cost, accuracy, mem_attention_bt, y_hat, context_bt, \
        candidates_bi, candidates_bi_mask, y, context_mask_bt, \
        question_bt, question_mask_bt
sampler = q_sampler, classifier=classifier_mlp) draw.initialize() #------------------------------------------------------------------------ x = tensor.matrix(u'features') y = tensor.lmatrix(u'targets') #y = theano.tensor.extra_ops.to_one_hot(tensor.lmatrix(u'targets'),2) #probs, h_enc, c_enc, i_dec, h_dec, c_dec, center_y, center_x, delta = draw.reconstruct(x) probs, h_enc, c_enc, center_y, center_x, delta = draw.reconstruct(x) trim_probs = probs[-1,:,:] #Only take information from the last iteration labels = y.flatten() #cost = BinaryCrossEntropy().apply(labels, trim_probs) cost = CategoricalCrossEntropy().apply(y, trim_probs) error_rate = MisclassificationRate().apply(labels, trim_probs) cost.name = "CCE" #------------------------------------------------------------ cg = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(cg.variables) algorithm = GradientDescent( cost=cost, parameters=params, step_rule=CompositeRule([ StepClipping(10.), Adam(learning_rate), ]), on_unused_sources='ignore',
def main(save_to, num_epochs):
    """Train a 784-100-10 MLP on MNIST (current Blocks/Fuel API) with L2
    regularization, per-batch checkpointing and optional live plotting."""
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)
    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    # L2 penalty on both weight matrices.
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'
    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  # ``save_separately`` writes the log alongside the main
                  # pickle; checkpointing after every batch is exhaustive
                  # but expensive.
                  Checkpoint(save_to, save_separately=['log'],
                             after_batch=True),
                  Printing()]
    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example', channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm']]))
    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)
    main_loop.run()

# Post-training analysis: load the separately saved training log into a
# pandas DataFrame (Python 2 pickle module).
import cPickle
import pandas
# NOTE(review): pickled files are normally opened in binary mode ('rb') --
# confirm whether this matters on the target platform.
with open('mnist_log.pkl') as f:
    log = cPickle.load(f)
data_frame = pandas.DataFrame.from_dict(log, orient='index')
weights_init=IsotropicGaussian(), biases_init=IsotropicGaussian(), prototype=input_mlp, ) parallel_nets.initialize() l_h, r_h = parallel_nets.apply(l_x=l_x, r_x=r_x) # Concatenate the inputs from the two hidden subnets into a single variable # for input into the next layer. merge = tensor.concatenate([l_h, r_h], axis=1) y_hat = output_mlp.apply(merge) # Define a cost function to optimize, and a classification error rate: # Also apply the outputs from the net, and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # Need to define the computation graph: graph = ComputationGraph(cost) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(graph.variables) # Add some regularization to this model: lam = 0.001 cost += lam * l2_norm(W) cost.name = 'entropy' # This is the model without dropout, but with l2 reg.
# In[5]: from blocks.bricks import Linear, Logistic, Softmax # In[10]: hidden_layer_size = 100 input_to_hidden = Linear(name='input_to_hidden', input_dim=117, output_dim=hidden_layer_size) h = Logistic().apply(input_to_hidden.apply(x)) hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_layer_size, output_dim=2) y_hat = Softmax().apply(hidden_to_output.apply(h)) y = tensor.lmatrix('targets') from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate cost = CategoricalCrossEntropy().apply(y, y_hat) error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat) error_rate.name = "error_rate" # >>> from blocks.roles import WEIGHT from blocks.graph import ComputationGraph # >>> from blocks.filter import VariableFilter cg = ComputationGraph(cost) # >>> W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) # >>> cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum() # >>> cost.name = 'cost_with_regularization' cost.name = 'cost_simple_xentropy' from blocks.initialization import IsotropicGaussian, Constant input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01) input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
def main(num_epochs=100):
    """Train a simple RNN language model on 'inspirational.txt' and print
    Gumbel-max samples from the model after each epoch."""
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    x_int = x.astype(dtype='int32').T
    train_dataset = TextFile('inspirational.txt')
    # Sort sentences by length so padded batches waste less computation.
    train_dataset.indexables[0] = numpy.array(sorted(
        train_dataset.indexables[0], key=len
    ))
    n_voc = len(train_dataset.dict.keys())
    # Empirical distribution of sentence-initial tokens, later used to seed
    # the free-running sampler.
    init_probs = numpy.array(
        [sum(filter(lambda idx:idx == w,
                    [s[0] for s in train_dataset.indexables[
                        train_dataset.sources.index('features')]]
                    )) for w in xrange(n_voc)],
        dtype=theano.config.floatX
    )
    init_probs = init_probs / init_probs.sum()
    n_h = 100
    linear_embedding = LookupTable(
        length=n_voc,
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    # NOTE(review): these LSTM-style biases are computed but never used --
    # the recurrence below is a SimpleRecurrent, not an LSTM.
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
    rnn = SimpleRecurrent(
        dim=n_h,
        activation=Tanh(),
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=n_voc,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()
    # Embed all tokens but the last; zero out padding via the shifted mask.
    embedding = (linear_embedding.apply(x_int[:-1])
                 * tensor.shape_padright(m.T[1:]))
    rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:])
    probs = softmax(
        sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0]
    )
    idx_mask = m.T[1:].nonzero()
    # Next-token cross entropy, evaluated only at non-padding positions.
    cost = CategoricalCrossEntropy().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    cost.name = 'cost'
    misclassification = MisclassificationRate().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    misclassification.name = 'misclassification'
    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=Adam()
    )
    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=train_dataset.num_examples,
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    model = Model(cost)
    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification], prefix='train', after_epoch=True))
    batch_size = 10
    length = 30
    # Gumbel-max trick: argmax(log p + Gumbel noise) draws a sample from p.
    trng = MRG_RandomStreams(18032015)
    u = trng.uniform(size=(length, batch_size, n_voc))
    gumbel_noise = -tensor.log(-tensor.log(u))
    init_samples = (tensor.log(init_probs).dimshuffle(('x', 0))
                    + gumbel_noise[0]).argmax(axis=-1)
    init_states = rnn.initial_state('states', batch_size)

    def sampling_step(g_noise, states, samples_step):
        # One step of free-running sampling: embed the previous sample,
        # advance the RNN, then Gumbel-max the next token.
        embedding_step = linear_embedding.apply(samples_step)
        next_states = rnn.apply(inputs=embedding_step,
                                states=states,
                                iterate=False)
        probs_step = softmax(score_layer.apply(next_states))
        next_samples = (tensor.log(probs_step) + g_noise).argmax(axis=-1)
        return next_states, next_samples

    [_, samples], _ = theano.scan(
        fn=sampling_step,
        sequences=[gumbel_noise[1:]],
        outputs_info=[init_states, init_samples]
    )
    sampling = theano.function([], samples.owner.inputs[0].T)
    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification']],
        titles=['Costs']))
    extensions.append(PlotManager('Language modelling example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())
    extensions.append(PrintSamples(sampler=sampling,
                                   voc=train_dataset.inv_dict))
    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def run(epochs=1, corpus="data/", HIDDEN_DIMS=100, path="./"):
    """Train a CBOW-style word-embedding model on the Brown corpus.

    Parameters
    ----------
    epochs : int
        Number of training epochs before the main loop stops.
    corpus : str
        Directory the ``BrownDataset`` is loaded from.
    HIDDEN_DIMS : int
        Dimensionality of the embedding (hidden) layer.
    path : str
        Prefix under which layer weights and the pickled computation
        graph are written.
    """
    brown = BrownDataset(corpus)

    INPUT_DIMS = brown.get_vocabulary_size()
    OUTPUT_DIMS = brown.get_vocabulary_size()

    # These are theano variables: a batch of context-word index rows and
    # the index of the target word for each row.
    x = tensor.lmatrix('context')
    y = tensor.ivector('output')

    # Construct the graph: embed every word in the context, then average
    # the embeddings over the context positions (axis=1).
    input_to_hidden = LookupTable(name='input_to_hidden', length=INPUT_DIMS,
                                  dim=HIDDEN_DIMS)
    h = tensor.mean(input_to_hidden.apply(x), axis=1)

    hidden_to_output = Linear(name='hidden_to_output', input_dim=HIDDEN_DIMS,
                              output_dim=OUTPUT_DIMS)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # Initialize with random values and set the bias vectors to 0.
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # Cross-entropy objective plus L2 regularization on both weight
    # matrices (pulled out of the graph by role).
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    # Sequential mini-batches of 512 examples per epoch.
    mini_batch = SequentialScheme(brown.num_instances(), 512)
    data_stream = DataStream.default_stream(brown, iteration_scheme=mini_batch)

    # Now we tie up loose ends and construct the algorithm for the
    # training and define what happens in the main loop.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    extensions = [
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs),
        Printing(),
        # TrainingDataMonitoring(variables=[cost]),
        SaveWeights(layers=[input_to_hidden, hidden_to_output],
                    prefixes=['%sfirst' % path, '%ssecond' % path]),
        # Plot(
        #     'Word Embeddings',
        #     channels=[['cost_with_regularization']]),
    ]

    logger.info("Starting main loop...")
    main = MainLoop(data_stream=data_stream,
                    algorithm=algorithm,
                    extensions=extensions)
    main.run()

    # FIX: the original `pickle.dump(cg, open(...))` leaked the file
    # handle; a context manager guarantees the pickle is flushed/closed.
    with open('%scg.pickle' % path, 'wb') as pickle_file:
        pickle.dump(cg, pickle_file)
def _create_main_loop(self): # hyper parameters hp = self.params batch_size = hp['batch_size'] biases_init = Constant(0) batch_normalize = hp['batch_normalize'] ### Build fprop tensor5 = T.TensorType(config.floatX, (False,)*5) X = tensor5("images") #X = T.tensor4("images") y = T.lvector('targets') gnet_params = OrderedDict() #X_shuffled = X[:, :, :, :, [2, 1, 0]] #X_shuffled = gpu_contiguous(X.dimshuffle(0, 1, 4, 2, 3)) * 255 X = X[:, :, :, :, [2, 1, 0]] X_shuffled = X.dimshuffle((0, 1, 4, 2, 3)) * 255 X_r = X_shuffled.reshape((X_shuffled.shape[0], X_shuffled.shape[1]*X_shuffled.shape[2], X_shuffled.shape[3], X_shuffled.shape[4])) X_r = X_r - (np.array([104, 117, 123])[None, :, None, None]).astype('float32') expressions, input_data, param = stream_layer_exp(inputs = ('data', X_r), mode='rgb') res = expressions['outloss'] y_hat = res.flatten(ndim=2) import pdb; pdb.set_trace() ### Build Cost cost = CategoricalCrossEntropy().apply(y, y_hat) cost = T.cast(cost, theano.config.floatX) cost.name = 'cross_entropy' y_pred = T.argmax(y_hat, axis=1) misclass = T.cast(T.mean(T.neq(y_pred, y)), theano.config.floatX) misclass.name = 'misclass' monitored_channels = [] monitored_quantities = [cost, misclass, y_hat, y_pred] model = Model(cost) training_cg = ComputationGraph(monitored_quantities) inference_cg = ComputationGraph(monitored_quantities) ### Get evaluation function #training_eval = training_cg.get_theano_function(additional_updates=bn_updates) training_eval = training_cg.get_theano_function() #inference_eval = inference_cg.get_theano_function() # Dataset test = JpegHDF5Dataset('test', #name='jpeg_data_flows.hdf5', load_in_memory=True) #mean = np.load(os.path.join(os.environ['UCF101'], 'mean.npy')) import pdb; pdb.set_trace() ### Eval labels = np.zeros(test.num_video_examples) y_hat = np.zeros((test.num_video_examples, 101)) labels_flip = np.zeros(test.num_video_examples) y_hat_flip = np.zeros((test.num_video_examples, 101)) ### Important to shuffle list for batch 
normalization statistic #rng = np.random.RandomState() #examples_list = range(test.num_video_examples) #import pdb; pdb.set_trace() #rng.shuffle(examples_list) nb_frames=1 for i in xrange(24): scheme = HDF5SeqScheme(test.video_indexes, examples=test.num_video_examples, batch_size=batch_size, f_subsample=i, nb_subsample=25, frames_per_video=nb_frames) #for crop in ['upleft', 'upright', 'downleft', 'downright', 'center']: for crop in ['center']: stream = JpegHDF5Transformer( input_size=(240, 320), crop_size=(224, 224), #input_size=(256, 342), crop_size=(224, 224), crop_type=crop, translate_labels = True, flip='noflip', nb_frames = nb_frames, data_stream=ForceFloatX(DataStream( dataset=test, iteration_scheme=scheme))) stream_flip = JpegHDF5Transformer( input_size=(240, 320), crop_size=(224, 224), #input_size=(256, 342), crop_size=(224, 224), crop_type=crop, translate_labels = True, flip='flip', nb_frames = nb_frames, data_stream=ForceFloatX(DataStream( dataset=test, iteration_scheme=scheme))) ## Do the evaluation epoch = stream.get_epoch_iterator() for j, batch in enumerate(epoch): output = training_eval(batch[0], batch[1]) # import cv2 # cv2.imshow('img', batch[0][0, 0, :, :, :]) # cv2.waitKey(160) # cv2.destroyAllWindows() #import pdb; pdb.set_trace() labels_flip[batch_size*j:batch_size*(j+1)] = batch[1] y_hat_flip[batch_size*j:batch_size*(j+1), :] += output[2] preds = y_hat_flip.argmax(axis=1) misclass = np.sum(labels_flip != preds) / float(len(preds)) print i, crop, "flip Misclass:", misclass epoch = stream_flip.get_epoch_iterator() for j, batch in enumerate(epoch): output = training_eval(batch[0], batch[1]) labels[batch_size*j:batch_size*(j+1)] = batch[1] y_hat[batch_size*j:batch_size*(j+1), :] += output[2] preds = y_hat.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) print i, crop, "noflip Misclass:", misclass y_merge = y_hat + y_hat_flip preds = y_merge.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) print i, crop, 
"avg Misclass:", misclass ### Compute misclass y_hat += y_hat_flip preds = y_hat.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) print "Misclass:", misclass
def shroom_mlp(shrooms_train, shrooms_test, num_epochs, hidden_dims,
               activation_function):
    """Train a one-hidden-layer MLP mushroom classifier (2 output classes).

    Parameters
    ----------
    shrooms_train, shrooms_test
        Fuel-style datasets exposing ``num_examples`` -- presumably with
        117 binary/one-hot features per example (matches ``input_dim``
        below); TODO confirm against the dataset definition.
    num_epochs : int
        Number of epochs before the main loop finishes.
    hidden_dims : int
        Size of the single hidden layer.
    activation_function
        A Blocks brick (e.g. ``Rectifier()``) applied to the hidden layer.

    Returns
    -------
    int
        Always 0.
    """
    # These are theano variables
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    # Construct the graph: 117 inputs -> hidden_dims -> 2-way softmax.
    input_to_hidden = Linear(name='input_to_hidden', input_dim=117,
                             output_dim=hidden_dims)
    h = activation_function.apply(input_to_hidden.apply(x))
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_dims,
                              output_dim=2)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # And initialize with random variables and set the bias vector to 0
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # And now the cost function
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)
    # L2-regularize both weight matrices (this IS used, despite what an
    # earlier comment here claimed).
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    # argmax turns the one-hot target matrix into class indices.
    error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)

    # The data streams give us access to our corpus and allow us to perform a
    # mini-batch training.
    data_stream = Flatten(DataStream.default_stream(
        shrooms_train,
        iteration_scheme=SequentialScheme(shrooms_train.num_examples,
                                          batch_size=128)))
    test_data_stream = Flatten(DataStream.default_stream(
        shrooms_test,
        iteration_scheme=SequentialScheme(shrooms_test.num_examples,
                                          batch_size=1000)))

    extensions = [
        ProgressBar(),
        PlotWeights(after_epoch=True,
                    folder="results_logistic_40_interpolation",
                    computation_graph=cg,
                    folder_per_layer=True,
                    dpi=150),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring(variables=[cost, error_rate],
                             data_stream=test_data_stream,
                             prefix="test"),
        Printing()
    ]

    # Now we tie up loose ends and construct the algorithm for the training
    # and define what happens in the main loop.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    main = MainLoop(data_stream=data_stream,
                    algorithm=algorithm,
                    extensions=extensions)
    main.run()

    return 0
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs, test_cost, experiment_path, initialization, init_width, weight_noise, z_prob, z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len, decrease_lr_after_epoch, lr_decay, **kwargs): print '.. PTB experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() ########################################### # # LOAD DATA # ########################################### def onehot(x, numclasses=None): """ Convert integer encoding for class-labels (starting with 0 !) to one-hot encoding. The output is an array whose shape is the shape of the input array plus an extra dimension, containing the 'one-hot'-encoded labels. """ if x.shape == (): x = x[None] if numclasses is None: numclasses = x.max() + 1 result = numpy.zeros(list(x.shape) + [numclasses], dtype="int") z = numpy.zeros(x.shape, dtype="int") for c in range(numclasses): z *= 0 z[numpy.where(x == c)] = 1 result[..., c] += z return result.astype(theano.config.floatX) alphabetsize = 10000 data = np.load('penntree_char_and_word.npz') trainset = data['train_words'] validset = data['valid_words'] testset = data['test_words'] if testing: trainset = trainset[:3000] validset = validset[:3000] if share_mask: if not z_prob: raise ValueError('z_prob must be provided when using share_mask') if z_prob_cells or z_prob_states: raise ValueError('z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)') z_prob_cells = z_prob # we don't want to actually use these masks, so this is to debug z_prob_states = None else: if z_prob: raise ValueError('z_prob is only used with share_mask') z_prob_cells = z_prob_cells or '1' z_prob_states = z_prob_states or '1' # rng = np.random.RandomState(seed) ########################################### # # MAKE STREAMS # ########################################### def 
prep_dataset(dataset): dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))] dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2)) stream = DataStream(IndexableDataset(indexables=OrderedDict([ ('data', dataset)])), iteration_scheme=SequentialExampleScheme(dataset.shape[0])) stream = Transpose(stream, [(1, 0)]) stream = SampleDropsNPWord( stream, z_prob_states, z_prob_cells, drop_prob_igates, layer_size, num_layers, False, stoch_depth, share_mask, gaussian_drop, alphabetsize) stream.sources = ('data',) * 3 + stream.sources + ('zoneouts_states', 'zoneouts_cells', 'zoneouts_igates') return (stream,) train_stream, = prep_dataset(trainset) valid_stream, = prep_dataset(validset) test_stream, = prep_dataset(testset) #################### data = train_stream.get_epoch_iterator(as_dict=True).next() #################### ########################################### # # BUILD MODEL # ########################################### print '.. building model' x = T.tensor3('data') y = x zoneouts_states = T.tensor3('zoneouts_states') zoneouts_cells = T.tensor3('zoneouts_cells') zoneouts_igates = T.tensor3('zoneouts_igates') x.tag.test_value = data['data'] zoneouts_states.tag.test_value = data['zoneouts_states'] zoneouts_cells.tag.test_value = data['zoneouts_cells'] zoneouts_igates.tag.test_value = data['zoneouts_igates'] if init_width and not initialization == 'uniform': raise ValueError('Width is only for uniform init, whassup?') if initialization == 'glorot': weights_init = NormalizedInitialization() elif initialization == 'uniform': weights_init = Uniform(width=init_width) elif initialization == 'ortho': weights_init = OrthogonalInitialization() else: raise ValueError('No such initialization') if rnn_type.lower() == 'lstm': in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*4, name='in_to_hid%d'%l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)] recurrent_layers = [DropLSTM(dim=layer_size, 
weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn%d'%l, ogates_zoneout=ogates_zoneout) for l in range(num_layers)] elif rnn_type.lower() == 'gru': in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*3, name='in_to_hid%d'%l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)] recurrent_layers = [DropGRU(dim=layer_size, weights_init=weights_init, activation=Tanh(), name='rnn%d'%l) for l in range(num_layers)] elif rnn_type.lower() == 'srnn': # FIXME!!! make ReLU in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size, name='in_to_hid%d'%l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)] recurrent_layers = [DropSimpleRecurrent(dim=layer_size, weights_init=weights_init, activation=Rectifier(), name='rnn%d'%l) for l in range(num_layers)] else: raise NotImplementedError hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out', weights_init=weights_init, biases_init=Constant(0.0)) for layer in in_to_hids: layer.initialize() for layer in recurrent_layers: layer.initialize() hid_to_out.initialize() layer_input = x #in_to_hid.apply(x) init_updates = OrderedDict() for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)): rnn_embedding = in_to_hid.apply(layer_input) if rnn_type.lower() == 'lstm': states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX)) cells_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name, cells_init.name = "states_init", "cells_init" states, cells = layer.apply(rnn_embedding, zoneouts_states[:, :, l * layer_size : (l + 1) * layer_size], zoneouts_cells[:, :, l * layer_size : (l + 1) * layer_size], zoneouts_igates[:, :, l * layer_size : (l + 1) * layer_size], states_init, cells_init) init_updates.update([(states_init, states[-1]), (cells_init, cells[-1])]) elif rnn_type.lower() in ['gru', 'srnn']: # untested! 
states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name = "states_init" states = layer.apply(rnn_embedding, zoneouts_states, zoneouts_igates, states_init) init_updates.update([(states_init, states[-1])]) else: raise NotImplementedError layer_input = states y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1])) shape_ = y_hat_pre_softmax.shape y_hat = Softmax().apply( y_hat_pre_softmax.reshape((-1, alphabetsize))) #################### ########################################### # # SET UP COSTS AND MONITORS # ########################################### cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)), y_hat).copy('cost') bpc = (cost/np.log(2.0)).copy(name='bpr') perp = T.exp(cost).copy(name='perp') cost_train = cost.copy(name='train_cost') cg_train = ComputationGraph([cost_train]) ########################################### # # NORM STABILIZER # ########################################### norm_cost = 0. def _magnitude(x, axis=-1): return T.sqrt(T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny)) if penalty == 'cells': assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables) for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables): norms = _magnitude(cell) norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) elif penalty == 'hids': for l in range(num_layers): assert 'rnn%d_apply_states'%l in [o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)] for output in VariableFilter(roles=[OUTPUT])(cg_train.variables): for l in range(num_layers): if output.name == 'rnn%d_apply_states'%l: norms = _magnitude(output) norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) norm_cost.name = 'norm_cost' #cost_valid = cost_train cost_train += norm_cost_coeff * norm_cost cost_train = cost_train.copy('cost_train') #should this be cost_train.outputs[0]? no. 
cg_train = ComputationGraph([cost_train]) ########################################### # # WEIGHT NOISE # ########################################### if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) cg_train = apply_noise(cg_train, weights, weight_noise) cost_train = cg_train.outputs[0].copy(name='cost_train') model = Model(cost_train) learning_rate = float(learning_rate) clipping = StepClipping(threshold=np.cast[floatX](clipping)) if algorithm == 'adam': adam = Adam(learning_rate=learning_rate) learning_rate = adam.learning_rate step_rule = CompositeRule([adam, clipping]) elif algorithm == 'rms_prop': rms_prop = RMSProp(learning_rate=learning_rate) learning_rate = rms_prop.learning_rate step_rule = CompositeRule([clipping, rms_prop]) elif algorithm == 'momentum': sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) learning_rate = sgd_momentum.learning_rate step_rule = CompositeRule([clipping, sgd_momentum]) elif algorithm == 'sgd': sgd = Scale(learning_rate=learning_rate) learning_rate = sgd.learning_rate step_rule = CompositeRule([clipping, sgd]) else: raise NotImplementedError algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters) # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)}) algorithm.add_updates(init_updates) def cond_number(x): _, _, sing_vals = T.nlinalg.svd(x, True, True) sing_mags = abs(sing_vals) return T.max(sing_mags) / T.min(sing_mags) def rms(x): return (x*x).mean().sqrt() whysplode_cond = [] whysplode_rms = [] for i, p in enumerate(init_updates): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) for i, p in enumerate(cg_train.parameters): v = p.get_value() if p.get_value().shape == 2: 
whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) observed_vars = [cost_train, cost, bpc, perp, learning_rate, aggregation.mean(algorithm.total_gradient_norm).copy("gradient_norm_mean")] # + whysplode_rms parameters = model.get_parameter_dict() for name, param in parameters.iteritems(): observed_vars.append(param.norm(2).copy(name=name + "_norm")) observed_vars.append( algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm")) train_monitor = TrainingDataMonitoring( variables=observed_vars, prefix="train", after_epoch=True ) dev_inits = [p.clone() for p in init_updates] cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), dev_inits)) dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3] dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:])) dev_monitor = DataStreamMonitoring( variables=[dev_cost, dev_bpc, dev_perp], data_stream=valid_stream, prefix="dev", updates=dev_init_updates ) # noone does this if 'load_path' in kwargs: with open(kwargs['load_path']) as f: loaded = np.load(f) model = Model(cost_train) params_dicts = model.get_parameter_dict() params_names = params_dicts.keys() for param_name in params_names: param = params_dicts[param_name] # '/f_6_.W' --> 'f_6_.W' slash_index = param_name.find('/') param_name = param_name[slash_index + 1:] if param.get_value().shape == loaded[param_name].shape: print 'Found: ' + param_name param.set_value(loaded[param_name]) else: print 'Not found: ' + param_name extensions = [] extensions.extend([FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor]) if test_cost: test_inits = [p.clone() for p in init_updates] cg_test = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), test_inits)) test_cost, test_bpc, test_perp = cg_test.outputs[:3] 
test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:])) test_monitor = DataStreamMonitoring( variables=[test_cost, test_bpc, test_perp], data_stream=test_stream, prefix="test", updates=test_init_updates ) extensions.extend([test_monitor]) if not os.path.exists(experiment_path): os.makedirs(experiment_path) log_path = os.path.join(experiment_path, 'log.txt') fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) extensions.append(SaveParams('dev_cost', model, experiment_path, every_n_epochs=1)) extensions.append(SaveLog(every_n_epochs=1)) extensions.append(ProgressBar()) extensions.append(Printing()) class RollsExtension(TrainingExtension): """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """ def __init__(self, shvars): self.shvars = shvars def before_epoch(self): for v in self.shvars: v.set_value(np.roll(v.get_value(), 1, 0)) extensions.append(RollsExtension(init_updates.keys() + dev_init_updates.keys() + (test_init_updates.keys() if test_cost else []))) class LearningRateSchedule(TrainingExtension): """ Lets you set a number to divide learning rate by each epoch + when to start doing that """ def __init__(self): self.epoch_number = 0 def after_epoch(self): self.epoch_number += 1 if self.epoch_number > decrease_lr_after_epoch: learning_rate.set_value(learning_rate.get_value()/lr_decay) if bool(lr_decay) != bool(decrease_lr_after_epoch): raise ValueError('Need to define both lr_decay and decrease_lr_after_epoch') if lr_decay and decrease_lr_after_epoch: extensions.append(LearningRateSchedule()) main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) t1 = time.time() print "Building time: %f" % (t1 - t0) main_loop.run() print "Execution time: %f" % (time.time() - t1)
def main(job_id, params): config = ConfigParser.ConfigParser() config.readfp(open('./params')) max_epoch = int(config.get('hyperparams', 'max_iter', 100)) base_lr = float(config.get('hyperparams', 'base_lr', 0.01)) train_batch = int(config.get('hyperparams', 'train_batch', 256)) valid_batch = int(config.get('hyperparams', 'valid_batch', 512)) test_batch = int(config.get('hyperparams', 'valid_batch', 512)) hidden_units = int(config.get('hyperparams', 'hidden_units', 16)) W_sd = float(config.get('hyperparams', 'W_sd', 0.01)) W_mu = float(config.get('hyperparams', 'W_mu', 0.0)) b_sd = float(config.get('hyperparams', 'b_sd', 0.01)) b_mu = float(config.get('hyperparams', 'b_mu', 0.0)) dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2)) weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001)) max_norm = float(config.get('hyperparams', 'max_norm', 100.0)) solver = config.get('hyperparams', 'solver_type', 'rmsprop') data_file = config.get('hyperparams', 'data_file') fine_tune = config.getboolean('hyperparams', 'fine_tune') # Spearmint optimization parameters: if params: base_lr = float(params['base_lr'][0]) dropout_ratio = float(params['dropout_ratio'][0]) hidden_units = params['hidden_units'][0] weight_decay = params['weight_decay'][0] if 'adagrad' in solver: solver_type = CompositeRule([AdaGrad(learning_rate=base_lr), VariableClipping(threshold=max_norm)]) else: solver_type = CompositeRule([RMSProp(learning_rate=base_lr), VariableClipping(threshold=max_norm)]) rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13' ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45' right_dim = 10519 left_dim = 11427 train = H5PYDataset(data_file, which_set='train') valid = H5PYDataset(data_file, which_set='valid') test = H5PYDataset(data_file, which_set='test') l_x = tensor.matrix('l_features') r_x = tensor.matrix('r_features') y = 
tensor.lmatrix('targets') lnet = load(ln_file).model.get_top_bricks()[0] rnet = load(rn_file).model.get_top_bricks()[0] # Pre-trained layers: # Inputs -> hidden_1 -> hidden 2 for side, net in zip(['l', 'r'], [lnet, rnet]): for child in net.children: child.name = side + '_' + child.name ll1 = lnet.children[0] lr1 = lnet.children[1] ll2 = lnet.children[2] lr2 = lnet.children[3] rl1 = rnet.children[0] rr1 = rnet.children[1] rl2 = rnet.children[2] rr2 = rnet.children[3] l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x)))) r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x)))) input_dim = ll2.output_dim + rl2.output_dim # hidden_2 -> hidden_3 -> hidden_4 -> Logistic output output_mlp = MLP(activations=[ Rectifier(name='h3'), Rectifier(name='h4'), Softmax(name='output'), ], dims=[ input_dim, hidden_units, hidden_units, 2, ], weights_init=IsotropicGaussian(std=W_sd, mean=W_mu), biases_init=IsotropicGaussian(std=W_sd, mean=W_mu)) output_mlp.initialize() # # Concatenate the inputs from the two hidden subnets into a single variable # # for input into the next layer. merge = tensor.concatenate([l_h, r_h], axis=1) # y_hat = output_mlp.apply(merge) # Define a cost function to optimize, and a classification error rate. 
# Also apply the outputs from the net and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # This is the model: before applying dropout model = Model(cost) # Need to define the computation graph for the cost func: cost_graph = ComputationGraph([cost]) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(cost_graph.variables) # Add some regularization to this model: cost += weight_decay * l2_norm(W) cost.name = 'entropy' # computational graph with l2 reg cost_graph = ComputationGraph([cost]) # Apply dropout to inputs: inputs = VariableFilter([INPUT])(cost_graph.variables) dropout_inputs = [input for input in inputs if input.name.startswith('linear_')] dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2) dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:], dropout_ratio) dropout_cost = dropout_graph.outputs[0] dropout_cost.name = 'dropout_entropy' # If no fine-tuning of l-r models is wanted, find the params for only # the joint layers: if fine_tune: params_to_update = dropout_graph.parameters else: params_to_update = VariableFilter([PARAMETER], bricks=output_mlp.children)(cost_graph) # Learning Algorithm: algo = GradientDescent( step_rule=solver_type, params=params_to_update, cost=dropout_cost) # algo.step_rule.learning_rate.name = 'learning_rate' # Data stream used for training model: training_stream = Flatten( DataStream.default_stream( dataset=train, iteration_scheme=ShuffledScheme( train.num_examples, batch_size=train_batch))) training_monitor = TrainingDataMonitoring([dropout_cost, aggregation.mean(error), aggregation.mean(algo.total_gradient_norm)], after_batch=True) # Use the 'valid' set for validation during training: validation_stream = Flatten( DataStream.default_stream( dataset=valid, iteration_scheme=ShuffledScheme( valid.num_examples, batch_size=valid_batch))) validation_monitor = 
DataStreamMonitoring( variables=[cost, error], data_stream=validation_stream, prefix='validation', after_epoch=True) test_stream = Flatten( DataStream.default_stream( dataset=test, iteration_scheme=ShuffledScheme( test.num_examples, batch_size=test_batch))) test_monitor = DataStreamMonitoring( variables=[error], data_stream=test_stream, prefix='test', after_training=True) plotting = Plot('AdniNet_LeftRight', channels=[ ['dropout_entropy'], ['error', 'validation_error'], ], ) # Checkpoint class used to save model and log: stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M') checkpoint = Checkpoint('./models/{}'.format(stamp), save_separately=['model', 'log'], every_n_epochs=1) # The main loop will train the network and output reports, etc main_loop = MainLoop( data_stream=training_stream, model=model, algorithm=algo, extensions=[ validation_monitor, training_monitor, plotting, FinishAfter(after_n_epochs=max_epoch), FinishIfNoImprovementAfter(notification_name='validation_error', epochs=1), Printing(), ProgressBar(), checkpoint, test_monitor, ]) main_loop.run() ve = float(main_loop.log.last_epoch_row['validation_error']) te = float(main_loop.log.last_epoch_row['error']) spearmint_loss = ve + abs(te - ve) print 'Spearmint Loss: {}'.format(spearmint_loss) return spearmint_loss
def main():
    """Train the tutorial MNIST MLP while syncing parameters with legion.

    This is behaviourally the standard Blocks MNIST example, with one
    addition: a ``SharedParamsRateLimited`` extension that pushes the
    weight and bias shared variables to legion every N batches, where N
    comes from the single positional command-line argument.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("every_n_batches", type=int, default=[1], nargs=1)
    args = parser.parse_args()
    sync_period = args.every_n_batches[0]
    print("We were asked to sync with legion at every_n_batches = %s"
          % str(sync_period))

    # The rest is a copy paste from the blocks tutorial, except for the
    # inclusion of the sync extension at the creation of the MainLoop
    # blocks object.
    features = tensor.matrix('features')
    layer_one = Linear(name='input_to_hidden', input_dim=784, output_dim=100)
    hidden = Rectifier().apply(layer_one.apply(features))
    layer_two = Linear(name='hidden_to_output', input_dim=100, output_dim=10)
    predictions = Softmax().apply(layer_two.apply(hidden))

    targets = tensor.lmatrix('targets')
    cost = CategoricalCrossEntropy().apply(targets.flatten(), predictions)
    cg = ComputationGraph(cost)
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    # Gaussian weights, zero biases for both layers.
    layer_one.weights_init = layer_two.weights_init = IsotropicGaussian(0.01)
    layer_one.biases_init = layer_two.biases_init = Constant(0)
    layer_one.initialize()
    layer_two.initialize()

    train_set = MNIST(("train",))
    train_stream = Flatten(
        DataStream.default_stream(
            train_set,
            iteration_scheme=SequentialScheme(train_set.num_examples,
                                              batch_size=256)))

    algorithm = GradientDescent(
        cost=cost,
        params=cg.parameters,
        step_rule=Scale(learning_rate=0.1)
    )

    test_set = MNIST(("test",))
    test_stream = Flatten(
        DataStream.default_stream(
            test_set,
            iteration_scheme=SequentialScheme(test_set.num_examples,
                                              batch_size=1024)))
    monitor = DataStreamMonitoring(variables=[cost],
                                   data_stream=test_stream,
                                   prefix="test")

    # Except for this line: grab the bias parameters too, so all four
    # parameter arrays can be shared with legion.
    b1, b2 = VariableFilter(roles=[BIAS])(cg.variables)
    sync_extension = SharedParamsRateLimited(
        params={"W1": W1,
                "W2": W2,
                "b1": b1,
                "b2": b2},
        alpha=.5,
        beta=.5,
        every_n_batches=sync_period,
        maximum_rate=0.1)

    main_loop = MainLoop(
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[monitor,
                    FinishAfter(after_n_epochs=500),
                    Printing(),
                    sync_extension])
    main_loop.run()
def build_submodel(input_shape,
                   output_dim,
                   L_dim_conv_layers,
                   L_filter_size,
                   L_pool_size,
                   L_activation_conv,
                   L_dim_full_layers,
                   L_activation_full,
                   L_exo_dropout_conv_layers,
                   L_exo_dropout_full_layers,
                   L_endo_dropout_conv_layers,
                   L_endo_dropout_full_layers,
                   L_border_mode=None,
                   L_filter_step=None,
                   L_pool_step=None):
    """Build a convnet + MLP classification graph with Blocks bricks.

    input_shape: (num_channels, height, width) of the input images.
    output_dim: number of output classes.
    The L_* lists describe, per layer, the conv/pool geometry, activations,
    and two kinds of dropout:
      - "exo" dropout shrinks layer widths up front (units are removed from
        the architecture itself, via dim - int(dim*rate));
      - "endo" dropout is applied to the computation graph at the end with
        apply_dropout.
    Returns (cg, error_rate, cost, D_params, D_kind).
    """
    # TO DO : target size and name of the features
    x = T.tensor4('features')
    y = T.imatrix('targets')

    assert len(input_shape) == 3, "input_shape must be a 3d tensor"

    num_channels = input_shape[0]
    image_size = tuple(input_shape[1:])
    print image_size
    print num_channels
    prediction = output_dim  # final number of classes, before output_dim is reused below

    # CONVOLUTION
    output_conv = x
    output_dim = num_channels * np.prod(image_size)  # running flat dimension
    conv_layers = []
    assert len(L_dim_conv_layers) == len(L_filter_size)
    if L_filter_step is None:
        L_filter_step = [None] * len(L_dim_conv_layers)
    assert len(L_dim_conv_layers) == len(L_pool_size)
    if L_pool_step is None:
        L_pool_step = [None] * len(L_dim_conv_layers)
    assert len(L_dim_conv_layers) == len(L_pool_step)
    assert len(L_dim_conv_layers) == len(L_activation_conv)
    if L_border_mode is None:
        L_border_mode = ["valid"] * len(L_dim_conv_layers)
    assert len(L_dim_conv_layers) == len(L_border_mode)
    assert len(L_dim_conv_layers) == len(L_endo_dropout_conv_layers)
    assert len(L_dim_conv_layers) == len(L_exo_dropout_conv_layers)

    # regarding the batch dropout : the dropout is applied on the filter
    # which is equivalent to the output dimension
    # you have to look at the dropout_rate of the next layer
    # that is why we need to have the first dropout value of L_exo_dropout_full_layers
    # the first value has to be 0.0 in this context, and we'll
    # assume that it is, but let's have an assert
    assert L_exo_dropout_conv_layers[0] == 0.0, "L_exo_dropout_conv_layers[0] has to be 0.0 in this context. There are ways to make it work, of course, but we don't support this with this scripts."

    # here modifitication of L_exo_dropout_conv_layers
    # shift left by one so each layer sees the NEXT layer's exo-dropout rate
    L_exo_dropout_conv_layers = L_exo_dropout_conv_layers[1:] + [L_exo_dropout_full_layers[0]]

    if len(L_dim_conv_layers):
        for (num_filters, filter_size, filter_step,
             pool_size, pool_step, activation_str, border_mode,
             dropout, index) in zip(L_dim_conv_layers,
                                    L_filter_size,
                                    L_filter_step,
                                    L_pool_size,
                                    L_pool_step,
                                    L_activation_conv,
                                    L_border_mode,
                                    L_exo_dropout_conv_layers,
                                    xrange(len(L_dim_conv_layers))
                                    ):

            # convert filter_size and pool_size in tuple
            filter_size = tuple(filter_size)

            if filter_step is None:
                filter_step = (1, 1)
            else:
                filter_step = tuple(filter_step)

            # (0, 0) is the sentinel for "no pooling on this layer"
            if pool_size is None:
                pool_size = (0, 0)
            else:
                pool_size = tuple(pool_size)

            # TO DO : leaky relu
            if activation_str.lower() == 'rectifier':
                activation = Rectifier().apply
            elif activation_str.lower() == 'tanh':
                activation = Tanh().apply
            elif activation_str.lower() in ['sigmoid', 'logistic']:
                activation = Logistic().apply
            elif activation_str.lower() in ['id', 'identity']:
                activation = Identity().apply
            else:
                # NOTE(review): the tuple here is passed as Exception args, not
                # %-formatted; message will read oddly — consider `% activation_str`.
                raise Exception("unknown activation function : %s", activation_str)

            assert 0.0 <= dropout and dropout < 1.0
            # exo dropout: shrink the layer width instead of masking at runtime
            num_filters = num_filters - int(num_filters * dropout)

            print "border_mode : %s" % border_mode

            # filter_step
            # http://blocks.readthedocs.org/en/latest/api/bricks.html#module-blocks.bricks.conv
            kwargs = {}
            if filter_step is None or filter_step == (1, 1):
                pass
            else:
                # there's a bit of a mix of names because `Convolutional` takes
                # a "step" argument, but `ConvolutionActivation` takes "conv_step" argument
                kwargs['conv_step'] = filter_step

            if (pool_size[0] == 0 and pool_size[1] == 0):
                # no pooling: conv + activation only
                layer_conv = ConvolutionalActivation(activation=activation,
                                                     filter_size=filter_size,
                                                     num_filters=num_filters,
                                                     border_mode=border_mode,
                                                     name="layer_%d" % index,
                                                     **kwargs)
            else:
                if pool_step is None:
                    pass
                else:
                    kwargs['pooling_step'] = tuple(pool_step)

                layer_conv = ConvolutionalLayer(activation=activation,
                                                filter_size=filter_size,
                                                num_filters=num_filters,
                                                border_mode=border_mode,
                                                pooling_size=pool_size,
                                                name="layer_%d" % index,
                                                **kwargs)

            conv_layers.append(layer_conv)

        convnet = ConvolutionalSequence(conv_layers, num_channels=num_channels,
                                        image_size=image_size,
                                        weights_init=Uniform(width=0.1),
                                        biases_init=Constant(0.0),
                                        name="conv_section")
        convnet.push_allocation_config()
        convnet.initialize()
        output_dim = np.prod(convnet.get_dim('output'))
        output_conv = convnet.apply(output_conv)
        output_conv = Flattener().apply(output_conv)

    # FULLY CONNECTED
    output_mlp = output_conv
    full_layers = []
    assert len(L_dim_full_layers) == len(L_activation_full)
    assert len(L_dim_full_layers) + 1 == len(L_endo_dropout_full_layers)
    assert len(L_dim_full_layers) + 1 == len(L_exo_dropout_full_layers)

    # reguarding the batch dropout : the dropout is applied on the filter
    # which is equivalent to the output dimension
    # you have to look at the dropout_rate of the next layer
    # that is why we throw away the first value of L_exo_dropout_full_layers
    L_exo_dropout_full_layers = L_exo_dropout_full_layers[1:]

    pre_dim = output_dim
    print "When constructing the model, the output_dim of the conv section is %d." % output_dim
    if len(L_dim_full_layers):
        for (dim, activation_str,
             dropout, index) in zip(L_dim_full_layers,
                                    L_activation_full,
                                    L_exo_dropout_full_layers,
                                    range(len(L_dim_conv_layers),
                                          len(L_dim_conv_layers)
                                          + len(L_dim_full_layers))
                                    ):
            # layer indices continue after the conv layers so names stay unique

            # TO DO : leaky relu
            if activation_str.lower() == 'rectifier':
                activation = Rectifier().apply
            elif activation_str.lower() == 'tanh':
                activation = Tanh().apply
            elif activation_str.lower() in ['sigmoid', 'logistic']:
                activation = Logistic().apply
            elif activation_str.lower() in ['id', 'identity']:
                activation = Identity().apply
            else:
                raise Exception("unknown activation function : %s", activation_str)

            assert 0.0 <= dropout and dropout < 1.0
            # exo dropout: shrink the layer width
            dim = dim - int(dim * dropout)
            print "When constructing the fully-connected section, we apply dropout %f to add an MLP going from pre_dim %d to dim %d." % (dropout, pre_dim, dim)

            layer_full = MLP(activations=[activation], dims=[pre_dim, dim],
                             weights_init=Uniform(width=0.1),
                             biases_init=Constant(0.0),
                             name="layer_%d" % index)
            layer_full.initialize()
            full_layers.append(layer_full)
            pre_dim = dim

        for layer in full_layers:
            output_mlp = layer.apply(output_mlp)

        output_dim = L_dim_full_layers[-1] - int(L_dim_full_layers[-1] * L_exo_dropout_full_layers[-1])

    # COST FUNCTION
    output_layer = Linear(output_dim, prediction,
                          weights_init=Uniform(width=0.1),
                          biases_init=Constant(0.0),
                          name="layer_" + str(len(L_dim_conv_layers)
                                              + len(L_dim_full_layers))
                          )
    output_layer.initialize()
    full_layers.append(output_layer)
    y_pred = output_layer.apply(output_mlp)
    y_hat = Softmax().apply(y_pred)
    # SOFTMAX and log likelihood
    y_pred = Softmax().apply(y_pred)
    # be careful. one version expects the output of a softmax; the other expects just the
    # output of the network
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_pred)
    #cost = Softmax().categorical_cross_entropy(y.flatten(), y_pred)
    cost.name = "cost"

    # Misclassification
    error_rate_brick = MisclassificationRate()
    error_rate = error_rate_brick.apply(y.flatten(), y_hat)
    error_rate.name = "error_rate"

    # put names
    D_params, D_kind = build_params(x, T.matrix(), conv_layers, full_layers)
    # test computation graph
    cg = ComputationGraph(cost)

    # DROPOUT (endo: masking applied to the finished graph)
    L_endo_dropout = L_endo_dropout_conv_layers + L_endo_dropout_full_layers

    cg_dropout = cg
    inputs = VariableFilter(roles=[INPUT])(cg.variables)

    for (index, drop_rate) in enumerate(L_endo_dropout):
        for input_ in inputs:
            m = re.match(r"layer_(\d+)_apply.*", input_.name)
            if m and index == int(m.group(1)):
                if drop_rate < 0.0001:
                    print "Skipped applying dropout on %s because the dropout rate was under 0.0001." % input_.name
                    break
                else:
                    # NOTE(review): this always drops out relative to the ORIGINAL
                    # graph `cg`, so only the last matched layer's dropout survives;
                    # chaining was probably intended: apply_dropout(cg_dropout, ...).
                    # Confirm intent before changing.
                    cg_dropout = apply_dropout(cg, [input_], drop_rate)
                    print "Applied dropout %f on %s." % (drop_rate, input_.name)
                    break

    cg = cg_dropout

    return (cg, error_rate, cost, D_params, D_kind)
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from fuel.transformers import Flatten

# Construct the model: a 784-100-10 MLP (tanh hidden layer, softmax output).
mlp = MLP(activations=[Tanh(), Softmax()], dims=[784, 100, 10],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()

# Calculate the loss function: cross-entropy plus a misclassification monitor.
x = T.matrix('features')
y = T.lmatrix('targets')
y_hat = mlp.apply(x)
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
error_rate = MisclassificationRate().apply(y.flatten(), y_hat)

# load training data using Fuel
# FIX: fuel's MNIST takes `which_sets` as a tuple of split names —
# MNIST(("train",)) — as done everywhere else in this file; a bare string
# is treated as an iterable of characters by newer fuel versions.
mnist_train = MNIST(("train",))
train_stream = Flatten(
    DataStream.default_stream(dataset=mnist_train,
                              iteration_scheme=SequentialScheme(
                                  mnist_train.num_examples, 128)),
)

# load testing data
mnist_test = MNIST(("test",))
test_stream = Flatten(
    DataStream.default_stream(dataset=mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 1024)),
)
update_prob=update_prob, name="rnn")
# NOTE(review): this fragment starts mid-constructor-call (the `rnn = ...(`
# opening is above this chunk); the tail closes that call.
# rnn.apply splits pre_rnn into input and gate halves along the feature axis;
# `drops`/`is_for_test` presumably control zoneout/test-time behaviour — confirm
# against the RNN brick's apply signature.
h1, sd = rnn.apply(pre_rnn[:, :, :h_dim], pre_rnn[:, :, h_dim:], drops, is_for_test)
h1_to_o = Linear(name='h1_to_o', input_dim=h_dim, output_dim=y_dim)
pre_softmax = h1_to_o.apply(h1)
softmax = Softmax()
shape = pre_softmax.shape
# Softmax brick expects a 2-D input: flatten (time, batch, y_dim) to
# (time*batch, y_dim), apply, then restore the original shape.
softmax_out = softmax.apply(pre_softmax.reshape((-1, y_dim)))
softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'

# comparing only last time-step
cost = CategoricalCrossEntropy().apply(y, softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y, softmax_out[-1])
error_rate.name = 'error_rate'

# Initialization
for brick in (x_to_h1, h1_to_o, rnn):
    brick.weights_init = Glorot()
    brick.biases_init = Constant(0)
    brick.initialize()

# Smoke-test: compile the cost and evaluate it on one training batch.
train_stream = get_stream('train', batch_size, h_dim, False)
data = train_stream.get_epoch_iterator(as_dict=True).next()

cg = ComputationGraph(cost)
f = theano.function(cg.inputs, cost)
print f(data['y'], data['x'], data['is_for_test'], data['drops'])
def main(save_to, num_epochs):
    """Train a 784-100-100-10 MLP on MNIST with dropout regularization.

    save_to: checkpoint path; num_epochs: number of training epochs.
    Training uses the dropout graph; test monitoring uses the clean graph.
    """
    mlp = MLP([Tanh(), Tanh(), Softmax()], [784, 100, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost, error_rate])
    cost.name = 'final_cost'
    test_cost = cost  # NOTE(review): assigned but never used below

    # Dropout 0.5 on the inputs of every hidden linear layer (not the first),
    # and 0.1 on the raw input x. dropout_graph.outputs preserves the
    # [cost, error_rate] order of the source graph.
    for_dropout = VariableFilter(roles=[INPUT],
                                 bricks=mlp.linear_transformations[1:])(cg.variables)
    dropout_graph = apply_dropout(cg, for_dropout, 0.5)
    dropout_graph = apply_dropout(dropout_graph, [x], 0.1)
    dropout_cost, dropout_error_rate = dropout_graph.outputs

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    # Optimize the dropout cost; cg.parameters are the same shared variables
    # as in the dropout graph, so updating them is equivalent.
    algorithm = GradientDescent(
        cost=dropout_cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  # Test monitoring uses the CLEAN (no-dropout) graph.
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [dropout_cost, dropout_error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(dropout_cost),
        extensions=extensions)

    main_loop.run()
def main(job_id, params, config_file='params.ec'):
    """Train a 2-hidden-layer MLP on hippocampus features (ADNI data) and
    return a loss for Spearmint hyperparameter optimization.

    job_id: Spearmint job id (unused directly here).
    params: dict of Spearmint-proposed hyperparameters; overrides the config
        values when non-empty.
    config_file: filename under ./configs/ parsed with ConfigParser.
    Returns: spearmint_loss = validation_error + |train_error - validation_error|.
    """
    config = ConfigParser.ConfigParser()
    config.readfp(open('./configs/{}'.format(config_file)))
    pr = pprint.PrettyPrinter(indent=4)
    pr.pprint(config)

    # NOTE(review): in Python 2 ConfigParser.get(section, option, x) the third
    # positional argument is `raw`, NOT a default value — these calls pass a
    # truthy `raw` and will still raise NoOptionError on missing keys. Confirm
    # intent; a try/except NoOptionError helper would give real defaults.
    net_name = config.get('hyperparams', 'net_name', 'adni')
    struct_name = net_name.split('_')[0]
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    # NOTE(review): reads 'valid_batch' again — likely a copy-paste slip for
    # 'test_batch'; verify the config schema before changing.
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))

    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))

    hidden_units = int(config.get('hyperparams', 'hidden_units', 32))
    input_dropout_ratio = float(config.get('hyperparams', 'input_dropout_ratio', 0.2))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    side = config.get('hyperparams', 'side', 'b')

    input_dim = input_dims[struct_name]

    # Spearmint optimization parameters (override config when provided):
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    # Step rule: chosen optimizer plus a hard norm clip on updated variables.
    if 'adagrad' in solver:
        solver_type = CompositeRule([AdaGrad(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])
    else:
        solver_type = CompositeRule([RMSProp(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])

    data_file = config.get('hyperparams', 'data_file')

    # 'b' = bilateral: concatenate left and right hemisphere features;
    # otherwise restrict the dataset sources to the requested side.
    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        train = H5PYDataset(data_file, which_set='train',
                            sources=['{}_features'.format(side), 'targets'])
        valid = H5PYDataset(data_file, which_set='valid',
                            sources=['{}_features'.format(side), 'targets'])
        test = H5PYDataset(data_file, which_set='test',
                           sources=['{}_features'.format(side), 'targets'])
        x = tensor.matrix('{}_features'.format(side))

    y = tensor.lmatrix('targets')

    # Define a feed-forward net with an input, two hidden layers, and a softmax output:
    model = MLP(activations=[
        Rectifier(name='h1'),
        Rectifier(name='h2'),
        Softmax(name='output'),
    ],
                dims=[
                    input_dim[side],
                    hidden_units,
                    hidden_units,
                    2],
                weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    model.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = model.apply(x)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs: input layer gets its own ratio, hidden layers
    # share dropout_ratio.
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [input for input in inputs if input.name.startswith('linear_')]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:], dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning Algorithm (notice: we use the dropout cost for learning):
    algo = GradientDescent(
        step_rule=solver_type,
        params=dropout_graph.parameters,
        cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(
                train.num_examples,
                batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([dropout_cost,
                                               aggregation.mean(error),
                                               aggregation.mean(algo.total_gradient_norm)],
                                              after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(
                valid.num_examples,
                batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(
                test.num_examples,
                batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)

    plotting = Plot('{}_{}'.format(net_name, side),
                    channels=[
                        ['dropout_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}/{}/{}'.format(struct_name, side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed class for early stopping when we detect we have started to overfit:
    # And by that I mean if the means of the val error and training error over the
    # previous 'epochs' is greater than the 'threshold', we are overfitting.
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.05,
                                        epochs=5,
                                        burn_in=100)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(
        data_stream=training_stream,
        model=model,
        algorithm=algo,
        extensions=[
            validation_monitor,
            training_monitor,
            plotting,
            FinishAfter(after_n_epochs=max_epoch),
            early_stopper,
            Printing(),
            ProgressBar(),
            checkpoint,
            test_monitor,
        ])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])

    # Penalize generalization gap on top of validation error.
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
def _create_main_loop(self): # hyper parameters hp = self.params batch_size = hp['batch_size'] biases_init = Constant(0) batch_normalize = hp['batch_normalize'] ### Build fprop tensor5 = T.TensorType(config.floatX, (False, ) * 5) X = tensor5("images") #X = T.tensor4("images") y = T.lvector('targets') gnet_params = OrderedDict() #X_shuffled = X[:, :, :, :, [2, 1, 0]] #X_shuffled = gpu_contiguous(X.dimshuffle(0, 1, 4, 2, 3)) * 255 X = X[:, :, :, :, [2, 1, 0]] X_shuffled = X.dimshuffle((0, 1, 4, 2, 3)) * 255 X_r = X_shuffled.reshape( (X_shuffled.shape[0], X_shuffled.shape[1] * X_shuffled.shape[2], X_shuffled.shape[3], X_shuffled.shape[4])) X_r = X_r - (np.array([104, 117, 123])[None, :, None, None]).astype('float32') expressions, input_data, param = stream_layer_exp(inputs=('data', X_r), mode='rgb') res = expressions['outloss'] y_hat = res.flatten(ndim=2) import pdb pdb.set_trace() ### Build Cost cost = CategoricalCrossEntropy().apply(y, y_hat) cost = T.cast(cost, theano.config.floatX) cost.name = 'cross_entropy' y_pred = T.argmax(y_hat, axis=1) misclass = T.cast(T.mean(T.neq(y_pred, y)), theano.config.floatX) misclass.name = 'misclass' monitored_channels = [] monitored_quantities = [cost, misclass, y_hat, y_pred] model = Model(cost) training_cg = ComputationGraph(monitored_quantities) inference_cg = ComputationGraph(monitored_quantities) ### Get evaluation function #training_eval = training_cg.get_theano_function(additional_updates=bn_updates) training_eval = training_cg.get_theano_function() #inference_eval = inference_cg.get_theano_function() # Dataset test = JpegHDF5Dataset( 'test', #name='jpeg_data_flows.hdf5', load_in_memory=True) #mean = np.load(os.path.join(os.environ['UCF101'], 'mean.npy')) import pdb pdb.set_trace() ### Eval labels = np.zeros(test.num_video_examples) y_hat = np.zeros((test.num_video_examples, 101)) labels_flip = np.zeros(test.num_video_examples) y_hat_flip = np.zeros((test.num_video_examples, 101)) ### Important to shuffle list for batch 
normalization statistic #rng = np.random.RandomState() #examples_list = range(test.num_video_examples) #import pdb; pdb.set_trace() #rng.shuffle(examples_list) nb_frames = 1 for i in xrange(24): scheme = HDF5SeqScheme(test.video_indexes, examples=test.num_video_examples, batch_size=batch_size, f_subsample=i, nb_subsample=25, frames_per_video=nb_frames) #for crop in ['upleft', 'upright', 'downleft', 'downright', 'center']: for crop in ['center']: stream = JpegHDF5Transformer( input_size=(240, 320), crop_size=(224, 224), #input_size=(256, 342), crop_size=(224, 224), crop_type=crop, translate_labels=True, flip='noflip', nb_frames=nb_frames, data_stream=ForceFloatX( DataStream(dataset=test, iteration_scheme=scheme))) stream_flip = JpegHDF5Transformer( input_size=(240, 320), crop_size=(224, 224), #input_size=(256, 342), crop_size=(224, 224), crop_type=crop, translate_labels=True, flip='flip', nb_frames=nb_frames, data_stream=ForceFloatX( DataStream(dataset=test, iteration_scheme=scheme))) ## Do the evaluation epoch = stream.get_epoch_iterator() for j, batch in enumerate(epoch): output = training_eval(batch[0], batch[1]) # import cv2 # cv2.imshow('img', batch[0][0, 0, :, :, :]) # cv2.waitKey(160) # cv2.destroyAllWindows() #import pdb; pdb.set_trace() labels_flip[batch_size * j:batch_size * (j + 1)] = batch[1] y_hat_flip[batch_size * j:batch_size * (j + 1), :] += output[2] preds = y_hat_flip.argmax(axis=1) misclass = np.sum(labels_flip != preds) / float(len(preds)) print i, crop, "flip Misclass:", misclass epoch = stream_flip.get_epoch_iterator() for j, batch in enumerate(epoch): output = training_eval(batch[0], batch[1]) labels[batch_size * j:batch_size * (j + 1)] = batch[1] y_hat[batch_size * j:batch_size * (j + 1), :] += output[2] preds = y_hat.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) print i, crop, "noflip Misclass:", misclass y_merge = y_hat + y_hat_flip preds = y_merge.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) 
print i, crop, "avg Misclass:", misclass ### Compute misclass y_hat += y_hat_flip preds = y_hat.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) print "Misclass:", misclass
# MLP
# Classifier head applied to `conv_features` (defined upstream):
# logistic hidden layer followed by a 2-way softmax.
mlp = MLP(activations=[Logistic(name='sigmoid_0'),
          Softmax(name='softmax_1')], dims=[256, 256, 256, 2],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
# Notebook-output remnants, kept as documentation (they were no-op
# expressions in the original):
#   [child.name for child in mlp.children]
#   ['linear_0', 'sigmoid_0', 'linear_1', 'softmax_1']
Y = mlp.apply(conv_features)
mlp.initialize()

# Setting up the cost function
from blocks.bricks.cost import CategoricalCrossEntropy
# FIX: the original computed CategoricalCrossEntropy().apply(T.flatten(), Y) —
# T.flatten() was called with no tensor argument, which is broken. The cost
# needs an integer target variable, flattened, as its first argument
# (matching every other snippet in this file).
y = T.lmatrix('targets')
cost = CategoricalCrossEntropy().apply(y.flatten(), Y)

from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
from blocks.filter import VariableFilter
cg = ComputationGraph(cost)
print(VariableFilter(roles=[WEIGHT])(cg.variables))
W1, W2, W3 = VariableFilter(roles=[WEIGHT])(cg.variables)

# cost with L2 regularization
# NOTE(review): W1 (the upstream/conv weight) is deliberately left out of the
# penalty here — confirm that is intended.
cost = cost + 0.005 * (W2 ** 2).sum() + 0.005 * (W3 ** 2).sum()
cost.name = 'cost_with_regularization'

#print(cg.variables)
#print(VariableFilter(roles=[WEIGHT])(cg.variables))

# Use Blocks to train this network
# Fragment: `l` and `o` are a Linear brick and its running output defined
# upstream of this chunk.
o = l.apply(o)
o = Rectifier().apply(o)
# Final 10-way output layer on top of the rectified features.
l = Linear(input_dim=l.get_dim("output"), output_dim=10,
           weights_init=IsotropicGaussian(std=0.01),
           biases_init=IsotropicGaussian(std=0.01))
l.initialize()
o = l.apply(o)
o = Softmax().apply(o)

Y = T.imatrix(name="targets")
cost = CategoricalCrossEntropy().apply(Y.flatten(), o)
cost.name = "cost"
# NOTE(review): 1 - misclassification_rate IS the accuracy, so the name
# "accuracy" is right but the variable name `miss_class` is misleading —
# consider renaming (not done here; downstream code may reference it).
miss_class = 1.0 - MisclassificationRate().apply(Y.flatten(), o)
miss_class.name = "accuracy"

cg = ComputationGraph(cost)
print cg.shared_variables

# Give every brick in the graph a unique name by appending its index.
bricks = [get_brick(var) for var in cg.variables if get_brick(var)]
for i, b in enumerate(bricks):
    b.name += str(i)

# NOTE(review): GradientDescent is called without params/parameters —
# confirm the Blocks version in use infers them from the cost graph.
step_rule = AdaM()
algorithm = GradientDescent(cost=cost, step_rule=step_rule)
def main(save_to, num_epochs,
         weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    """Train a residual network with learned multiplicative noise on CIFAR-10.

    save_to: checkpoint filename pattern ('.%d' is stripped for the exp name).
    weight_decay: L2 coefficient on non-mask weights.
    noise_pressure: coefficient of the noise-information (NIT) regularizer.
    histogram: if set, save per-component attributions to this file.
    resume: reload the last checkpoint before training.
    """
    output_size = 10

    prior_noise_level = -10
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
            noise_rate=noise_rate,
            prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    # Clean (inference) graph: no batch-norm stat updates, no training noise.
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
                 .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                       .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                      .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)
    # train_cost, train_error_rate, train_components = train_cg.outputs

    # Training graph: batch normalization active and noise sampling enabled.
    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(y.flatten(),
                train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(y.flatten(),
                train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                train_error_rate, train_components])
    # Exponential moving average of the batch-norm population statistics.
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                for p, m in population_updates]

    # for annealing
    nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
        train_cg.parameters)
    # 'mask' bricks get the separate (tiny) noise learning rate below.
    mask_parameters = [p for p in trainable_parameters
        if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # testversion
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
        which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test, iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # Create a step rule that reduces the learning rate of noise
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)
    #,
    # theano_func_kwargs={
    #     'mode': NanGuardMode(
    #         nan_is_error=True, inf_is_error=True, big_is_error=True)})

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (50, 0.1),     # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4)
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4),
                      # (6, 3e-4),
                      # (8, 1e-3), # Causes nit rate to jump
                      # (10, 3e-3),
                      # (12, 1e-2),
                      # (15, 3e-2),
                      # (19, 1e-1),
                      # (24, 3e-1),
                      # (30, 1)
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + exp_name,
                      channels=[
                          ['train_cost_with_regularization',
                           'train_cost_without_regularization',
                           'train_nit_regularization',
                           'train_l2_regularization'],
                          ['train_error_rate'],
                          ['train_total_gradient_norm'],
                          ['train_mean_log_sigma'],
                      ],
                      every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                      channels=[[
                          'train_error_rate',
                          'test_error_rate',
                          ]],
                      after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            # NOTE(review): `cg` is not defined anywhere in this function —
            # this branch will raise NameError; `train_cg.parameters` was
            # almost certainly intended. Confirm and fix.
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)
    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
# In[40]:
# Blocks tutorial cell: build a 784-50-10 MLP classification graph with
# L2 weight decay, then configure parameter initialization.
from blocks.bricks import Linear, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
from blocks.filter import VariableFilter
from blocks.initialization import IsotropicGaussian, Constant

# Symbolic inputs: flattened images and integer class targets.
x = tensor.matrix("features")
y = tensor.lmatrix("targets")

# Two linear layers with a ReLU in between and a softmax on top.
input_to_hidden = Linear(name="input_to_hidden", input_dim=784, output_dim=50)
hidden_to_output = Linear(name="hidden_to_output", input_dim=50, output_dim=10)

hidden_activation = input_to_hidden.apply(x)
h = Rectifier().apply(hidden_activation)
output_activation = hidden_to_output.apply(h)
y_hat = Softmax().apply(output_activation)

# Cross-entropy loss against the flattened targets.
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)

# Add L2 weight decay on both weight matrices.
cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = "cost_with_regularization"

# Both layers share the same initializer instances, as in the tutorial.
shared_weight_init = IsotropicGaussian(0.01)
shared_bias_init = Constant(0)
for brick in (input_to_hidden, hidden_to_output):
    brick.weights_init = shared_weight_init
    brick.biases_init = shared_bias_init
# Only the first layer is initialized here; hidden_to_output.initialize()
# presumably happens in a later cell.
input_to_hidden.initialize()
def main(save_to, model, train, test, num_epochs, input_size = (150,150),
         learning_rate=0.01, batch_size=50, num_batches=None,
         flatten_stream=False):
    """Train an already-initialised model on an image dataset with plain SGD.

    Parameters
    ----------
    save_to : str
        Where to save the trained model (path given to `Checkpoint`).
    model : brick
        Model given in input; must be already initialised (works with
        convnet and mlp).
    train, test : Fuel datasets
        Must provide 'image_features' and 'targets' sources.
    num_epochs, num_batches : int or None
        Stopping criteria handed to `FinishAfter`.
    input_size : tuple
        The shape of the reshaped image in input (before flattening is
        applied, if `flatten_stream` is True).
    learning_rate : float
        Step size for the `Scale` SGD rule.
    flatten_stream : bool
        If True the image stream is flattened to a matrix (MLP input).
    """
    # Symbolic input: flat matrix for MLPs, 4D tensor for convnets.
    if flatten_stream :
        x = tensor.matrix('image_features')
    else :
        x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    #Data augmentation
    #insert data augmentation here

    #Generating stream
    train_stream = DataStream.default_stream(
        train, iteration_scheme=ShuffledScheme(train.num_examples, batch_size))
    test_stream = DataStream.default_stream(
        test, iteration_scheme=ShuffledScheme(test.num_examples, batch_size))

    #Reshaping procedure
    #Add a crop option in scikitresize so that the image is not deformed
    #Resize to desired square shape
    train_stream = ScikitResize(train_stream, input_size,
                                which_sources=('image_features',))
    test_stream = ScikitResize(test_stream, input_size,
                               which_sources=('image_features',))

    #Flattening the stream
    if flatten_stream is True:
        train_stream = Flatten(train_stream, which_sources=('image_features',))
        test_stream = Flatten(test_stream, which_sources=('image_features',))

    # Apply input to model
    probs = model.apply(x)

    #Defining cost and various indices to watch
    #print(probs)
    #cost = SquaredError().apply(y.flatten(),probs)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')
    error_rate = MisclassificationRate().apply(y.flatten(), probs).copy(
        name='error_rate')

    #Building Computation Graph
    cg = ComputationGraph([cost, error_rate])

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=learning_rate))

    #Defining extensions
    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train", every_n_batches=5),
                  DataStreamMonitoring([cost, error_rate], test_stream,
                                       prefix="test", every_n_batches=25),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=5)]

    # NOTE(review): this rebinding shadows the `model` parameter; the brick
    # passed in is only used through `probs` above.
    model = Model(cost)

    main_loop = MainLoop(
        algorithm,
        train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()
def start(self):
    """Build and train a framewise phoneme-recognition network, then decode
    the test set and write two submission files (framewise predictions and
    trimmed phone sequences).

    NOTE(review): relies on module-level globals (BNUM, SLEN, LABEL, PATH,
    SHELVE, ph2id/id2ph/ph48239/ph2c, trim, CTC_Decode, EditDistance) --
    confirm their definitions against the rest of the file.
    """
    # Symbolic batch: trim the frame count to a multiple of BNUM and fold
    # frames into a (time, BNUM, feature) tensor so recurrent layers could
    # scan over the leading axis.
    xx = T.matrix('features', config.floatX)
    yy = T.imatrix('targets')
    zm = BNUM*(xx.shape[0]//BNUM)
    x = xx[:zm].reshape((BNUM, zm//BNUM, xx.shape[1])).dimshuffle(1, 0, 2)
    y = yy[:zm].reshape((BNUM, zm//BNUM)).dimshuffle(1, 0)

    # Layer widths; NUMS holds per-layer output multipliers (an LSTM layer
    # multiplies the next entry by 4 inside lllistool).
    DIMS = [108*5, 200, 200, 200, LABEL]
    NUMS = [1, 1, 1, 1, 1]
    # One entry per layer transition; None means "Linear only, no activation".
    FUNCS = [
        Rectifier,
        Rectifier,
        Rectifier,
        None,
    ]

    def lllistool(i, inp, func):
        # Build layer i: a Linear transform, optionally followed by an
        # activation or recurrent brick, and return the layer's output.
        if func == LSTM:
            NUMS[i+1] *= 4
        # sdim drives the init scale; recurrent bricks see fan-in + fan-out.
        sdim = DIMS[i]
        if func == SimpleRecurrent or func == LSTM:
            sdim = DIMS[i] + DIMS[i+1]
        l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1],
                   weights_init=IsotropicGaussian(std=sdim**(-0.5)),
                   biases_init=IsotropicGaussian(std=sdim**(-0.5)),
                   name='Lin{}'.format(i))
        l.initialize()
        if func == SimpleRecurrent:
            gong = func(dim=DIMS[i+1], activation=Rectifier(),
                        weights_init=IsotropicGaussian(std=sdim**(-0.5)))
            gong.initialize()
            ret = gong.apply(l.apply(inp))
        elif func == LSTM:
            gong = func(dim=DIMS[i+1], activation=Tanh(),
                        weights_init=IsotropicGaussian(std=sdim**(-0.5)))
            gong.initialize()
            print(inp)
            # Zero-initialised states and cells per sequence in the batch.
            ret, _ = gong.apply(
                l.apply(inp),
                T.zeros((inp.shape[1], DIMS[i+1])),
                T.zeros((inp.shape[1], DIMS[i+1])),
            )
        elif func == SequenceGenerator:
            gong = func(
                readout=None,
                transition=SimpleRecurrent(dim=100, activation=Rectifier(),
                                           weights_init=IsotropicGaussian(std=0.1)))
            ret = None
        elif func == None:
            ret = l.apply(inp)
        else:
            gong = func()
            ret = gong.apply(l.apply(inp))
        return ret

    # Stack the layers.
    oup = x
    for i in range(len(DIMS)-1):
        oup = lllistool(i, oup, FUNCS[i])
    y_hat = oup

    # Flatten (time, batch) into one axis, in both axis orderings.
    y_rsp = y.reshape((y.shape[0]*y.shape[1],))
    y_dsf_rsp = y.dimshuffle(1, 0).reshape((y.shape[0]*y.shape[1],))
    yh_rsp = y_hat.reshape((y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
    yh_dsf_rsp = y_hat.dimshuffle(1, 0, 2).reshape(
        (y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
    sfmx = Softmax().apply(yh_rsp)

    # Framewise cross-entropy training cost.
    cost = CategoricalCrossEntropy().apply(y_rsp, sfmx)
    cost = cost.astype(config.floatX)
    cg = ComputationGraph(cost)
    # Kept for pickling below (would be the pre-dropout graph if the
    # commented dropout lines were enabled).
    orig_cg = cg
    ips = VariableFilter(roles=[INPUT])(cg.variables)
    ops = VariableFilter(roles=[OUTPUT])(cg.variables)
    # cg = apply_dropout(cg, ips[0:2:1], 0.2)
    # cg = apply_dropout(cg, ips[2:-2:1], 0.5)
    # cost = cg.outputs[0].astype(config.floatX)
    cost.name = 'cost'

    # Lookup table mapping 48-phone ids to 39-phone ids.
    mps = theano.shared(np.array([ph2id(ph48239(id2ph(t))) for t in range(48)]))
    z_hat = T.argmax(yh_dsf_rsp, axis=1)
    z_hat_hat = CTC_Decode()(y_hat)
    y39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[y_dsf_rsp])
    y_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[z_hat])
    # Edit distance is computed on the argmax decode, not the CTC decode.
    y_hat_hat39 = y_hat39

    # Monitored quantities.  The '0/1 ...' and '2/3 ...' pairs are the same
    # expression under two names: one monitored on the training stream, one
    # on the validation stream.
    lost01 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
    lost01.name = '0/1 loss'
    lost23 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
    lost23.name = '2/3 loss'
    edit01 = EditDistance()(y39, y_hat_hat39).astype(config.floatX)
    edit01.name = '0/1 edit'
    edit23 = EditDistance()(y39, y_hat_hat39).astype(config.floatX)
    edit23.name = '2/3 edit'

    # Total L2 norm of all parameters (monitoring only).
    Ws = cg.parameters
    print(list(Ws))
    norms = sum(w.norm(2) for w in Ws)
    norms = norms.astype(config.floatX)
    norms.name = 'norms'

    # Fuel data streams over the HDF5 train/validate splits.
    path = pjoin(PATH['fuel'], 'train_train.hdf5')
    data = H5PYDataset(path, which_set='train', load_in_memory=True,
                       subset=slice(0, 100000))
    data_v = H5PYDataset(pjoin(PATH['fuel'], 'train_validate.hdf5'),
                         which_set='validate', load_in_memory=True)
    num = data.num_examples
    data_stream = DataStream(data, iteration_scheme=ShuffledScheme(
        num, batch_size=SLEN*BNUM))
    data_stream_v = DataStream(data_v, iteration_scheme=SequentialScheme(
        data_v.num_examples, batch_size=SLEN*BNUM))

    algo = GradientDescent(cost=cost, params=Ws, step_rule=CompositeRule([
        Momentum(0.005, 0.9)
    ]))

    monitor = DataStreamMonitoring(
        variables=[cost, lost01, edit01, norms], data_stream=data_stream)
    monitor_v = DataStreamMonitoring(
        variables=[lost23, edit23], data_stream=data_stream_v)
    plt = Plot('AlpYap', channels=[['0/1 loss', '2/3 loss'],
                                   ['0/1 edit', '2/3 edit']], after_epoch=True)

    main_loop = MainLoop(data_stream = data_stream,
                         algorithm=algo,
                         extensions=[monitor, monitor_v,
                                     FinishAfter(after_n_epochs=2000),
                                     Printing(), plt])
    main_loop.run()

    # Persist the computation graph for later reuse.
    pfile = open('zzz.pkl', 'wb')
    pickle.dump(orig_cg, pfile)
    pfile.close()

    ################
    # Decode the test set in 10000-frame slabs and write submission files.
    test_feat = np.load(pjoin(PATH['numpy'],
                              'train_test_features.npy')).astype(config.floatX)
    func = theano.function([xx], y_hat.astype(config.floatX))
    test_hat = []
    for i in range(19):
        tmp = func(test_feat[i*10000:(i+1)*10000])
        # (time, batch, label) -> (frames, label)
        tmp = tmp.transpose((1, 0, 2)).reshape(
            (tmp.shape[0]*tmp.shape[1], tmp.shape[2]))
        test_hat.append(tmp)
    test_hat = np.concatenate(test_hat, axis=0)
    # Pad two zero frames so per-utterance slicing below never runs short.
    test_hat = np.concatenate((test_hat, np.zeros((2, LABEL))), axis=0)

    # Compiled argmax decoder over a per-utterance probability tensor.
    alpha = T.tensor3(config.floatX)
    beta = alpha.argmax(axis=2)
    func2 = theano.function([alpha], beta)

    # Per-utterance frame counts and per-frame tags from shelve metadata.
    lens = []
    tags = []
    with shelve.open(SHELVE['test']) as f:
        names = f['names']
        for n in names:
            lens.append(len(f[n]))
            for i in range(lens[-1]):
                tags.append(n+'_'+str(i+1))

    seq = []
    seq2 = []
    nowcnt = 0
    for i in lens:
        nxt = nowcnt + i
        cur_hat = test_hat[nowcnt:nxt].reshape((i, 1, LABEL)).astype(config.floatX)
        nowcnt = nxt
        fc2 = func2(cur_hat).flatten()
        fc3 = []
        fc4 = []
        for j in fc2:
            fc3.append(ph48239(id2ph(j)))
            fc4.append(ph2c(ph48239(id2ph(j))))
        seq.append(fc3)
        seq2.append(''.join(trim(fc4)))
    seq_flat = np.concatenate(seq)

    # hw1: framewise phone predictions; hw2: trimmed phone-character strings.
    with open('hw1_outz.txt', 'w') as f:
        f.write('id,prediction\n')
        for t, i in zip(tags, seq_flat):
            f.write(t+','+i+'\n')
    with open('hw2_outz.txt', 'w') as f:
        f.write('id,phone_sequence\n')
        for n, i in zip(names, seq2):
            f.write(n+','+i+'\n')
def train_net(net, train_stream, test_stream, L1 = None, L2=None,
              early_stopping=False, finish=None, dropout=False, jobid=None,
              update=None, duration= None, **ignored):
    """Train `net` with categorical cross-entropy and monitor on `test_stream`.

    Parameters
    ----------
    net : brick
        Uninitialized network; `net.initialize()` is called here.
    train_stream, test_stream : Fuel streams
        Provide 'image_features' (4D) and 'targets' sources.
    L1, L2 : float or None
        Regularization coefficients (added to the cost when truthy).
    early_stopping : bool
        Stop when the test misclassification rate stops improving.
    finish : int or None
        Hard epoch limit (`FinishAfter`).
    dropout : bool
        Apply 0.5 dropout to the WEIGHT-role variables of the cost graph.
    jobid : str or None
        Prefix for checkpoint filenames (falls back to the pid).
    update : str or None
        "rmsprop" selects RMSProp instead of plain Scale(0.1).
    duration : float or None
        Wall-clock limit in seconds (`FinishAfterTime`).
    """
    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')
    y_hat = net.apply(x)

    #Cost
    cost_before = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost_before.name = "cost_without_regularization"

    #Error
    #Taken from brodesf
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = "Misclassification rate"

    #Regularization
    cg = ComputationGraph(cost_before)
    WS = VariableFilter(roles=[WEIGHT])(cg.variables)

    if L1:
        print("L1 with lambda ",L1)
        L1_reg = L1 * sum([abs(W).sum() for W in WS])
        L1_reg.name = "L1 regularization"
        cost_before += L1_reg

    if L2:
        print("L2 with lambda ",L2)
        L2_reg = L2 * sum([(W ** 2).sum() for W in WS])
        L2_reg.name = "L2 regularization"
        cost_before += L2_reg

    cost = cost_before
    cost.name = 'cost_with_regularization'

    # BUG FIX: previously dropout was applied to a graph built *before*
    # regularization and the dropped-out graph's output was never used --
    # GradientDescent kept training on the original `cost`, so dropout=True
    # silently had no effect.  Rebuild the graph from the final cost and,
    # when requested, train on the dropped-out output (monitoring below
    # still uses the clean `cost`/`error`).
    cg = ComputationGraph(cost)
    train_cost = cost
    if dropout:
        print("Dropout")
        cg = apply_dropout(cg, WS, 0.5)
        train_cost = cg.outputs[0]
        train_cost.name = 'cost_with_regularization'

    #Initialization
    print("Initilization")
    net.initialize()

    #Algorithm
    step_rule = Scale(learning_rate=0.1)
    if update is not None:
        if update == "rmsprop":
            print("Using RMSProp")
            step_rule = RMSProp()

    # Guard against NaN/Inf steps regardless of the chosen rule.
    remove_not_finite = RemoveNotFinite(0.9)
    step_rule = CompositeRule([step_rule, remove_not_finite])
    algorithm = GradientDescent(cost=train_cost, parameters=cg.parameters,
                                step_rule=step_rule)

    print("Extensions")
    extensions = []

    #Monitoring
    monitor = DataStreamMonitoring(variables=[cost, error],
                                   data_stream=test_stream, prefix="test")
    extensions.append(monitor)

    def filename(suffix=""):
        # Unique checkpoint path from the job id (or pid) plus a timestamp.
        prefix = jobid if jobid else str(os.getpid())
        ctime = str(time.time())
        return "checkpoints/" + prefix + "_" + ctime + "_" + suffix + ".zip"

    #Serialization
    #serialization = Checkpoint(filename())
    #extensions.append(serialization)

    # Track the best test error and checkpoint whenever it improves.
    notification = "test_"+error.name
    track = TrackTheBest(notification)
    best_notification = track.notification_name
    checkpointbest = SaveBest(best_notification, filename("best"))
    extensions.extend([track, checkpointbest])

    if early_stopping:
        print("Early stopping")
        stopper = FinishIfNoImprovementAfterPlus(best_notification)
        extensions.append(stopper)

    #Other extensions
    if finish != None:
        print("Force finish ", finish)
        extensions.append(FinishAfter(after_n_epochs=finish))

    if duration != None:
        print("Stop after " , duration, " seconds")
        extensions.append(FinishAfterTime(duration))

    extensions.extend([
        Timing(),
        Printing()
    ])

    #Main loop
    main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                         extensions=extensions)

    print("Main loop start")
    main_loop.run()
Convolutional(filter_size=(1, 1), num_filters=1024, name='Convx3'), Rectifier(), MaxPooling((2, 2), name='MaxPol2'), Convolutional(filter_size=(1, 1), num_filters=2, name='Convx4'), Rectifier(), ]) conv_sequence1 = ConvolutionalSequence(conv_layers1, num_channels=512, image_size=(10, 10), weights_init=Orthogonal(), use_bias=False, name='ConvSeq3') conv_sequence1.initialize() out_soft1 = Flattener(name='Flatt1').apply(conv_sequence1.apply(out5)) predict1 = NDimensionalSoftmax(name='Soft1').apply(out_soft1) cost1 = CategoricalCrossEntropy(name='Cross1').apply( y.flatten(), predict1).copy(name='cost1') #SECOND SOFTMAX conv_layers2 = list([ MaxPooling((2, 2), name='MaxPol2'), Convolutional(filter_size=(1, 1), num_filters=128, name='Convx21'), Rectifier(), MaxPooling((2, 2), name='MaxPol11'), Convolutional(filter_size=(1, 1), num_filters=1024, name='Convx31'), Rectifier(), MaxPooling((2, 2), name='MaxPol21'), Convolutional(filter_size=(1, 1), num_filters=2, name='Convx41'), Rectifier(), ]) conv_sequence2 = ConvolutionalSequence(conv_layers2, num_channels=832,
def main():
    """Train a small two-conv-layer CNN ('DvC' = Dogs vs Cats, presumably --
    TODO confirm) and run the Blocks main loop.

    NOTE(review): the middle of this function was lost to a credentials
    redaction; see the note inline.
    """
    # Architecture / training hyper-parameters.
    feature_maps = [20, 50]
    mlp_hiddens = [50]
    conv_sizes = [5, 5]
    pool_sizes = [3, 3]
    save_to = "DvC.pkl"
    batch_size = 500
    image_size = (32, 32)
    output_size = 2
    learningRate = 0.1
    num_epochs = 10
    num_batches = None
    # NOTE(review): the source is corrupted here.  The Plot server URL's
    # credentials were redacted ('*****:*****') and the code between the
    # `host_plot` assignment and the tail of an `extensions.append(Plot(...))`
    # call (network construction, cost, algorithm, the other extensions)
    # was lost along with the redaction.  Preserved verbatim below; this
    # fragment does not parse as-is and must be restored from the original.
    host_plot = 'http://*****:*****@ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()), channels=[['valid_cost', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot))
    model = Model(cost)
    main_loop = MainLoop(algorithm, stream_data_train, model=model,
                         extensions=extensions)
    main_loop.run()
Xs = tensor.imatrix("context") y = tensor.ivector('center') w1 = LookupTable(name="w1", length=VOCAB_DIM, dim=EMBEDDING_DIM) w2 = Linear(name='w2', input_dim=EMBEDDING_DIM, output_dim=VOCAB_DIM) hidden = tensor.mean(w1.apply(Xs), axis=1) y_hat = Softmax().apply(w2.apply(hidden)) w1.weights_init = w2.weights_init = IsotropicGaussian(0.01) w1.biases_init = w2.biases_init = Constant(0) w1.initialize() w2.initialize() cost = CategoricalCrossEntropy().apply(y, y_hat) cg = ComputationGraph(cost) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum() cost.name = "loss" # # the actual training of the model # main = MainLoop(data_stream = DataStream.default_stream( dataset, iteration_scheme=SequentialScheme(dataset.num_instances, batch_size=512)), algorithm = GradientDescent(
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, batch_size=500):
    """Train a LeNet-style convnet on MNIST with plain SGD.

    Parameters
    ----------
    save_to : str
        Path for the `Checkpoint` extension.
    num_epochs : int
        Number of training epochs (`FinishAfter`).
    feature_maps, mlp_hiddens, conv_sizes, pool_sizes : list or None
        Architecture hyper-parameters; None selects the defaults below.
    batch_size : int
        Mini-batch size for both the train and test streams.
    """
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    # NOTE(review): `zip(...)` here is assumed to be a materialized list
    # (Python 2 era code); on Python 3 a lazy zip could be exhausted if
    # LeNet iterates it more than once -- confirm before porting.
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        logging.info("Layer {} dim: {} {} {}".format(i,
                                                     *layer.get_dim('output')))

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = named_copy(CategoricalCrossEntropy().apply(y.flatten(), probs),
                      'cost')
    error_rate = named_copy(MisclassificationRate().apply(y.flatten(), probs),
                            'error_rate')

    cg = ComputationGraph([cost, error_rate])

    mnist_train = MNIST(("train", ))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test", ))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate], mnist_test_stream,
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ], prefix="train", after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]
    model = Model(cost)
    main_loop = MainLoop(algorithm, mnist_train_stream, model=model,
                         extensions=extensions)
    main_loop.run()
def start(self):
    """Build and train a feed-forward DNN over framewise features with 1943
    senone-state outputs, monitored through a 39-phone mapping.

    NOTE(review): relies on module-level globals (PATH, pfx, ph2id,
    state239) -- confirm against the rest of the file.
    """
    x = T.matrix('features', config.floatX)
    y = T.imatrix('targets')
    # Kept on the instance so a prediction function can be compiled later.
    self.x = x

    # Layer widths (input = 108*5 stacked frames, output = 1943 states);
    # NUMS are per-layer output multipliers (all 1 for this plain DNN).
    DIMS = [108*5, 1000, 1000, 1000, 1000, 1943]
    NUMS = [1, 1, 1, 1, 1, 1]
    FUNCS = [
        Rectifier,
        Rectifier,
        Rectifier,
        Rectifier,
        Softmax,
    ]

    def lllistool(i, inp, func):
        # Build layer i: Linear transform followed by activation `func`.
        l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1],
                   weights_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
                   biases_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
                   name='Lin{}'.format(i))
        l.initialize()
        # NOTE(review): this sets `name` on the brick *class*, not on the
        # instance created below -- presumably unintended; verify.
        func.name='Fun{}'.format(i)
        if func == SimpleRecurrent:
            gong = func(dim=DIMS[i+1], activation=Rectifier(),
                        weights_init=IsotropicGaussian(
                            std=(DIMS[i]+DIMS[i+1])**(-0.5)))
        else:
            gong = func()
        ret = gong.apply(l.apply(inp))
        return ret

    # Stack the layers.
    oup = x
    for i in range(len(DIMS)-1):
        oup = lllistool(i, oup, FUNCS[i])
    y_hat = oup
    self.y_hat_prob = y_hat

    # Framewise cross-entropy; dropout is applied to the graph's INPUT-role
    # variables (0.2 on the first two, 0.5 on the middle ones) and the
    # dropped-out cost is the one actually trained on.
    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           y_hat).astype(config.floatX)
    cg = ComputationGraph(cost)
    orig_cg = cg
    ips = VariableFilter(roles=[INPUT])(cg.variables)
    ops = VariableFilter(roles=[OUTPUT])(cg.variables)
    cg = apply_dropout(cg, ips[0:2:1], 0.2)
    cg = apply_dropout(cg, ips[2:-2:1], 0.5)
    cost = cg.outputs[0]
    cost.name = 'cost'

    # Lookup table mapping each of the 1943 states to a 39-phone id.
    # mps = theano.shared(np.array([ph2id(ph48239(id2ph(t))) for t in range(48)]))
    mps = theano.shared(np.array([ph2id(state239(t)) for t in range(1943)]))
    z_hat = T.argmax(y_hat, axis=1)
    y39,_ = scan(fn=lambda t: mps[t], outputs_info=None,
                 sequences=[y.flatten()])
    y_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None,
                     sequences=[z_hat])
    self.y_hat39 = y_hat39

    # '0/1 loss' and '2/3 loss' are the same expression under two names:
    # one is monitored on the training stream, one on validation.
    lost01 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
    lost01.name = '0/1 loss'
    lost23 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
    #lost23 = MisclassificationRate().apply(y39, y_hat39).astype(config.floatX)
    lost23.name = '2/3 loss'

    # Total L2 norm of the weight matrices (monitoring only).
    Ws = VariableFilter(roles=[WEIGHT])(cg.variables)
    norms = sum(w.norm(2) for w in Ws)
    norms.name = 'norms'

    # Fuel streams over the HDF5 train/validate splits.
    path = pjoin(PATH['fuel'], pfx+'_train.hdf5')
    data = H5PYDataset(path, which_set='train', load_in_memory=True,
                       subset=slice(0, 100000))
    # data = H5PYDataset(path, which_set='train', load_in_memory=True)
    data_v = H5PYDataset(pjoin(PATH['fuel'], pfx+'_validate.hdf5'),
                         which_set='validate', load_in_memory=True)
    num = data.num_examples
    data_stream = DataStream(data, iteration_scheme=ShuffledScheme(
        num, batch_size=128))
    data_stream_v = DataStream(data_v, iteration_scheme=SequentialScheme(
        data_v.num_examples, batch_size=128))

    algo = GradientDescent(cost=cost, params=cg.parameters,
                           step_rule=CompositeRule([Momentum(0.002, 0.9)]))

    monitor = DataStreamMonitoring(
        variables=[cost, lost01, norms], data_stream=data_stream)
    monitor_v = DataStreamMonitoring(
        variables=[lost23], data_stream=data_stream_v)
    plt = Plot('AlpAlpAlp', channels=[['0/1 loss', '2/3 loss']],
               after_epoch=True)

    main_loop = MainLoop(data_stream = data_stream,
                         algorithm=algo,
                         extensions=[monitor, monitor_v,
                                     FinishAfter(after_n_epochs=2000),
                                     Printing(), plt])
    main_loop.run()
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs,
          test_cost, experiment_path, initialization, init_width,
          weight_noise, z_prob, z_prob_states, z_prob_cells,
          drop_prob_igates, ogates_zoneout, batch_size, stoch_depth,
          share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff,
          penalty, testing, seq_len, decrease_lr_after_epoch, lr_decay,
          **kwargs):
    # Word-level PTB language-modelling experiment with zoneout-style
    # stochastic masks on an LSTM/GRU/SRNN stack (Python 2 code: print
    # statements, .iteritems(), .next()).
    # `algorithm` selects the optimizer ('adam'/'rms_prop'/'momentum'/'sgd')
    # and is rebound to the GradientDescent instance further down.
    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input
            array plus an extra dimension, containing the 'one-hot'-encoded
            labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    # Tiny subsets for smoke tests.
    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    # Resolve the zoneout-probability arguments: either one shared z_prob,
    # or separate per-states / per-cells probabilities.
    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError(
                'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)'
            )
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'

    # rng = np.random.RandomState(seed)

    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################
    def prep_dataset(dataset):
        # Truncate to whole (seq_len * batch_size) chunks, reshape to
        # (n_chunks, batch, seq_len), and attach the zoneout mask sources.
        dataset = dataset[:(len(dataset) - (len(dataset) %
                                            (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                                   drop_prob_igates, layer_size, num_layers,
                                   False, stoch_depth, share_mask,
                                   gaussian_drop, alphabetsize)
        stream.sources = ('data', ) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream, )

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################
    # One batch used only to populate Theano test values below.
    data = train_stream.get_epoch_iterator(as_dict=True).next()
    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    # Input and target are the same stream; the network predicts the next
    # step because the readout below is shifted by one (T.join with the
    # initial state and states[:-1]).
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    # Input projections and recurrent layers; LSTM needs 4x gates, GRU 3x.
    if rnn_type.lower() == 'lstm':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize, layer_size * 4,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropLSTM(dim=layer_size,
                     weights_init=weights_init,
                     activation=Tanh(),
                     model_type=6,
                     name='rnn%d' % l,
                     ogates_zoneout=ogates_zoneout) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize, layer_size * 3,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropGRU(dim=layer_size,
                    weights_init=weights_init,
                    activation=Tanh(),
                    name='rnn%d' % l) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize, layer_size,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropSimpleRecurrent(dim=layer_size,
                                weights_init=weights_init,
                                activation=Rectifier(),
                                name='rnn%d' % l) for l in range(num_layers)
        ]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x  #in_to_hid.apply(x)

    # Shared variables carry the final hidden state of each layer over to
    # the next batch (truncated BPTT across an epoch); init_updates maps
    # each init variable to its end-of-batch value.
    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(
                rnn_embedding,
                zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size],
                states_init, cells_init)
            init_updates.update([(states_init, states[-1]),
                                 (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states,
                                 zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    # Readout shifted one step back: predict x[t] from states up to t-1.
    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init],
                                                states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize)))
    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################
    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)),
                                           y_hat).copy('cost')
    bpc = (cost / np.log(2.0)).copy(name='bpr')
    perp = T.exp(cost).copy(name='perp')

    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    # Penalize changes in the magnitude of cell/hidden activations over
    # time (norm-stabilizer regularization).
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states' % l in [
                o.name
                for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
            ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states' % l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(
                        T.sum((norms[1:] - norms[:-1])**2, axis=0) /
                        (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  #should this be cost_train.outputs[0]? no.
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    # Optimizer selection; `learning_rate` is rebound to the step rule's
    # shared variable so LearningRateSchedule below can mutate it.
    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate,
                                momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})

    # Carry hidden states across batches.
    algorithm.add_updates(init_updates)

    # Debug helpers for monitoring why training explodes: condition number
    # and RMS of 2D parameters.
    def cond_number(x):
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x * x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    # NOTE(review): `p.get_value().shape == 2` compares a tuple to an int
    # and is always False, so these lists stay empty; presumably
    # `len(...) == 2` was intended.  They are also commented out of
    # `observed_vars` below, so this is dead debug code.
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str, p.get_value().shape)))))
            whysplode_rms.append(
                rms(p).copy('ini%d:%s_rms(%s)' %
                            (i, p.name,
                             "x".join(map(str, p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str, p.get_value().shape)))))
            whysplode_rms.append(
                rms(p).copy('ini%d:%s_rms(%s)' %
                            (i, p.name,
                             "x".join(map(str, p.get_value().shape)))))

    observed_vars = [
        cost_train, cost, bpc, perp, learning_rate,
        aggregation.mean(
            algorithm.total_gradient_norm).copy("gradient_norm_mean")
    ]  # + whysplode_rms
    # Per-parameter norms and gradient norms.
    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))

    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    # Validation graph: clone the state-init variables so evaluation does
    # not clobber the training recurrences.
    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] +
                              init_updates.values()).replace(
                                  zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp],
                                       data_stream=valid_stream,
                                       prefix="dev",
                                       updates=dev_init_updates)

    # noone does this
    # Optional warm start from a saved parameter archive.
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions = []
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])

    if test_cost:
        # Same cloning trick as for the dev monitor, on the test stream.
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] +
                                   init_updates.values()).replace(
                                       zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream,
            prefix="test",
            updates=test_init_updates)
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """
        def __init__(self, shvars):
            self.shvars = shvars
        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))

    extensions.append(
        RollsExtension(init_updates.keys() + dev_init_updates.keys() +
                       (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """
        def __init__(self):
            self.epoch_number = 0
        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value() / lr_decay)

    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError(
            'Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
########## hyper parameters########################################### # We push initialization config to set different initialization schemes convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=0.2) convnet.layers[1].weights_init = Uniform(width=0.2) convnet.layers[2].weights_init = Uniform(width=0.2) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=0.2) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=0.2) convnet.initialize() ######################################################### #Generate output and error signal predict = convnet.apply(x) cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost') error = MisclassificationRate().apply(y.flatten(), predict) #Little trick to plot the error rate in two different plots (We can't use two time the same data in the plot for a unknow reason) error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) ########### ALGORITHM of training############# algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam(learning_rate=0.0005)) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid"),
def main(feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=None, num_batches=None):
    # Train a six-conv-layer network on 128x128 images with two output
    # classes (see output_size below).
    #
    # NOTE(review): this chunk was scrubbed (credentials replaced by
    # '*****') and the statements between the hyper-parameter section and
    # the Plot/Checkpoint extensions were lost in extraction, so the middle
    # of this function does not parse as-is.  The code is preserved
    # verbatim; only comments were added.
    if feature_maps is None:
        feature_maps = [32, 48, 64, 96, 96, 128]   # filters per conv layer
    if mlp_hiddens is None:
        mlp_hiddens = [1000]                       # fully-connected layer sizes
    if conv_sizes is None:
        conv_sizes = [9, 7, 5, 3, 2, 1]            # conv filter widths
    if pool_sizes is None:
        pool_sizes = [2, 2, 2, 2, 1, 1]            # pooling widths
    if batch_size is None:
        batch_size = 64
    conv_steps=[2, 1, 1, 1, 1, 1] #same as stride
    image_size = (128, 128)
    output_size = 2
    learningRate = 0.001
    drop_prob = 0.4
    weight_noise = 0.75
    num_epochs = 150
    # NOTE(review): unconditionally overrides the num_batches parameter.
    num_batches = None
    # NOTE(review): the next run of tokens appears to be the tail of a
    # scrubbed `extensions.append(Plot('...'))` call inside a try/except
    # guarding an optional plotting import; the `try:` line and the code
    # building `extensions`, `cost`, `algorithm` and `stream_data_train`
    # were lost.  Preserved byte-for-byte below.
    host_plot='http://*****:*****@ %s' % (graph_name, datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot))
    PLOT_AVAILABLE = True
    except ImportError:
        PLOT_AVAILABLE = False
    # Checkpoint the whole main loop each epoch and at the end; the log is
    # saved to a separate file.
    extensions.append(Checkpoint(save_to, after_epoch=True, after_training=True, save_separately=['log']))
    logger.info("Building the model")
    model = Model(cost)
    ########### Loading images #####################
    main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions)
    main_loop.run()
def main(save_to, num_epochs, regularization=0.0003, subset=None,
         num_batches=None, histogram=None, resume=False):
    """Train LeNet-5 on MNIST with L2 regularization and checkpointing.

    Parameters
    ----------
    save_to : str
        Path the Checkpoint extension writes the main loop to (and the
        Load extension reads from when resuming).
    num_epochs : int
        Stop training after this many epochs.
    regularization : float
        Coefficient of the L2 penalty on all WEIGHT-role variables.
    subset : int or None
        If given, train on a `subset`-sized slice centred on example
        30000 of the training set instead of the full set.
    num_batches : int or None
        If given, also stop after this many batches.
    histogram : str or None
        If given, collect per-component attributions during training and
        save them to this filename afterwards.
    resume : bool
        If True, reload training state from `save_to` before running.
    """
    batch_size = 500
    output_size = 10

    convnet = create_lenet_5()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Prediction and the monitored quantities derived from it.
    probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    components = (ComponentwiseCrossEntropy().apply(y.flatten(), probs)
                  .copy(name='components'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))
    confusion = (ConfusionMatrix().apply(y.flatten(), probs)
                 .copy(name='confusion'))
    # Sum (rather than average) confusion counts over a monitoring pass.
    confusion.tag.aggregation_scheme = Sum(confusion)

    cg = ComputationGraph([cost, error_rate, components])

    # L2-regularize every weight matrix; monitor the penalty separately.
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    l2_norm = sum((W ** 2).sum() for W in weights)
    l2_norm.name = 'l2_norm'
    cost = cost + regularization * l2_norm
    cost.name = 'cost_with_regularization'

    def _shuffled_stream(dataset):
        # Shuffled minibatch stream over the whole dataset.
        return DataStream.default_stream(
            dataset,
            iteration_scheme=ShuffledScheme(
                dataset.num_examples, batch_size))

    if subset:
        # A `subset`-sized slice centred on the middle of the train set.
        start = 30000 - subset // 2
        mnist_train = MNIST(("train",), subset=slice(start, start + subset))
    else:
        mnist_train = MNIST(("train",))
    mnist_train_stream = _shuffled_stream(mnist_train)

    mnist_test = MNIST(("test",))
    mnist_test_stream = _shuffled_stream(mnist_test)

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=AdaDelta(decay_rate=0.99))

    # `Timing` reports time for reading data, aggregating a batch and
    # monitoring; `ProgressBar` displays a progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  DataStreamMonitoring(
                      [cost, error_rate, confusion],
                      mnist_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate, l2_norm,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing()]

    if histogram:
        # Insert at the front so attribution runs before the other
        # extensions on every batch.
        attribution = AttributionExtension(
            components=components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(cost)
    main_loop = MainLoop(algorithm, mnist_train_stream, model=model,
                         extensions=extensions)
    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def main(save_to, num_epochs):
    # MNIST MLP example extended with an experimental multi-scale
    # attention crop (SoftRectangularCropper).
    # NOTE(review): this is work-in-progress debug code (leftover
    # breakpoint, undefined `batch`) - see inline notes; it cannot run
    # as-is.  Code preserved verbatim; only comments added.
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    #attention --->
    # NOTE(review): (784, 100) looks like (flat input dim, hidden dim)
    # rather than a 2-D image shape (28, 28) - confirm intent.
    patch_shape = (16, 16); image_shape = (784,100);
    import numpy
    import theano.tensor as T
    n_spatial_dims = 2
    cropper = SoftRectangularCropper(n_spatial_dims=n_spatial_dims,
                                     patch_shape=patch_shape,
                                     image_shape=image_shape,
                                     kernel=Gaussian())
    batch_size = 10
    # 13 zoom factors in a geometric progression; one patch per scale.
    scales = 1.3**numpy.arange(-7, 6)
    n_patches = len(scales)
    # Every patch is centred on the middle of the image.
    locations = (numpy.ones((n_patches, batch_size, 2)) *
                 image_shape/2).astype(numpy.float32)
    scales = numpy.tile(scales[:, numpy.newaxis, numpy.newaxis],
                        (1, batch_size, 2)).astype(numpy.float32)
    # One crop per (location, scale) pair, stacked along a new axis.
    Tpatches = T.stack(*[cropper.apply(x, T.constant(location),
                                       T.constant(scale))[0]
                         for location, scale in zip(locations, scales)])
    # NOTE(review): `batch` is not defined in this function - NameError
    # unless it is a module-level global; confirm.
    patches = theano.function([x], Tpatches)(batch['features'])
    # NOTE(review): leftover debugger breakpoint - remove before use.
    import ipdb as pdb; pdb.set_trace()
    probs = mlp.apply(tensor.flatten(patches, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)
    cg = ComputationGraph([cost])
    # L2 penalty on the MLP's two weight matrices.
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'
    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  # Monitor on the test set in batches of 500.
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]
    # Live plotting only when the optional blocks-extras package exists.
    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example', channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm']]))
    # Train sequentially in batches of 50.
    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)
    main_loop.run()
def main(save_to, num_epochs, flag, ksize):
    """Train a simple recurrent classifier on sequential MNIST.

    Architecture: a linear projection (784 -> 100), a tanh
    SimpleRecurrent layer, and a Rectifier+Softmax readout applied to
    the hidden state of the last time step.  `flag` and `ksize` are
    forwarded to the `_meanize` mapping that reshapes each batch into
    `n_steps`-long sequences.  The best model (by validation error) is
    saved to 'best_' + save_to + '.pkl'.
    """
    batch_size = 128
    state_dim = 100
    n_steps = 20

    # Bricks: input projection, recurrent core and readout MLP.
    in_proj = MLP([Identity()], [784, state_dim],
                  biases_init=Constant(0.),
                  weights_init=IsotropicGaussian(.001))
    readout = MLP([Rectifier(), Softmax()], [state_dim, state_dim, 10],
                  biases_init=Constant(0.),
                  weights_init=IsotropicGaussian(.001))
    recurrent = SimpleRecurrent(dim=state_dim, activation=Tanh(),
                                weights_init=Orthogonal())
    for brick in (in_proj, readout, recurrent):
        brick.initialize()

    x = tensor.tensor3('features')
    y = tensor.lmatrix('targets')

    hidden = recurrent.apply(in_proj.apply(x))
    # Classify from the final time step's hidden state.
    probs = tensor.flatten(readout.apply(hidden[-1]), outdim=2)

    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)
    cost.name = 'final_cost'
    error_rate.name = 'error_rate'

    cg = ComputationGraph([cost, error_rate])

    def pipeline(dataset, n_examples):
        # Sequential minibatches, flattened, then remapped by _meanize.
        raw = DataStream(dataset,
                         iteration_scheme=SequentialScheme(n_examples,
                                                           batch_size))
        return Mapping(Flatten(raw), _meanize(n_steps, flag, ksize))

    trainstream = pipeline(MNIST("train", subset=slice(0, 50000)), 50000)
    validstream = pipeline(MNIST("train", subset=slice(50000, 60000)),
                           10000)
    teststream = pipeline(MNIST("test"), 10000)

    algorithm = GradientDescent(
        cost=cost, params=cg.parameters,
        step_rule=CompositeRule([Adam(), StepClipping(100)]))

    main_loop = MainLoop(
        algorithm,
        trainstream,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=num_epochs),
            DataStreamMonitoring([cost, error_rate], validstream,
                                 prefix="test"),
            DataStreamMonitoringAndSaving(
                [cost, error_rate], teststream,
                [in_proj, readout, recurrent],
                'best_' + save_to + '.pkl',
                cost_name=error_rate.name,
                after_epoch=True,
                prefix='valid'),
            TrainingDataMonitoring(
                [cost, aggregation.mean(algorithm.total_gradient_norm)],
                prefix="train",
                after_epoch=True),
            Printing()])
    main_loop.run()
def apply(self, input_labeled, target_labeled, input_unlabeled):
    """Build the ladder-network training graph.

    Runs a clean and a corrupted encoder pass over the concatenated
    labeled+unlabeled minibatch, then the denoising decoder, and
    assembles the supervised and per-layer denoising costs.

    Side effects: populates self.costs, self.act, self.error,
    self.layer_dims, self.lr and the labeled/unlabeled slicing lambdas.

    Parameters (symbolic Theano variables):
      input_labeled   -- inputs of the labeled part of the batch
      target_labeled  -- integer class targets for the labeled part
      input_unlabeled -- inputs of the unlabeled part of the batch
    """
    self.layer_counter = 0
    input_dim = self.p.encoder_layers[0]
    layers = self.layers
    # Store the dimension tuples in the same order as layers.
    self.layer_dims = {0: input_dim}
    self.lr = self.shared(self.default_lr, 'learning_rate', role=None)
    self.costs = costs = AttributeDict()
    self.costs.denois = AttributeDict()
    self.act = AttributeDict()
    self.error = AttributeDict()
    top = len(layers) - 1
    # Labeled examples come first in the concatenated batch; N is the
    # symbolic split point between labeled and unlabeled rows.
    N = input_labeled.shape[0]
    self.join = lambda l, u: T.concatenate([l, u], axis=0)
    self.labeled = lambda x: x[:N] if x is not None else x
    self.unlabeled = lambda x: x[N:] if x is not None else x
    self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))
    input_concat = self.join(input_labeled, input_unlabeled)

    def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
        # Forward pass through all encoder layers, optionally adding
        # Gaussian corruption to the input and to each layer.
        # NOTE(review): mutable default `noise_std=[]` - it is only read
        # here, but replacing it with None would be safer.
        h = input_
        logger.info(' 0: noise %g' % input_noise_std)
        if input_noise_std > 0.:
            h = h + self.noise_like(h) * input_noise_std
        d = AttributeDict()
        d.unlabeled = self.new_activation_dict()
        d.labeled = self.new_activation_dict()
        d.labeled.z[0] = self.labeled(h)
        d.unlabeled.z[0] = self.unlabeled(h)
        prev_dim = input_dim
        for i, (spec, _, act_f) in layers[1:]:
            d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
            # Per-layer corruption noise; layers past the list get 0.
            noise = noise_std[i] if i < len(noise_std) else 0.
            curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f,
                                          path_name=path_name,
                                          noise_std=noise)
            # Layer dims must agree between clean and corrupted passes.
            assert self.layer_dims.get(i) in (None, curr_dim)
            self.layer_dims[i] = curr_dim
            d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
            d.unlabeled.s[i] = s
            d.unlabeled.m[i] = m
            prev_dim = curr_dim
        # Post-activation of the top layer (reuses loop variable `i`).
        # NOTE(review): after-loop placement reconstructed from the
        # collapsed layout - confirm against the upstream source.
        d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
        return d

    # Clean, supervised
    logger.info('Encoder: clean, labeled')
    clean = self.act.clean = encoder(input_concat, 'clean')
    # Corrupted, supervised
    logger.info('Encoder: corr, labeled')
    corr = self.act.corr = encoder(input_concat, 'corr',
                                   input_noise_std=self.p.super_noise_std,
                                   noise_std=self.p.f_local_noise_std)
    est = self.act.est = self.new_activation_dict()

    # Decoder path in opposite order
    logger.info('Decoder: z_corr -> z_est')
    for i, ((_, spec), l_type, act_f) in layers[::-1]:
        z_corr = corr.unlabeled.z[i]
        z_clean = clean.unlabeled.z[i]
        z_clean_s = clean.unlabeled.s.get(i)
        z_clean_m = clean.unlabeled.m.get(i)
        fspec = layers[i + 1][1][0] if len(layers) > i + 1 else (None, None)
        if i == top:
            # Top layer: the vertical input comes from the corrupted
            # encoder's output.
            ver = corr.unlabeled.h[i]
            ver_dim = self.layer_dims[i]
            top_g = True
        else:
            # Lower layers: the vertical input is the estimate from the
            # layer above (may be None).
            ver = est.z.get(i + 1)
            ver_dim = self.layer_dims.get(i + 1)
            top_g = False
        z_est = self.g(z_lat=z_corr, z_ver=ver, in_dims=ver_dim,
                       out_dims=self.layer_dims[i], l_type=l_type, num=i,
                       fspec=fspec, top_g=top_g)
        if z_est is not None:
            # Denoising cost
            if z_clean_s:
                # Normalize the estimate with the clean path's batch
                # statistics before comparing with the clean z.
                z_est_norm = (z_est - z_clean_m) / z_clean_s
            else:
                z_est_norm = z_est
            se = SquaredError('denois' + str(i))
            # Mean squared error, normalized by the layer's size.
            costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                       z_clean.flatten(2)) \
                / np.prod(self.layer_dims[i], dtype=floatX)
            costs.denois[i].name = 'denois' + str(i)
            denois_print = 'denois %.2f' % self.p.denoising_cost_x[i]
        else:
            denois_print = ''
        # Store references for later use
        est.h[i] = self.apply_act(z_est, act_f)
        est.z[i] = z_est
        est.s[i] = None
        est.m[i] = None
        logger.info(' g%d: %10s, %s, dim %s -> %s' % (
            i, l_type, denois_print,
            self.layer_dims.get(i + 1),
            self.layer_dims.get(i)))

    # Costs
    y = target_labeled.flatten()
    costs.class_clean = CategoricalCrossEntropy().apply(
        y, clean.labeled.h[top])
    costs.class_clean.name = 'cost_class_clean'
    costs.class_corr = CategoricalCrossEntropy().apply(
        y, corr.labeled.h[top])
    costs.class_corr.name = 'cost_class_corr'
    # This will be used for training: corrupted-path classification cost
    # plus the weighted per-layer denoising costs.
    costs.total = costs.class_corr * 1.0
    for i in range(top + 1):
        if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0:
            costs.total += costs.denois[i] * self.p.denoising_cost_x[i]
    costs.total.name = 'cost_total'

    # Classification error, as a percentage, on the clean path.
    mr = MisclassificationRate()
    self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
    self.error.clean.name = 'error_rate_clean'
# --- Batch-normalization statistics + training algorithm setup ---------
# NOTE(review): fragment of a larger script; n5, M1..M5/S1..S5/a1..a5,
# normalization, stream_train, y and floatX are defined earlier in the
# file.  The collapsed source's indentation was reconstructed - confirm
# which statements belong inside the `for` loops.
probs = Softmax().apply(n5)
# (mean, std, activation) triples for the five normalized layers.
statistics_list=[(M1,S1,a1), (M2,S2,a2), (M3,S3,a3), (M4,S4,a4), (M5,S5,a5)]
# initialize_variables
# for variable (M,S) in variables:
#   compute M and S in the whole data.
if normalization == 'bn2':
    for m,s,var in statistics_list:
        # Aggregate each activation's mean/variance over the whole
        # training stream, then load them into the shared variables.
        var.tag.aggregation_scheme = MeanAndVariance(var, var.shape[0],
                                                     axis = 0)
        init_mn, init_var = DatasetEvaluator([var]).evaluate(stream_train)[var.name]
        m.set_value(init_mn.astype(floatX))
        s.set_value(sqrt(init_var).astype(floatX))
cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
cost.name = 'cost'
error_rate = MisclassificationRate().apply(y.flatten(), probs)
error_rate.name = 'error_rate'
cg = ComputationGraph([cost])
parameters = cg.parameters
# add gradient descent to M,S
if normalization == 'bn2':
    for m,s,var in statistics_list:
        parameters.extend([m,s])
algorithm = GradientDescent(
    cost=cost, parameters=parameters, step_rule=Adam(0.01))
def create_network(inputs=None, batch=batch_size):
    """Build the Gated PixelCNN graph; return (cost, cost_bits_dim).

    Relies on module-level configuration: dataset, img_dim, h, n_channel,
    n_layer, third_layer, MODE and batch_size.

    Parameters
    ----------
    inputs : TensorVariable or None
        Symbolic image batch; a fresh tensor4('features') is created
        when None.
    batch : int
        Batch size handed to the bricks.  NOTE(review): the default is
        the module-level batch_size, captured once at definition time.
    """
    if inputs is None:
        inputs = T.tensor4('features')
    x = T.cast(inputs,'float32')
    # Scale pixel values to [0, 1] except for the already-binary dataset.
    x = x / 255. if dataset != 'binarized_mnist' else x

    # First GatedPixelCNN layer: 7x7 filters, no residual connection.
    gated = GatedPixelCNN(
        name='gated_layer_0',
        filter_size=7,
        image_size=(img_dim,img_dim),
        num_filters=h*n_channel,
        num_channels=n_channel,
        batch_size=batch,
        weights_init=IsotropicGaussian(std=0.02, mean=0),
        biases_init=Constant(0.02),
        res=False
    )
    gated.initialize()
    # Vertical and horizontal stacks both start from the input image.
    x_v, x_h = gated.apply(x, x)

    # Stack of n_layer residual 3x3 gated layers.
    for i in range(n_layer):
        gated = GatedPixelCNN(
            name='gated_layer_{}'.format(i+1),
            filter_size=3,
            image_size=(img_dim,img_dim),
            num_channels=h*n_channel,
            batch_size=batch,
            weights_init=IsotropicGaussian(std=0.02, mean=0),
            biases_init=Constant(0.02),
            res=True
        )
        gated.initialize()
        x_v, x_h = gated.apply(x_v, x_h)

    # Output head: masked ('B'-type) convolutions on the horizontal stack.
    conv_list = []
    conv_list.extend([Rectifier(), ConvolutionalNoFlip((1,1), h*n_channel, mask_type='B', name='1x1_conv_1')])
    #conv_list.extend([Rectifier(), ConvolutionalNoFlip((1,1), h*n_channel, mask='B', name='1x1_conv_2')])
    conv_list.extend([Rectifier(), ConvolutionalNoFlip(*third_layer, mask_type='B', name='output_layer')])

    sequence = ConvolutionalSequence(
        conv_list,
        num_channels=h*n_channel,
        batch_size=batch,
        image_size=(img_dim,img_dim),
        border_mode='half',
        weights_init=IsotropicGaussian(std=0.02, mean=0),
        biases_init=Constant(0.02),
        tied_biases=False
    )
    sequence.initialize()
    x = sequence.apply(x_h)

    if MODE == '256ary':
        # 256-way output per pixel/channel: move the 256-value axis last
        # and flatten so each row is one categorical distribution.
        x = x.reshape((-1, 256, n_channel, img_dim, img_dim)).dimshuffle(0,2,3,4,1)
        x = x.reshape((-1, 256))
        x_hat = Softmax().apply(x)
        inp = T.cast(inputs, 'int64').flatten()
        # Cross-entropy against the integer pixel values, scaled by the
        # number of pixels per image.
        cost = CategoricalCrossEntropy().apply(inp, x_hat) * img_dim * img_dim
        cost_bits_dim = categorical_crossentropy(log_softmax(x), inp)
    else:
        # Binary case: per-pixel Bernoulli outputs.
        x_hat = Logistic().apply(x)
        cost = BinaryCrossEntropy().apply(inputs, x_hat) * img_dim * img_dim
        #cost = T.nnet.binary_crossentropy(x_hat, inputs)
        #cost = cost.sum() / inputs.shape[0]
        # Bits per dimension: base-2 Bernoulli log-likelihood, averaged.
        cost_bits_dim = -(inputs * T.log2(x_hat) + (1.0 - inputs) * T.log2(1.0
                        - x_hat)).mean()

    cost_bits_dim.name = "nnl_bits_dim"
    cost.name = 'loglikelihood_nat'
    return cost, cost_bits_dim