def apply(self, input_lb, input_un, target):
    """Build the ladder-network training graph.

    Concatenates the labeled and unlabeled batches, runs a clean and a
    corrupted encoder pass plus the denoising decoder, and stores the
    combined cost in ``self.costs`` and the percent error in ``self.error``.

    Parameters
    ----------
    input_lb : labeled input batch (labeled examples come first)
    input_un : unlabeled input batch
    target : labels for the labeled batch
    """
    batch_size = input_lb.shape[0]
    # The first `batch_size` rows of every joint tensor are the labeled part.
    get_labeled = lambda x: x[:batch_size] if x is not None else x
    # Renamed from `input` to avoid shadowing the builtin.
    joint_input = T.concatenate([input_lb, input_un], axis=0)
    self.layer_dims = {0: self.input_dim}
    self.lr = self.shared(self.default_lr, 'learning_rate', role=None)
    top = len(self.layers) - 1
    # Clean pass (no noise) and corrupted pass (per-layer noise).
    clean = self.encoder(joint_input, noise_std=[0])
    corr = self.encoder(joint_input, noise_std=self.noise_std)
    # Decoder estimates are not needed here; only the denoising costs are.
    _, costs = self.decoder(clean, corr, batch_size)

    # Costs
    y = target.flatten()
    costs.class_clean = CategoricalCrossEntropy().apply(
        y, get_labeled(clean.h[top]))
    costs.class_clean.name = 'CE_clean'
    costs.class_corr = CategoricalCrossEntropy().apply(
        y, get_labeled(corr.h[top]))
    costs.class_corr.name = 'CE_corr'
    # Supervised cost uses the corrupted pass; denoising costs are weighted
    # per layer by `denoising_cost_x`.
    costs.total = costs.class_corr * 1.0
    for i in range(len(self.layers)):
        costs.total += costs.denois[i] * self.denoising_cost_x[i]
    costs.total.name = 'Total_cost'
    self.costs = costs

    # Classification error, reported as a percentage of the clean pass.
    mr = MisclassificationRate()
    self.error = mr.apply(y, get_labeled(clean.h[top])) * np.float32(100.)
    self.error.name = 'Error_rate'
def __init__(self, config):
    """Build a small convnet classifier graph for 3x32x32 inputs.

    Exposes `self.X`, `self.Y`, `self.pred`, `self.cost` and `self.accur`
    as Theano variables for training/monitoring.
    """
    self.X = T.tensor4("features")
    cfg = config
    # conv/pool stack followed by a softmax classifier head.
    network = BrickSequence(
        input_dim=(3, 32, 32),
        bricks=[
            conv3(cfg['n_l1']),
            conv3(cfg['n_l2']),
            max_pool(),
            conv3(cfg['n_l3']),
            conv3(cfg['n_l4']),
            max_pool(),
            Flattener(),
            linear(cfg['n_l5']),
            Softmax(),
        ])
    network.initialize()
    self.pred = network.apply(self.X)

    self.Y = T.imatrix("targets")
    self.cost = CategoricalCrossEntropy().apply(self.Y.flatten(), self.pred)
    self.cost.name = "cost"
    # Accuracy = 1 - misclassification rate.
    self.accur = 1.0 - MisclassificationRate().apply(
        self.Y.flatten(), self.pred)
    self.accur.name = "accur"
def main(save_to, num_epochs):
    """Train a 784-100-10 Tanh/Softmax MLP on MNIST with plain SGD.

    Parameters
    ----------
    save_to : checkpoint path passed to `Checkpoint`.
    num_epochs : number of training epochs before `FinishAfter` stops.
    """
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    # Images arrive as (batch, ...) and are flattened to 784-d vectors.
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    # Exactly two weight matrices are expected (one per Linear brick).
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    # L2 weight decay on both layers.
    cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             Flatten(DataStream.default_stream(
                                 mnist_test,
                                 iteration_scheme=SequentialScheme(
                                     mnist_test.num_examples, 500)),
                                 which_sources=('features', )),
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]
    # Live plotting is optional; only enabled when blocks-extras is present.
    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[[
                     'test_final_cost',
                     'test_misclassificationrate_apply_error_rate'
                 ], ['train_total_gradient_norm']]))
    main_loop = MainLoop(algorithm,
                         Flatten(DataStream.default_stream(
                             mnist_train,
                             iteration_scheme=SequentialScheme(
                                 mnist_train.num_examples, 50)),
                             which_sources=('features', )),
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def build_model(images, labels):
    """Build conv + batch-normalized-MLP classifier and return the
    L2-regularized cross-entropy cost.

    Parameters
    ----------
    images : input image batch (4-D tensor, 160x160 expected by the conv stack)
    labels : integer class labels (flattened before the cost)
    """
    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3,3), 16, (160, 160))
    # NOTE(review): relies on the private _push_allocation_config so that
    # get_dim('output') is usable before full initialization — confirm this
    # is still supported by the Blocks version in use.
    bottom_conv_sequence._push_allocation_config()

    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    #top_mlp = MLP([Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')], [conv_out_dim, 1024, 10], weights_init=IsotropicGaussian(), biases_init=Constant(0))
    top_mlp = BatchNormalizedMLP([Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')], [conv_out_dim, 1024, 10], weights_init=IsotropicGaussian(), biases_init=Constant(0))

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([bottom_conv_sequence.apply, flattener.apply, top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()
    prediction = ss_seq.apply(images)
    cost_noreg = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    # add regularization
    selector = Selector([top_mlp])
    Ws = selector.get_parameters('W')
    # Parameter paths depend on the MLP brick's auto-generated name.
    mlp_brick_name = 'batchnormalizedmlp'
    W0 = Ws['/%s/linear_0.W' % mlp_brick_name]
    W1 = Ws['/%s/linear_1.W' % mlp_brick_name]

    # L2 penalty (mean, not sum) on both MLP weight matrices.
    cost = cost_noreg + .01 * (W0 ** 2).mean() + .01 * (W1 ** 2).mean()
    return cost
def build_model(images, labels):
    """Wire a convolutional front-end into a LeakyRectifier MLP head and
    return the categorical cross-entropy cost on `labels`."""
    # Bottom convolutional stack over 150x150 inputs.
    conv_stack = convolutional_sequence((3, 3), 64, (150, 150))
    conv_stack._push_allocation_config()

    # Bridge between the conv feature maps and the MLP.
    to_vector = Flattener()

    # Top MLP: two leaky-ReLU hidden layers and a softmax output.
    mlp_input_dim = numpy.prod(conv_stack.get_dim('output'))
    activations = [
        LeakyRectifier(name='non_linear_9'),
        LeakyRectifier(name='non_linear_10'),
        Softmax(name='non_linear_11'),
    ]
    head = MLP(activations,
               [mlp_input_dim, 2048, 612, 10],
               weights_init=IsotropicGaussian(),
               biases_init=Constant(1))

    # Compose conv -> flatten -> MLP into one feedforward pipeline.
    pipeline = FeedforwardSequence(
        [conv_stack.apply, to_vector.apply, head.apply])
    pipeline.push_initialization_config()
    pipeline.initialize()

    probs = pipeline.apply(images)
    return CategoricalCrossEntropy().apply(labels.flatten(), probs)
def setup_model():
    """Build the attention-LSTM classifier graph.

    Returns the (cost, error_rate) Theano variables.
    """
    # Input sequence, shape: T x B x F.
    features = T.tensor3('features')
    # Labels, shape: B.
    labels = T.lvector('targets')

    attention_model = LSTMAttention(input_dim=10000,
                                    dim=500,
                                    mlp_hidden_dims=[2000, 500, 4],
                                    batch_size=100,
                                    image_shape=(100, 100),
                                    patch_shape=(28, 28),
                                    weights_init=IsotropicGaussian(0.01),
                                    biases_init=Constant(0))
    attention_model.initialize()
    hidden, cells = attention_model.apply(features)

    # Classify from the final hidden state only.
    head = MLP([Rectifier(), Softmax()],
               [500, 100, 10],
               weights_init=IsotropicGaussian(0.01),
               biases_init=Constant(0))
    head.initialize()
    probs = head.apply(hidden[-1])

    cost = CategoricalCrossEntropy().apply(labels, probs)
    error_rate = MisclassificationRate().apply(labels, probs)
    return cost, error_rate
def __init__(self, split_idx, domain_classifier, **kwargs):
    """Set up the child bricks for the unsupervised domain cost.

    Parameters
    ----------
    split_idx : index at which batches are split into the two domains.
    domain_classifier : brick that predicts the domain label.
    """
    super(DomainUnsupervisedCost, self).__init__(**kwargs)
    splitter = BatchwiseSplit(split_idx)
    grl = GradientReversal()
    xent = CategoricalCrossEntropy()
    # Register children so Blocks manages allocation/initialization.
    self.children = [grl, domain_classifier, splitter, xent]
def apply(self, input_, target):
    """Run an MLP classifier over `input_` and record its probabilities
    and cross-entropy cost in `self.outputs`."""
    classifier = MLP(self.non_lins,
                     self.dims,
                     weights_init=IsotropicGaussian(0.01),
                     biases_init=Constant(0),
                     name=self.name)
    classifier.initialize()

    # Collapse everything but the batch axis before the MLP.
    flat_input = T.flatten(input_, outdim=2)
    probs = classifier.apply(flat_input)
    probs.name = 'probs'

    cost = CategoricalCrossEntropy().apply(target.flatten(), probs)
    cost.name = "CE"

    self.outputs = {'probs': probs, 'cost': cost}
def main(save_to, num_epochs):
    """Train a 784-100-10 Tanh/Softmax MLP on MNIST.

    NOTE(review): this uses a pre-1.0 Blocks API surface (`WEIGHTS` role,
    `SteepestDescent`, `SerializeMainLoop`, `after_every_epoch`,
    `MNIST("train")`); in later Blocks these became `WEIGHT`, `Scale`,
    `Checkpoint`, `after_epoch` and `MNIST(("train",))` — confirm against
    the pinned Blocks version before modernizing.
    """
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    # Exactly two weight matrices expected (one per Linear brick).
    W1, W2 = VariableFilter(roles=[WEIGHTS])(cg.variables)
    # L2 weight decay on both layers.
    cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    # NOTE(review): no `parameters=` argument — old GradientDescent
    # presumably inferred parameters from the cost; verify.
    algorithm = GradientDescent(cost=cost,
                                step_rule=SteepestDescent(learning_rate=0.1))
    main_loop = MainLoop(
        mlp,
        DataStream(mnist_train,
                   iteration_scheme=SequentialScheme(mnist_train.num_examples,
                                                     50)),
        algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=num_epochs),
            DataStreamMonitoring([cost, error_rate],
                                 DataStream(mnist_test,
                                            iteration_scheme=SequentialScheme(
                                                mnist_test.num_examples, 500)),
                                 prefix="test"),
            TrainingDataMonitoring([
                cost, error_rate,
                aggregation.mean(algorithm.total_gradient_norm)
            ],
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(save_to),
            Plot('MNIST example',
                 channels=[[
                     'test_final_cost',
                     'test_misclassificationrate_apply_error_rate'
                 ], ['train_total_gradient_norm']]),
            Printing()
        ])
    main_loop.run()
def main(save_to, num_epochs, batch_size):
    """Train a deep Tanh MLP on CIFAR-10 (3072-d flattened images) with Adam.

    Parameters
    ----------
    save_to : checkpoint path.
    num_epochs : number of epochs before stopping.
    batch_size : batch size for both train and validation streams.
    """
    mlp = MLP([Tanh(), Tanh(), Tanh(), Softmax()],
              [3072, 4096, 1024, 512, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tt.tensor4('features', dtype='float32')
    y = tt.vector('label', dtype='int32')
    # Flatten 3x32x32 images to 3072-d vectors.
    probs = mlp.apply(x.reshape((-1, 3072)))
    cost = CategoricalCrossEntropy().apply(y, probs)
    error_rate = MisclassificationRate().apply(y, probs)

    cg = ComputationGraph([cost])
    ws = VariableFilter(roles=[WEIGHT])(cg.variables)
    # L2 weight decay over all weight matrices.
    cost = cost + .00005 * sum(([(w**2).sum() for w in ws]))
    cost.name = 'final_cost'

    # NOTE(review): hard-coded absolute data/log paths limit portability.
    train_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10',
                                   is_train=True)
    valid_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10',
                                   is_train=False)
    train_stream = train_dataset.get_stream(batch_size)
    valid_stream = valid_dataset.get_stream(batch_size)

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Adam(learning_rate=0.001))
    extensions = [
        Timing(),
        LogExtension('/home/belohlavek/ALI/mlp.log'),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate], valid_stream, prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]
    main_loop = MainLoop(algorithm,
                         train_stream,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def setup_model():
    """Build the attention-LSTM classifier graph plus monitoring channels.

    Returns the `model` brick with `cost`, `h`, `c`, `location`, `scale`
    and `monitorings` attached as attributes.
    """
    # Input sequence, shape: T x B x F
    input_ = T.tensor3('features')
    # Labels, shape: B
    target = T.lvector('targets')
    model = LSTMAttention(dim=256,
                          mlp_hidden_dims=[256, 4],
                          batch_size=100,
                          image_shape=(64, 64),
                          patch_shape=(16, 16),
                          weights_init=Glorot(),
                          biases_init=Constant(0))
    model.initialize()
    h, c, location, scale = model.apply(input_)
    classifier = MLP([Rectifier(), Softmax()], [256 * 2, 200, 10],
                     weights_init=Glorot(),
                     biases_init=Constant(0))
    # Expose intermediate recurrent outputs for later inspection.
    model.h = h
    model.c = c
    model.location = location
    model.scale = scale
    classifier.initialize()

    # Classify from the final hidden and cell states concatenated.
    probabilities = classifier.apply(T.concatenate([h[-1], c[-1]], axis=1))
    cost = CategoricalCrossEntropy().apply(target, probabilities)
    error_rate = MisclassificationRate().apply(target, probabilities)
    model.cost = cost

    # Monitor the attended x-location and scale at the first, a middle and
    # the last timestep, averaged over the batch.
    location_x_0_avg = T.mean(location[0, :, 0])
    location_x_0_avg.name = 'location_x_0_avg'
    location_x_10_avg = T.mean(location[10, :, 0])
    location_x_10_avg.name = 'location_x_10_avg'
    # NOTE(review): named "_20" but indexes the last timestep (-1).
    location_x_20_avg = T.mean(location[-1, :, 0])
    location_x_20_avg.name = 'location_x_20_avg'
    scale_x_0_avg = T.mean(scale[0, :, 0])
    scale_x_0_avg.name = 'scale_x_0_avg'
    scale_x_10_avg = T.mean(scale[10, :, 0])
    scale_x_10_avg.name = 'scale_x_10_avg'
    scale_x_20_avg = T.mean(scale[-1, :, 0])
    scale_x_20_avg.name = 'scale_x_20_avg'
    monitorings = [
        error_rate, location_x_0_avg, location_x_10_avg, location_x_20_avg,
        scale_x_0_avg, scale_x_10_avg, scale_x_20_avg
    ]
    model.monitorings = monitorings

    return model
def test_dataset_evaluators():
    """DatasetEvaluator resets per-quantity state between duplicate
    quantities and matches CategoricalCrossEntropy's aggregate."""
    X = theano.tensor.vector('X')
    Y = theano.tensor.vector('Y')
    first = numpy.arange(1, 7, dtype=theano.config.floatX).reshape(3, 2)
    second = numpy.arange(11, 17, dtype=theano.config.floatX).reshape(3, 2)
    stream = IterableDataset(dict(X=first, Y=second)).get_example_stream()

    quantities = [
        CrossEntropy(requires=[X, Y], name="monitored_cross_entropy0"),
        # to test two same quantities and make sure that state will be reset
        CrossEntropy(requires=[X, Y], name="monitored_cross_entropy1"),
        CategoricalCrossEntropy().apply(X, Y),
    ]
    evaluator = DatasetEvaluator(quantities)
    results = evaluator.evaluate(stream)
    numpy.testing.assert_allclose(results['monitored_cross_entropy1'],
                                  results['categoricalcrossentropy_apply_cost'])
def test_softmax_vector():
    """Softmax + CategoricalCrossEntropy agrees with the numerically
    stable fused `categorical_cross_entropy` for integer targets."""
    logits = tensor.matrix('x')
    targets = tensor.lvector('y')

    naive_cost = CategoricalCrossEntropy().apply(
        targets, Softmax().apply(logits))
    stable_cost = Softmax().categorical_cross_entropy(targets, logits)

    f_naive = function([logits, targets], naive_cost)
    f_stable = function([logits, targets], stable_cost)

    num_samples = 100
    num_classes = 10
    rng = numpy.random.RandomState(1)
    x_val = rng.randn(num_samples, num_classes).astype(theano.config.floatX)
    y_val = rng.randint(low=0, high=num_classes, size=(num_samples))

    assert_allclose(f_naive(x_val, y_val), f_stable(x_val, y_val))
def test_softmax_matrix():
    """Softmax + CategoricalCrossEntropy agrees with the stable fused
    version when targets are full probability distributions."""
    logits = tensor.matrix('x')
    targets = tensor.matrix('y')

    naive_cost = CategoricalCrossEntropy().apply(
        targets, Softmax().apply(logits))
    stable_cost = Softmax().categorical_cross_entropy(targets, logits)

    f_naive = function([logits, targets], naive_cost)
    f_stable = function([logits, targets], stable_cost)

    num_samples = 2
    num_classes = 2
    rng = numpy.random.RandomState(1)
    x_val = rng.randn(num_samples, num_classes).astype(theano.config.floatX)
    # Random target distributions, normalized to sum to one per row.
    raw = rng.uniform(size=(num_samples, num_classes)).astype(
        theano.config.floatX)
    y_val = raw / numpy.expand_dims(raw.sum(axis=1), axis=1)

    assert_allclose(f_naive(x_val, y_val), f_stable(x_val, y_val), rtol=1e-5)
# Softmax output over the last pre-activation layer (n5 defined earlier).
probs = Softmax().apply(n5)
# Per-layer (mean, std, activation) triples; M*/S*/a* are defined earlier
# in the script — presumably shared variables plus the layer activations.
statistics_list=[(M1,S1,a1), (M2,S2,a2), (M3,S3,a3), (M4,S4,a4), (M5,S5,a5)]
# initialize_variables
# for variable (M,S) in variables:
# compute M and S in the whole data.
# For 'bn2' normalization, seed each layer's mean/std shared variables with
# dataset-wide statistics computed over the training stream.
if normalization == 'bn2':
    for m,s,var in statistics_list:
        var.tag.aggregation_scheme = MeanAndVariance(var, var.shape[0],
                                                     axis = 0)
        init_mn, init_var = DatasetEvaluator([var]).evaluate(stream_train)[var.name]
        m.set_value(init_mn.astype(floatX))
        s.set_value(sqrt(init_var).astype(floatX))

cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
cost.name = 'cost'
error_rate = MisclassificationRate().apply(y.flatten(), probs)
error_rate.name = 'error_rate'

cg = ComputationGraph([cost])
parameters = cg.parameters
# add gradient descent to M,S: the normalization statistics are trained
# alongside the regular parameters under 'bn2'.
if normalization == 'bn2':
    for m,s,var in statistics_list:
        parameters.extend([m,s])

algorithm = GradientDescent(
    cost=cost, parameters=parameters, step_rule=Adam(0.01))
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from fuel.transformers import Flatten

# Construct the model: a 784-100-10 Tanh/Softmax MLP for MNIST.
mlp = MLP(activations=[Tanh(), Softmax()],
          dims=[784, 100, 10],
          weights_init=IsotropicGaussian(0.01),
          biases_init=Constant(0))
mlp.initialize()

# Calculate the loss function
x = T.matrix('features')
y = T.lmatrix('targets')
y_hat = mlp.apply(x)
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
error_rate = MisclassificationRate().apply(y.flatten(), y_hat)

# load training data using Fuel
mnist_train = MNIST("train")
# NOTE(review): Flatten is applied without which_sources, so every source
# (including targets) is flattened — confirm this is intended.
train_stream = Flatten(
    DataStream.default_stream(dataset=mnist_train,
                              iteration_scheme=SequentialScheme(
                                  mnist_train.num_examples, 128)), )

# load testing data
mnist_test = MNIST("test")
test_stream = Flatten(
    DataStream.default_stream(dataset=mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 1024)), )
# NOTE(review): this function is visibly truncated/redacted — the middle of
# the body (model construction, cost, algorithm, and the opening of the
# Plot(...) call) is missing, and the plot-server URL credentials have been
# masked out ('http://*****:*****'). As given it is not syntactically valid;
# recover the original from version control before editing. Kept verbatim.
def main(): feature_maps = [20, 50] mlp_hiddens = [50] conv_sizes = [5, 5] pool_sizes = [3, 3] save_to = "DvC.pkl" batch_size = 500 image_size = (32, 32) output_size = 2 learningRate = 0.1 num_epochs = 10 num_batches = None host_plot = 'http://*****:*****@ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()), channels=[['valid_cost', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) model = Model(cost) main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
def main(save_to, num_epochs,
         feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, batch_size=500):
    """Train a LeNet-style convnet on MNIST with plain SGD.

    Parameters
    ----------
    save_to : checkpoint path.
    num_epochs : number of epochs before stopping.
    feature_maps, mlp_hiddens, conv_sizes, pool_sizes : architecture knobs;
        defaults follow the classic LeNet configuration.
    batch_size : minibatch size for both streams.
    """
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    # NOTE(review): under Python 3, zip() returns a one-shot iterator —
    # confirm LeNet materializes filter_sizes/pooling_sizes, or wrap in
    # list(...). This code presumably predates Python 3 support.
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        logging.info("Layer {} dim: {} {} {}".format(i,
                                                     *layer.get_dim('output')))

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = named_copy(CategoricalCrossEntropy().apply(y.flatten(), probs),
                      'cost')
    error_rate = named_copy(MisclassificationRate().apply(y.flatten(), probs),
                            'error_rate')

    cg = ComputationGraph([cost, error_rate])

    mnist_train = MNIST(("train", ))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test", ))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             mnist_test_stream,
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         mnist_train_stream,
                         model=model,
                         extensions=extensions)
    main_loop.run()
def main(save_to, num_epochs,
         regularization=0.0003, subset=None, num_batches=None,
         histogram=None, resume=False):
    """Train LeNet-5 on MNIST with AdaDelta, confusion-matrix monitoring
    and optional per-component attribution histograms.

    Parameters
    ----------
    save_to : checkpoint path.
    num_epochs : epoch budget.
    regularization : L2 weight-decay coefficient.
    subset : if given, train on a centered slice of this many examples.
    num_batches : optional batch budget for FinishAfter.
    histogram : if given, collect attributions and save them to this file.
    resume : reload state from `save_to` before training.
    """
    batch_size = 500
    output_size = 10
    convnet = create_lenet_5()
    layers = convnet.layers

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    # Per-class cross-entropy components, used by AttributionExtension.
    components = (ComponentwiseCrossEntropy().apply(y.flatten(), probs)
                  .copy(name='components'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))
    confusion = (ConfusionMatrix().apply(y.flatten(), probs)
                 .copy(name='confusion'))
    # Confusion counts are summed (not averaged) across batches.
    confusion.tag.aggregation_scheme = Sum(confusion)

    cg = ComputationGraph([cost, error_rate, components])

    # Apply regularization to the cost
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    l2_norm = sum([(W ** 2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    cost = cost + regularization * l2_norm
    cost.name = 'cost_with_regularization'

    if subset:
        # Take `subset` examples centered on the middle of the 60k train set.
        start = 30000 - subset // 2
        mnist_train = MNIST(("train",), subset=slice(start, start+subset))
    else:
        mnist_train = MNIST(("train",))
    mnist_train_stream = DataStream.default_stream(
        mnist_train, iteration_scheme=ShuffledScheme(
            mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test",))
    mnist_test_stream = DataStream.default_stream(
        mnist_test, iteration_scheme=ShuffledScheme(
            mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=AdaDelta(decay_rate=0.99))

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  DataStreamMonitoring(
                      [cost, error_rate, confusion],
                      mnist_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate, l2_norm,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(cost)

    main_loop = MainLoop(
        algorithm,
        mnist_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def create_model(self, symbols_num = 500):
    """Build the attention-sum reading-comprehension model graph.

    Encodes the context document and the question with bidirectional GRUs,
    computes attention of the query over context words, and scores the
    candidate answers either by a weighted-embedding softmax or by the
    attention-sum mechanism.

    Parameters
    ----------
    symbols_num : vocabulary size for the embedding lookup table.

    Returns
    -------
    Tuple of (cost, accuracy, attention, y_hat, and the input Theano
    variables) used by the training/monitoring code.
    """
    # Hyperparameters

    # The dimension of the hidden state of the GRUs in each direction.
    hidden_states = self.args.encoder_hidden_dims
    # Dimension of the word-embedding space
    embedding_dims = self.args.source_embeddings_dim

    ###################
    # Declaration of the Theano variables that come from the data stream
    ###################

    # The context document.
    context_bt = tt.lmatrix('context')
    # Context document mask used to distinguish real symbols from the
    # sequence and padding symbols that are at the end
    context_mask_bt = tt.matrix('context_mask')

    # The question
    question_bt = tt.lmatrix('question')
    question_mask_bt = tt.matrix('question_mask')

    # The correct answer
    y = tt.lmatrix('answer')
    y = y[:,0]  # originally answers are in a 2d matrix, here we convert it to a vector

    # The candidates among which the answer is selected
    candidates_bi = tt.lmatrix("candidates")
    candidates_bi_mask = tt.matrix("candidates_mask")

    ###################
    # Network's components
    ###################

    # Lookup table with randomly initialized word embeddings
    lookup = LookupTable(symbols_num, embedding_dims,
                         weights_init=Uniform(width=0.2))

    # bidirectional encoder that translates context
    context_encoder = self.create_bidi_encoder("context_encoder",
                                               embedding_dims, hidden_states)

    # bidirectional encoder for question
    question_encoder = self.create_bidi_encoder("question_encoder",
                                                embedding_dims, hidden_states)

    # Initialize the components (where not done upon creation)
    lookup.initialize()

    ###################
    # Wiring the components together
    #
    # Where present, the 3 letters at the end of the variable name identify
    # its dimensions:
    # b ... position of the example within the batch
    # t ... position of the word within the document/question
    # f ... features of the embedding vector
    ###################

    ### Read the context document
    # Map token indices to word embeddings
    context_embedding_tbf = lookup.apply(context_bt.T)

    # Read the embedded context document using the bidirectional GRU and
    # produce the contextual embedding of each word
    memory_encoded_btf = context_encoder.apply(context_embedding_tbf,
                                               context_mask_bt.T).dimshuffle(1,0,2)
    memory_encoded_btf.name = "memory_encoded_btf"

    ### Correspondingly, read the query
    x_embedded_tbf = lookup.apply(question_bt.T)
    x_encoded_btf = question_encoder.apply(x_embedded_tbf,
                                           question_mask_bt.T).dimshuffle(1,0,2)
    # The query encoding is a concatenation of the final states of the
    # forward and backward GRU encoder
    x_forward_encoded_bf = x_encoded_btf[:,-1,0:hidden_states]
    x_backward_encoded_bf = x_encoded_btf[:,0,hidden_states:hidden_states*2]
    query_representation_bf = tt.concatenate([x_forward_encoded_bf,
                                              x_backward_encoded_bf],axis=1)

    # Compute the attention on each word in the context as a dot product of
    # its contextual embedding and the query
    mem_attention_presoft_bt = tt.batched_dot(
        query_representation_bf, memory_encoded_btf.dimshuffle(0,2,1))

    # TODO is this pre-masking necessary?
    mem_attention_presoft_masked_bt = tt.mul(mem_attention_presoft_bt,
                                             context_mask_bt)

    # Normalize the attention using softmax
    mem_attention_bt = SoftmaxWithMask(name="memory_query_softmax").apply(
        mem_attention_presoft_masked_bt,context_mask_bt)

    if self.args.weighted_att:
        # compute weighted attention over original word vectors
        att_weighted_responses_bf = theano.tensor.batched_dot(
            mem_attention_bt, context_embedding_tbf.dimshuffle(1,0,2))

        # compare desired response to all candidate responses
        # select relevant candidate answer words
        candidates_embeddings_bfi = lookup.apply(candidates_bi).dimshuffle(0,2,1)

        # convert it to output symbol probabilities
        y_hat_presoft = tt.batched_dot(att_weighted_responses_bf,
                                       candidates_embeddings_bfi)
        y_hat = SoftmaxWithMask(name="output_softmax").apply(
            y_hat_presoft,candidates_bi_mask)
    else:
        # Sum the attention of each candidate word across the whole context
        # document, this is the key innovation of the model
        # TODO: Get rid of sentence-by-sentence processing?
        # TODO: Rewrite into matrix notation instead of scans?

        # Sum the attention mass of all occurrences of one word.
        def sum_prob_of_word(word_ix, sentence_ixs, sentence_attention_probs):
            word_ixs_in_sentence = tt.eq(sentence_ixs,word_ix).nonzero()[0]
            return sentence_attention_probs[word_ixs_in_sentence].sum()

        # Attention-sum for every candidate of a single example.
        def sum_probs_single_sentence(candidate_indices_i, sentence_ixs_t,
                                      sentence_attention_probs_t):
            result, updates = theano.scan(
                fn=sum_prob_of_word,
                sequences=[candidate_indices_i],
                non_sequences=[sentence_ixs_t, sentence_attention_probs_t])
            return result

        # Vectorize over the batch dimension.
        def sum_probs_batch(candidate_indices_bt,sentence_ixs_bt,
                            sentence_attention_probs_bt):
            result, updates = theano.scan(
                fn=sum_probs_single_sentence,
                sequences=[candidate_indices_bt, sentence_ixs_bt,
                           sentence_attention_probs_bt],
                non_sequences=None)
            return result

        # Sum the attention of each candidate word across the whole context
        # document
        y_hat = sum_probs_batch(candidates_bi, context_bt, mem_attention_bt)
    y_hat.name = "y_hat"

    # We use the convention that ground truth is always at index 0, so the
    # following are the target answers
    y = y.zeros_like()

    # We use Cross Entropy as the training objective
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost.name = "cost"

    predicted_response_index = tt.argmax(y_hat,axis=1)
    accuracy = tt.eq(y,predicted_response_index).mean()
    accuracy.name = "accuracy"

    return cost, accuracy, mem_attention_bt, y_hat, context_bt, candidates_bi, candidates_bi_mask, y, context_mask_bt, question_bt, question_mask_bt
########## hyper parameters########################################### # We push initialization config to set different initialization schemes convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=0.2) convnet.layers[1].weights_init = Uniform(width=0.2) convnet.layers[2].weights_init = Uniform(width=0.2) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=0.2) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=0.2) convnet.initialize() ######################################################### #Generate output and error signal predict = convnet.apply(x) cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost') error = MisclassificationRate().apply(y.flatten(), predict) #Little trick to plot the error rate in two different plots (We can't use two time the same data in the plot for a unknow reason) error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) ########### ALGORITHM of training############# algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam(learning_rate=0.0005)) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid"),
def main(save_to, num_epochs, weight_decay=0.0001, noise_pressure=0,
         subset=None, num_batches=None, batch_size=None,
         histogram=None, resume=False):
    """Train a noisy residual convnet on CIFAR-10.

    Builds separate test and training graphs (the training graph runs under
    batch normalization and injected noise), regularizes with L2 weight
    decay plus a "nit" (noise information) penalty, and trains with
    momentum SGD under several epoch-based schedules.

    Parameters
    ----------
    save_to : checkpoint path template (``.%d`` is stripped for plot names).
    num_epochs / num_batches : training budget.
    weight_decay : L2 coefficient on non-mask weights.
    noise_pressure : coefficient of the nit regularization term.
    batch_size : training minibatch size.
    histogram : if given, collect attributions and save them to this file.
    resume : reload training state before starting.
    """
    output_size = 10

    prior_noise_level = -10
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5,
                                             dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True,
                             tied_sigma=True, noise_rate=noise_rate,
                             prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Test graph: plain forward pass (no training noise, no batch-norm
    # training statistics).
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
                 .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                       .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                      .copy(name='confusion'))
    # Confusion counts are summed (not averaged) across batches.
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Training graph: forward pass with batch normalization in training
    # mode and noise injection enabled.
    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                  .copy(name='cost'))
    # Per-class components, used by the optional AttributionExtension.
    train_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(
        y.flatten(), train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost, train_error_rate,
                                 train_components])

    # Exponential moving average of batch-norm population statistics.
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                     for p, m in population_updates]

    # Coefficient of the nit penalty, shared so it can be annealed.
    nit_penalty = theano.shared(numpy.asarray(noise_pressure,
                                              dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate(
        [n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate(
        [n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost. Mask parameters get a separate
    # (much smaller) learning rate; they are excluded from weight decay.
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
        train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
                       if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # Test version of the cost: L2 only (no nit term).
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
            which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test, iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that reduces the learning rate of noise masks.
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),   # Warm up with 0.01 learning rate
                      (50, 0.1),   # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                  Plot('Training performance for ' + exp_name,
                       channels=[
                           ['train_cost_with_regularization',
                            'train_cost_without_regularization',
                            'train_nit_regularization',
                            'train_l2_regularization'],
                           ['train_error_rate'],
                           ['train_total_gradient_norm'],
                           ['train_mean_log_sigma'],
                       ],
                       every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                       channels=[[
                           'train_error_rate',
                           'test_error_rate',
                       ]],
                       after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        # Fixed: previously referenced an undefined `cg`; the training
        # graph here is `train_cg`.
        attribution = AttributionExtension(
            components=train_components,
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def main(save_to, model, train, test, num_epochs, input_size = (150,150), learning_rate=0.01,
         batch_size=50, num_batches=None, flatten_stream=False):
    """Train `model` on an image dataset with plain SGD and Blocks' MainLoop.

    Parameters
    ----------
    save_to : str
        Path where the trained model checkpoint is written.
    model : brick
        An already-initialised brick (works with a convnet or an MLP);
        only its ``apply`` method is used here.
    train, test : fuel datasets
        Must expose ``num_examples`` and an ``image_features`` /
        ``targets`` pair of sources.
    num_epochs : int
        Stop after this many epochs (see ``FinishAfter`` below).
    input_size : tuple of int
        Square shape images are resized to (before flattening, if
        `flatten_stream` is True).
    learning_rate : float
        Step size for the ``Scale`` step rule.
    batch_size : int
        Minibatch size for both the train and test streams.
    num_batches : int or None
        Optional batch-count stopping criterion (``FinishAfter``).
    flatten_stream : bool
        If True, use a 2-D ``matrix`` input and flatten the image stream
        (MLP-style input); otherwise keep a 4-D ``tensor4`` (convnet input).
    """
    # Choose the symbolic input type to match whether the stream is flattened.
    if flatten_stream :
        x = tensor.matrix('image_features')
    else :
        x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    #Data augmentation
    #insert data augmentation here

    #Generating stream
    train_stream = DataStream.default_stream(
        train,
        iteration_scheme=ShuffledScheme(train.num_examples, batch_size)
    )
    test_stream = DataStream.default_stream(
        test,
        iteration_scheme=ShuffledScheme(test.num_examples, batch_size)
    )

    #Reshaping procedure
    #Add a crop option in scikitresize so that the image is not deformed
    #Resize to desired square shape
    train_stream = ScikitResize(train_stream, input_size, which_sources=('image_features',))
    test_stream = ScikitResize(test_stream, input_size, which_sources=('image_features',))

    #Flattening the stream (only for MLP-style models)
    if flatten_stream is True:
        train_stream = Flatten(train_stream, which_sources=('image_features',))
        test_stream = Flatten(test_stream, which_sources=('image_features',))

    # Apply input to model
    probs = model.apply(x)

    #Defining cost and various indices to watch
    #print(probs)
    #cost = SquaredError().apply(y.flatten(),probs)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')
    error_rate = MisclassificationRate().apply(y.flatten(), probs).copy(
        name='error_rate')

    #Building Computation Graph
    cg = ComputationGraph([cost, error_rate])

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=learning_rate))

    #Defining extensions
    # Monitoring cadences: train stats every 5 batches, test stats every 25.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
                  TrainingDataMonitoring([cost, error_rate,aggregation.mean(algorithm.total_gradient_norm)],
                                         prefix="train", every_n_batches=5),
                  DataStreamMonitoring([cost, error_rate],test_stream,prefix="test",
                                       every_n_batches=25),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=5)]
    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.

    # NOTE: rebinds the `model` parameter — from here on `model` is the
    # Blocks `Model` wrapper around the cost, not the input brick.
    model = Model(cost)

    ########### Loading images #####################
    main_loop = MainLoop(
        algorithm,
        train_stream,
        model=model,
        extensions=extensions)
    main_loop.run()
def start(self):
    """Build, train and evaluate a phoneme-recognition network (Theano/Blocks).

    Pipeline: reshape the flat feature matrix into (time, BNUM, features)
    sequences, stack Linear+activation layers, train with cross-entropy on
    48 phoneme classes, then map predictions down to 39 classes, decode the
    held-out test set and write two Kaggle-style submission files
    (``hw1_outz.txt``, ``hw2_outz.txt``).
    """
    # Symbolic inputs: a (frames, features) float matrix and int targets.
    # NOTE(review): T.matrix's second positional arg is the dtype, so
    # config.floatX is being passed as dtype here — verify intent.
    xx = T.matrix('features', config.floatX)
    yy = T.imatrix('targets')
    # Truncate to a multiple of BNUM, then fold into BNUM parallel
    # sequences and put time first: (zm//BNUM, BNUM, features).
    zm = BNUM*(xx.shape[0]//BNUM)
    x = xx[:zm].reshape((BNUM, zm//BNUM, xx.shape[1])).dimshuffle(1, 0, 2)
    y = yy[:zm].reshape((BNUM, zm//BNUM)).dimshuffle(1, 0)
    # x = xx[:zm].reshape((zm//16, 16, xx.shape[1]))
    # y = yy[:zm].reshape((zm//16, 16))

    # Layer sizes; NUMS holds per-layer output multipliers (LSTM needs x4
    # for its gates — see lllistool below, which mutates NUMS in place).
    DIMS = [108*5, 200, 200, 200, LABEL]
    NUMS = [1, 1, 1, 1, 1]
    # DIMS = [108*5, 48]
    # NUMS = [1, 1]
    FUNCS = [
        Rectifier,
        Rectifier,
        Rectifier,
        # Rectifier,
        # Rectifier,
        # Maxout(num_pieces=5),
        # Maxout(num_pieces=5),
        # Maxout(num_pieces=5),
        # SimpleRecurrent,
        # SimpleRecurrent,
        # SimpleRecurrent,
        # LSTM,
        # LSTM,
        # LSTM,
        # SequenceGenerator,
        # Softmax,
        None,
    ]

    def lllistool(i, inp, func):
        """Build layer i: a Linear projection followed by `func`.

        Returns the layer's symbolic output.  Side effect: multiplies
        NUMS[i+1] by 4 when `func` is LSTM (gate concatenation).
        """
        if func == LSTM:
            NUMS[i+1] *= 4
        # Fan-in used for the init std; recurrent layers also see their
        # own state, hence the larger sdim.
        sdim = DIMS[i]
        if func == SimpleRecurrent or func == LSTM:
            sdim = DIMS[i] + DIMS[i+1]
        l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1],
                   weights_init=IsotropicGaussian(std=sdim**(-0.5)),
                   biases_init=IsotropicGaussian(std=sdim**(-0.5)),
                   name='Lin{}'.format(i))
        l.initialize()
        if func == SimpleRecurrent:
            gong = func(dim=DIMS[i+1], activation=Rectifier(),
                        weights_init=IsotropicGaussian(std=sdim**(-0.5)))
            gong.initialize()
            ret = gong.apply(l.apply(inp))
        elif func == LSTM:
            gong = func(dim=DIMS[i+1], activation=Tanh(),
                        weights_init=IsotropicGaussian(std=sdim**(-0.5)))
            gong.initialize()
            print(inp)
            # Zero initial states/cells, one per sequence in the batch.
            ret, _ = gong.apply(
                l.apply(inp),
                T.zeros((inp.shape[1], DIMS[i+1])),
                T.zeros((inp.shape[1], DIMS[i+1])),
            )
        elif func == SequenceGenerator:
            # Dead branch: builds a generator but returns None.
            gong = func(
                readout=None,
                transition=SimpleRecurrent(dim=100, activation=Rectifier(),
                                           weights_init=IsotropicGaussian(std=0.1)))
            ret = None
        elif func == None:
            # Final layer: linear only (softmax applied later on the cost path).
            ret = l.apply(inp)
        else:
            gong = func()
            ret = gong.apply(l.apply(inp))
        return ret

    # Stack the layers.
    oup = x
    for i in range(len(DIMS)-1):
        oup = lllistool(i, oup, FUNCS[i])
    y_hat = oup

    # Flattened views of targets/predictions, in both (time-major) orders.
    y_rsp = y.reshape((y.shape[0]*y.shape[1],))
    y_dsf_rsp = y.dimshuffle(1, 0).reshape((y.shape[0]*y.shape[1],))
    yh_rsp = y_hat.reshape((y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
    yh_dsf_rsp = y_hat.dimshuffle(1, 0, 2).reshape((y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
    sfmx = Softmax().apply(yh_rsp)

    # cost = CategoricalCrossEntropy().apply(y, y_hat).astype(config.floatX)
    # j, wlh = Yimumu(y_hat, y)
    # cost = CategoricalCrossEntropy().apply(y_rsp, sfmx) + j
    cost = CategoricalCrossEntropy().apply(y_rsp, sfmx)
    # cost_p = cost_p.astype(config.floatX)
    # cost = CTC_cost(y, y_hat)
    cost = cost.astype(config.floatX)
    cg = ComputationGraph(cost)
    # cg_p = ComputationGraph(cost_p)
    # Keep a handle on the un-modified graph for pickling at the end.
    orig_cg = cg
    ips = VariableFilter(roles=[INPUT])(cg.variables)
    ops = VariableFilter(roles=[OUTPUT])(cg.variables)
    # print(ips, ops)
    # cg = apply_dropout(cg, ips[0:2:1], 0.2)
    # cg = apply_dropout(cg, ips[2:-2:1], 0.5)
    # cost = cg.outputs[0].astype(config.floatX)
    cost.name = 'cost'
    # Lookup table mapping each of the 48 phoneme ids to its 39-class id.
    mps = theano.shared(np.array([ph2id(ph48239(id2ph(t))) for t in range(48)]))
    # yh_dsf_rsp = theano.printing.Print('YapYapYap')(yh_dsf_rsp)
    # z_hat = T.argmax(yh_dsf_rsp[:,:-1], axis=1)
    z_hat = T.argmax(yh_dsf_rsp, axis=1)
    # z_hat = theano.printing.Print('Yap')(z_hat)
    # z_hat = Yimumu_Decode()(y_hat, wlh)
    z_hat_hat = CTC_Decode()(y_hat)
    # Map 48-class sequences to 39-class via the shared lookup table.
    y39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[y_dsf_rsp])
    y_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[z_hat])
    # NOTE(review): z_hat_hat (the CTC decode) is computed but unused —
    # y_hat_hat39 aliases the argmax decode instead.
    y_hat_hat39 = y_hat39
    # y_hat_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[z_hat_hat])
    # trm = TrimOp()(y_hat_hat39)
    # trm = trm[1:1+trm[0]]
    # trm = theano.printing.Print('Trm')(trm)
    # NOTE(review): lost01/lost23 (and edit01/edit23) are the same
    # expression; they differ only in the channel name used for plotting.
    lost01 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
    lost01.name = '0/1 loss'
    lost23 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
    lost23.name = '2/3 loss'
    edit01 = EditDistance()(y39, y_hat_hat39).astype(config.floatX) #+ T.sum(trm) * 1E-10
    # edit01 = edit01.astype(config.floatX)
    edit01.name = '0/1 edit'
    edit23 = EditDistance()(y39, y_hat_hat39).astype(config.floatX)
    edit23.name = '2/3 edit'
    Ws = cg.parameters
    # Ws = Ws + [wlh]
    print(list(Ws))
    # L2 norm of all parameters, monitored as 'norms'.
    norms = sum(w.norm(2) for w in Ws)
    norms = norms.astype(config.floatX)
    norms.name = 'norms'

    # Datasets: first 100k training frames, full validation set, in memory.
    path = pjoin(PATH['fuel'], 'train_train.hdf5')
    data = H5PYDataset(path, which_set='train', load_in_memory=True, subset=slice(0, 100000))
    # data = H5PYDataset(path, which_set='train', load_in_memory=True)
    data_v = H5PYDataset(pjoin(PATH['fuel'], 'train_validate.hdf5'),
                         which_set='validate', load_in_memory=True)
    num = data.num_examples
    # Batch size SLEN*BNUM so each batch folds into BNUM length-SLEN sequences.
    data_stream = DataStream(data, iteration_scheme=ShuffledScheme(
        num, batch_size=SLEN*BNUM))
    data_stream_v = DataStream(data_v, iteration_scheme=SequentialScheme(
        data_v.num_examples, batch_size=SLEN*BNUM))
    algo = GradientDescent(cost=cost, params=Ws, step_rule=CompositeRule([
        Momentum(0.005, 0.9)
        # AdaDelta()
    ]))
    # algo_p = GradientDescent(cost=cost_p, params=cg_p.parameters, step_rule=CompositeRule([
    #     Momentum(0.01, 0.9)
    #     # AdaDelta()
    # ]))
    monitor = DataStreamMonitoring(
        variables=[cost, lost01, edit01, norms],
        data_stream=data_stream)
    monitor_v = DataStreamMonitoring(
        variables=[lost23, edit23],
        data_stream=data_stream_v)
    plt = Plot('AlpYap', channels=[['0/1 loss', '2/3 loss'],
                                   ['0/1 edit', '2/3 edit']],
               after_epoch=True)
    # main_loop_p = MainLoop(data_stream = data_stream,
    #                        algorithm=algo_p,
    #                        extensions=[monitor, monitor_v, FinishAfter(after_n_epochs=10), Printing(), plt])
    # main_loop_p.run()
    main_loop = MainLoop(data_stream = data_stream,
                         algorithm=algo,
                         extensions=[monitor, monitor_v,
                                     FinishAfter(after_n_epochs=2000),
                                     Printing(), plt])
    main_loop.run()

    # Persist the trained computation graph.
    pfile = open('zzz.pkl', 'wb')
    pickle.dump(orig_cg, pfile)
    # pickle.dump(wlh, pfile)
    pfile.close()

    ################
    # Test-set forward pass, in chunks of 10000 frames.
    # NOTE(review): range(19) assumes the test features hold at most
    # 190000 frames — confirm against the dataset size.
    test_feat = np.load(pjoin(PATH['numpy'], 'train_test_features.npy')).astype(config.floatX)
    func = theano.function([xx], y_hat.astype(config.floatX))
    test_hat = []
    for i in range(19):
        tmp = func(test_feat[i*10000:(i+1)*10000])
        # Undo the (time, batch, label) folding back to a flat frame order.
        tmp = tmp.transpose((1, 0, 2)).reshape((tmp.shape[0]*tmp.shape[1], tmp.shape[2]))
        test_hat.append(tmp)
    test_hat = np.concatenate(test_hat, axis=0)
    # Pad with two zero rows (frames dropped by the BNUM truncation above —
    # presumably; verify against the dataset length).
    test_hat = np.concatenate((test_hat, np.zeros((2, LABEL))), axis=0)

    # Small decoding function: per-frame argmax over label scores.
    # NOTE(review): T.tensor3's first positional arg is the name, so
    # config.floatX is being passed as the variable *name* here.
    alpha = T.tensor3(config.floatX)
    beta = alpha.argmax(axis=2)
    # beta = alpha[:,:,:-1].argmax(axis=2)
    # beta = Yimumu_Decode()(alpha, wlh)
    # beta = CTC_Decode()(alpha)
    func2 = theano.function([alpha], beta)

    # Per-utterance frame counts and frame ids from the shelve metadata.
    lens = []
    tags = []
    with shelve.open(SHELVE['test']) as f:
        names = f['names']
        for n in names:
            lens.append(len(f[n]))
            for i in range(lens[-1]):
                tags.append(n+'_'+str(i+1))

    # Decode each utterance: per-frame 39-class labels (seq) and a trimmed
    # character sequence (seq2).
    seq = []
    seq2 = []
    nowcnt = 0
    for i in lens:
        nxt = nowcnt + i
        cur_hat = test_hat[nowcnt:nxt].reshape((i, 1, LABEL)).astype(config.floatX)
        nowcnt = nxt
        fc2 = func2(cur_hat).flatten()
        fc3 = []
        fc4 = []
        for j in fc2:
            fc3.append(ph48239(id2ph(j)))
            fc4.append(ph2c(ph48239(id2ph(j))))
        seq.append(fc3)
        seq2.append(''.join(trim(fc4)))
    seq_flat = np.concatenate(seq)

    # Write per-frame predictions (hw1) and per-utterance sequences (hw2).
    with open('hw1_outz.txt', 'w') as f:
        f.write('id,prediction\n')
        for t, i in zip(tags, seq_flat):
            f.write(t+','+i+'\n')
    with open('hw2_outz.txt', 'w') as f:
        f.write('id,phone_sequence\n')
        for n, i in zip(names, seq2):
            f.write(n+','+i+'\n')
Convolutional(filter_size=(1, 1), num_filters=1024, name='Convx3'), Rectifier(), MaxPooling((2, 2), name='MaxPol2'), Convolutional(filter_size=(1, 1), num_filters=2, name='Convx4'), Rectifier(), ]) conv_sequence1 = ConvolutionalSequence(conv_layers1, num_channels=512, image_size=(10, 10), weights_init=Orthogonal(), use_bias=False, name='ConvSeq3') conv_sequence1.initialize() out_soft1 = Flattener(name='Flatt1').apply(conv_sequence1.apply(out5)) predict1 = NDimensionalSoftmax(name='Soft1').apply(out_soft1) cost1 = CategoricalCrossEntropy(name='Cross1').apply( y.flatten(), predict1).copy(name='cost1') #SECOND SOFTMAX conv_layers2 = list([ MaxPooling((2, 2), name='MaxPol2'), Convolutional(filter_size=(1, 1), num_filters=128, name='Convx21'), Rectifier(), MaxPooling((2, 2), name='MaxPol11'), Convolutional(filter_size=(1, 1), num_filters=1024, name='Convx31'), Rectifier(), MaxPooling((2, 2), name='MaxPol21'), Convolutional(filter_size=(1, 1), num_filters=2, name='Convx41'), Rectifier(), ]) conv_sequence2 = ConvolutionalSequence(conv_layers2, num_channels=832,
def train(algorithm, learning_rate, clipping, momentum,
          layer_size, epochs, test_cost, experiment_path,
          initialization, init_width, weight_noise, z_prob,
          z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout,
          batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type,
          num_layers, norm_cost_coeff, penalty, testing, seq_len,
          decrease_lr_after_epoch, lr_decay,
          **kwargs):
    """Run a zoneout-RNN word-level language-modelling experiment on PTB.

    Builds a stacked DropLSTM/DropGRU/DropSimpleRecurrent language model
    whose targets are the inputs shifted by one step, optionally adds a
    norm-stabilizer penalty and weight noise, and trains it with Blocks'
    MainLoop, monitoring cross-entropy / bits-per-char / perplexity on
    train, dev and (optionally) test streams.

    Note: Python 2 code (print statements, dict.iteritems, .next()).
    """
    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)

    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input array plus
            an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

    # Word-level PTB: vocabulary of 10k words.
    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    # Tiny subsets for a quick smoke-test run.
    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    # Resolve the zoneout-probability arguments: either one shared z_prob
    # for states and cells, or separate z_prob_states / z_prob_cells.
    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError(
                'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)'
            )
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'

    # rng = np.random.RandomState(seed)

    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################
    def prep_dataset(dataset):
        """Chop a flat word sequence into (time, batch, seq_len) windows and
        wrap it in a stream that also samples the zoneout masks."""
        # Drop the tail so the data folds evenly into batches of sequences.
        dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                                   drop_prob_igates, layer_size, num_layers,
                                   False, stoch_depth, share_mask,
                                   gaussian_drop, alphabetsize)
        stream.sources = ('data', ) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream, )

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################
    # One batch pulled eagerly, used below only to set Theano test values.
    data = train_stream.get_epoch_iterator(as_dict=True).next()
    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    # Language-model objective: the target is the input sequence itself
    # (the model output is shifted by one step further down).
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    # Input projections + recurrent layers.  The first layer consumes the
    # one-hot vocabulary, deeper layers consume the previous layer's states.
    # LSTM/GRU projections are 4x/3x wider to cover all gates.
    if rnn_type.lower() == 'lstm':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 4,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropLSTM(dim=layer_size,
                     weights_init=weights_init,
                     activation=Tanh(),
                     model_type=6,
                     name='rnn%d' % l,
                     ogates_zoneout=ogates_zoneout) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 3,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropGRU(dim=layer_size,
                    weights_init=weights_init,
                    activation=Tanh(),
                    name='rnn%d' % l) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropSimpleRecurrent(dim=layer_size,
                                weights_init=weights_init,
                                activation=Rectifier(),
                                name='rnn%d' % l) for l in range(num_layers)
        ]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size, alphabetsize,
                        name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x  #in_to_hid.apply(x)
    # init_updates carries each layer's last state/cell into the next
    # batch's initial state (truncated BPTT across batches).
    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            # Each layer consumes its own slice of the zoneout masks.
            states, cells = layer.apply(
                rnn_embedding,
                zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size],
                states_init, cells_init)
            init_updates.update([(states_init, states[-1]),
                                 (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states,
                                 zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    # Shift by one step: predict x[t] from states up to t-1 (states_init
    # prepended, last state dropped).
    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize)))
    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################
    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)),
                                           y_hat).copy('cost')
    bpc = (cost / np.log(2.0)).copy(name='bpr')
    perp = T.exp(cost).copy(name='perp')

    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        # Norm along `axis`, clamped away from zero for gradient stability.
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny))

    # Penalize changes in activation magnitude between consecutive steps
    # (norm stabilizer), on either memory cells or hidden states.
    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states' % l in [
                o.name
                for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
            ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states' % l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(
                        T.sum((norms[1:] - norms[:-1])**2, axis=0) /
                        (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  #should this be cost_train.outputs[0]? no.
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    # Build the step rule; `learning_rate` is rebound to the step rule's
    # shared variable so LearningRateSchedule below can mutate it, and
    # `algorithm`/`clipping` are rebound from strings/floats to objects.
    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})
    # Carry hidden states/cells across batches as part of each update.
    algorithm.add_updates(init_updates)

    def cond_number(x):
        # Condition number via SVD: max/min singular value magnitude.
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x * x).mean().sqrt()

    # Debug diagnostics for exploding updates.
    # NOTE(review): `p.get_value().shape == 2` compares a tuple with an int
    # and is always False, so these lists stay empty; it presumably meant
    # `len(...) == 2`.  Harmless here since whysplode_* is only referenced
    # in a commented-out expression below.
    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str, p.get_value().shape)))))
            whysplode_rms.append(
                rms(p).copy('ini%d:%s_rms(%s)' %
                            (i, p.name, "x".join(map(str, p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str, p.get_value().shape)))))
            whysplode_rms.append(
                rms(p).copy('ini%d:%s_rms(%s)' %
                            (i, p.name, "x".join(map(str, p.get_value().shape)))))

    observed_vars = [
        cost_train, cost, bpc, perp, learning_rate,
        aggregation.mean(
            algorithm.total_gradient_norm).copy("gradient_norm_mean")
    ]  # + whysplode_rms
    # Also monitor the L2 norm of every parameter and of its gradient.
    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))

    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    # Dev monitoring uses its own cloned initial-state shareds so that
    # evaluation does not clobber the training hidden states.
    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(
        zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp],
                                       data_stream=valid_stream,
                                       prefix="dev",
                                       updates=dev_init_updates)

    # noone does this
    # Optional warm start: load parameter values by (slash-stripped) name.
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions = []
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])

    # Optional test monitoring; `test_cost` is rebound from the boolean
    # flag to the test-cost variable inside this branch.
    if test_cost:
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] +
                                   init_updates.values()).replace(
                                       zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream,
            prefix="test",
            updates=test_init_updates)
        extensions.extend([test_monitor])

    # File logging under the experiment directory.
    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """
        def __init__(self, shvars):
            self.shvars = shvars

        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))

    extensions.append(
        RollsExtension(init_updates.keys() + dev_init_updates.keys() +
                       (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """
        def __init__(self):
            self.epoch_number = 0

        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                # `learning_rate` is the step rule's shared variable here.
                learning_rate.set_value(learning_rate.get_value() / lr_decay)

    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError(
            'Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
o = l.apply(o) o = Rectifier().apply(o) l = Linear(input_dim=l.get_dim("output"), output_dim=10, weights_init=IsotropicGaussian(std=0.01), biases_init=IsotropicGaussian(std=0.01)) l.initialize() o = l.apply(o) o = Softmax().apply(o) Y = T.imatrix(name="targets") cost = CategoricalCrossEntropy().apply(Y.flatten(), o) cost.name = "cost" miss_class = 1.0 - MisclassificationRate().apply(Y.flatten(), o) miss_class.name = "accuracy" cg = ComputationGraph(cost) print cg.shared_variables bricks = [get_brick(var) for var in cg.variables if get_brick(var)] for i, b in enumerate(bricks): b.name += str(i) step_rule = AdaM() algorithm = GradientDescent(cost=cost, step_rule=step_rule)
def _create_main_loop(self): # hyper parameters hp = self.params batch_size = hp['batch_size'] biases_init = Constant(0) batch_normalize = hp['batch_normalize'] ### Build fprop tensor5 = T.TensorType(config.floatX, (False, ) * 5) X = tensor5("images") #X = T.tensor4("images") y = T.lvector('targets') gnet_params = OrderedDict() #X_shuffled = X[:, :, :, :, [2, 1, 0]] #X_shuffled = gpu_contiguous(X.dimshuffle(0, 1, 4, 2, 3)) * 255 X = X[:, :, :, :, [2, 1, 0]] X_shuffled = X.dimshuffle((0, 1, 4, 2, 3)) * 255 X_r = X_shuffled.reshape( (X_shuffled.shape[0], X_shuffled.shape[1] * X_shuffled.shape[2], X_shuffled.shape[3], X_shuffled.shape[4])) X_r = X_r - (np.array([104, 117, 123])[None, :, None, None]).astype('float32') expressions, input_data, param = stream_layer_exp(inputs=('data', X_r), mode='rgb') res = expressions['outloss'] y_hat = res.flatten(ndim=2) import pdb pdb.set_trace() ### Build Cost cost = CategoricalCrossEntropy().apply(y, y_hat) cost = T.cast(cost, theano.config.floatX) cost.name = 'cross_entropy' y_pred = T.argmax(y_hat, axis=1) misclass = T.cast(T.mean(T.neq(y_pred, y)), theano.config.floatX) misclass.name = 'misclass' monitored_channels = [] monitored_quantities = [cost, misclass, y_hat, y_pred] model = Model(cost) training_cg = ComputationGraph(monitored_quantities) inference_cg = ComputationGraph(monitored_quantities) ### Get evaluation function #training_eval = training_cg.get_theano_function(additional_updates=bn_updates) training_eval = training_cg.get_theano_function() #inference_eval = inference_cg.get_theano_function() # Dataset test = JpegHDF5Dataset( 'test', #name='jpeg_data_flows.hdf5', load_in_memory=True) #mean = np.load(os.path.join(os.environ['UCF101'], 'mean.npy')) import pdb pdb.set_trace() ### Eval labels = np.zeros(test.num_video_examples) y_hat = np.zeros((test.num_video_examples, 101)) labels_flip = np.zeros(test.num_video_examples) y_hat_flip = np.zeros((test.num_video_examples, 101)) ### Important to shuffle list for batch 
normalization statistic #rng = np.random.RandomState() #examples_list = range(test.num_video_examples) #import pdb; pdb.set_trace() #rng.shuffle(examples_list) nb_frames = 1 for i in xrange(24): scheme = HDF5SeqScheme(test.video_indexes, examples=test.num_video_examples, batch_size=batch_size, f_subsample=i, nb_subsample=25, frames_per_video=nb_frames) #for crop in ['upleft', 'upright', 'downleft', 'downright', 'center']: for crop in ['center']: stream = JpegHDF5Transformer( input_size=(240, 320), crop_size=(224, 224), #input_size=(256, 342), crop_size=(224, 224), crop_type=crop, translate_labels=True, flip='noflip', nb_frames=nb_frames, data_stream=ForceFloatX( DataStream(dataset=test, iteration_scheme=scheme))) stream_flip = JpegHDF5Transformer( input_size=(240, 320), crop_size=(224, 224), #input_size=(256, 342), crop_size=(224, 224), crop_type=crop, translate_labels=True, flip='flip', nb_frames=nb_frames, data_stream=ForceFloatX( DataStream(dataset=test, iteration_scheme=scheme))) ## Do the evaluation epoch = stream.get_epoch_iterator() for j, batch in enumerate(epoch): output = training_eval(batch[0], batch[1]) # import cv2 # cv2.imshow('img', batch[0][0, 0, :, :, :]) # cv2.waitKey(160) # cv2.destroyAllWindows() #import pdb; pdb.set_trace() labels_flip[batch_size * j:batch_size * (j + 1)] = batch[1] y_hat_flip[batch_size * j:batch_size * (j + 1), :] += output[2] preds = y_hat_flip.argmax(axis=1) misclass = np.sum(labels_flip != preds) / float(len(preds)) print i, crop, "flip Misclass:", misclass epoch = stream_flip.get_epoch_iterator() for j, batch in enumerate(epoch): output = training_eval(batch[0], batch[1]) labels[batch_size * j:batch_size * (j + 1)] = batch[1] y_hat[batch_size * j:batch_size * (j + 1), :] += output[2] preds = y_hat.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) print i, crop, "noflip Misclass:", misclass y_merge = y_hat + y_hat_flip preds = y_merge.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) 
print i, crop, "avg Misclass:", misclass ### Compute misclass y_hat += y_hat_flip preds = y_hat.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) print "Misclass:", misclass
def main(feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None,
         batch_size=None, num_batches=None):
    """Train a 6-layer convnet binary classifier on 128x128 images.

    All arguments default to the hard-coded architecture below when None.
    NOTE(review): a large middle section of this function (model/cost/
    algorithm/stream construction and the ``try:`` that pairs with the
    ``except ImportError`` below) appears to have been redacted or lost
    from this copy of the source — `graph_name`, `extensions`, `cost`,
    `algorithm`, `save_to` and `stream_data_train` are referenced but
    never defined in what remains.  Do not expect this text to parse.
    """
    # Default architecture: 6 conv layers with shrinking kernels,
    # one 1000-unit hidden MLP layer, 2 output classes.
    if feature_maps is None:
        feature_maps = [32, 48, 64, 96, 96, 128]
    if mlp_hiddens is None:
        mlp_hiddens = [1000]
    if conv_sizes is None:
        conv_sizes = [9, 7, 5, 3, 2, 1]
    if pool_sizes is None:
        pool_sizes = [2, 2, 2, 2, 1, 1]
    if batch_size is None:
        batch_size = 64
    conv_steps=[2, 1, 1, 1, 1, 1] #same as stride
    image_size = (128, 128)
    output_size = 2
    learningRate = 0.001
    drop_prob = 0.4
    weight_noise = 0.75
    num_epochs = 150
    # NOTE(review): unconditionally overwrites the `num_batches` argument.
    num_batches = None
    # NOTE(review): the plot-server URL's credentials were redacted
    # ('http://*****:*****@...') and the intervening source lines are
    # missing; the fragment below is the tail of an
    # `extensions.append(Plot(...))` call inside a `try:` block.
    host_plot='http://*****:*****@ %s' % (graph_name, datetime.datetime.now(), socket.gethostname()),
            channels=[['train_error_rate', 'valid_error_rate'],
                      ['train_total_gradient_norm']],
            after_epoch=True, server_url=host_plot))
        PLOT_AVAILABLE = True
    except ImportError:
        PLOT_AVAILABLE = False
    extensions.append(Checkpoint(save_to, after_epoch=True,
                                 after_training=True, save_separately=['log']))

    logger.info("Building the model")
    model = Model(cost)

    ########### Loading images #####################
    main_loop = MainLoop(
        algorithm,
        stream_data_train,
        model=model,
        extensions=extensions)
    main_loop.run()
def apply(self, input_labeled, target_labeled, input_unlabeled):
    """Build the ladder-network training graph.

    Runs two encoder passes over the concatenation of labeled and unlabeled
    inputs — one clean, one corrupted with input/activation noise — then a
    top-down decoder that produces per-layer denoising (reconstruction)
    costs. Classification costs are computed on the labeled slice only.

    Populates (side effects on self): self.costs (class_clean, class_corr,
    denois[i], total), self.act (clean / corr / est activation dicts),
    self.error.clean, self.layer_dims, self.lr, and the join/labeled/
    unlabeled/split_lu helper lambdas.

    Parameters:
        input_labeled   -- symbolic labeled inputs; first axis is the batch.
        target_labeled  -- symbolic targets; flattened to a vector below.
        input_unlabeled -- symbolic unlabeled inputs, concatenated after the
                           labeled ones along axis 0.

    Returns: nothing explicit from this visible span; results are stored
    on self (self.costs.total is the training objective).
    """
    self.layer_counter = 0
    input_dim = self.p.encoder_layers[0]

    # Store the dimension tuples in the same order as layers.
    layers = self.layers
    self.layer_dims = {0: input_dim}

    self.lr = self.shared(self.default_lr, 'learning_rate', role=None)

    self.costs = costs = AttributeDict()
    self.costs.denois = AttributeDict()
    self.act = AttributeDict()
    self.error = AttributeDict()

    top = len(layers) - 1

    # N = number of labeled examples; everything before index N in the
    # concatenated batch is labeled, everything after is unlabeled.
    N = input_labeled.shape[0]
    self.join = lambda l, u: T.concatenate([l, u], axis=0)
    self.labeled = lambda x: x[:N] if x is not None else x
    self.unlabeled = lambda x: x[N:] if x is not None else x
    self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))

    input_concat = self.join(input_labeled, input_unlabeled)

    def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
        # One bottom-up pass. Returns an AttributeDict d with, per layer i:
        #   d.labeled.z[i] / d.unlabeled.z[i]  pre-activations,
        #   d.labeled.h[i] / d.unlabeled.h[i]  post-activations,
        #   d.unlabeled.m[i] / d.unlabeled.s[i]  batch mean/std from self.f.
        # `noise_std` lists per-layer noise (corrupted path); layers beyond
        # its length get 0 noise.
        h = input_
        logger.info(' 0: noise %g' % input_noise_std)
        if input_noise_std > 0.:
            h = h + self.noise_like(h) * input_noise_std
        d = AttributeDict()
        d.unlabeled = self.new_activation_dict()
        d.labeled = self.new_activation_dict()
        d.labeled.z[0] = self.labeled(h)
        d.unlabeled.z[0] = self.unlabeled(h)
        prev_dim = input_dim
        # layers entries look like (i, (spec, l_type, act_f)); skip layer 0.
        for i, (spec, _, act_f) in layers[1:]:
            d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
            noise = noise_std[i] if i < len(noise_std) else 0.
            curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f,
                                          path_name=path_name,
                                          noise_std=noise)
            # Both encoder passes must agree on every layer's output dim;
            # the first pass records it, the second pass checks it.
            assert self.layer_dims.get(i) in (None, curr_dim)
            self.layer_dims[i] = curr_dim
            d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
            d.unlabeled.s[i] = s
            d.unlabeled.m[i] = m
            prev_dim = curr_dim
        # Record the top layer's activation (i is the last loop index).
        d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
        return d

    # Clean, supervised
    logger.info('Encoder: clean, labeled')
    clean = self.act.clean = encoder(input_concat, 'clean')

    # Corrupted, supervised
    logger.info('Encoder: corr, labeled')
    corr = self.act.corr = encoder(input_concat, 'corr',
                                   input_noise_std=self.p.super_noise_std,
                                   noise_std=self.p.f_local_noise_std)

    est = self.act.est = self.new_activation_dict()

    # Decoder path in opposite order
    logger.info('Decoder: z_corr -> z_est')
    for i, ((_, spec), l_type, act_f) in layers[::-1]:
        z_corr = corr.unlabeled.z[i]
        z_clean = clean.unlabeled.z[i]
        # Batch mean/std of the clean path (may be absent for some layers).
        z_clean_s = clean.unlabeled.s.get(i)
        z_clean_m = clean.unlabeled.m.get(i)
        # Spec of the encoder layer above, used by the decoder function g.
        fspec = layers[i + 1][1][0] if len(layers) > i + 1 else (None, None)

        if i == top:
            # Top of the decoder is seeded from the corrupted encoder output.
            ver = corr.unlabeled.h[i]
            ver_dim = self.layer_dims[i]
            top_g = True
        else:
            # Otherwise use the estimate from the decoder layer above.
            ver = est.z.get(i + 1)
            ver_dim = self.layer_dims.get(i + 1)
            top_g = False

        # g combines the lateral corrupted z with the vertical signal to
        # produce the denoised estimate for layer i (may return None).
        z_est = self.g(z_lat=z_corr, z_ver=ver, in_dims=ver_dim,
                       out_dims=self.layer_dims[i], l_type=l_type, num=i,
                       fspec=fspec, top_g=top_g)

        if z_est is not None:
            # Denoising cost
            # Normalize with the clean path's batch statistics when they
            # exist, so the cost is comparable across layers.
            if z_clean_s:
                z_est_norm = (z_est - z_clean_m) / z_clean_s
            else:
                z_est_norm = z_est

            se = SquaredError('denois' + str(i))
            # Mean squared error, scaled by the layer's unit count.
            costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                       z_clean.flatten(2)) \
                / np.prod(self.layer_dims[i], dtype=floatX)
            costs.denois[i].name = 'denois' + str(i)
            denois_print = 'denois %.2f' % self.p.denoising_cost_x[i]
        else:
            denois_print = ''

        # Store references for later use
        est.h[i] = self.apply_act(z_est, act_f)
        est.z[i] = z_est
        est.s[i] = None
        est.m[i] = None
        logger.info(' g%d: %10s, %s, dim %s -> %s' % (
            i, l_type, denois_print, self.layer_dims.get(i + 1),
            self.layer_dims.get(i)))

    # Costs
    y = target_labeled.flatten()
    # Supervised cross-entropy on the labeled slice of the top activations
    # (clean path is monitored; corrupted path is what gets trained).
    costs.class_clean = CategoricalCrossEntropy().apply(
        y, clean.labeled.h[top])
    costs.class_clean.name = 'cost_class_clean'
    costs.class_corr = CategoricalCrossEntropy().apply(
        y, corr.labeled.h[top])
    costs.class_corr.name = 'cost_class_corr'

    # This will be used for training
    costs.total = costs.class_corr * 1.0
    for i in range(top + 1):
        # Only add layers that produced a denoising cost and have a
        # positive weight configured.
        if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0:
            costs.total += costs.denois[i] * self.p.denoising_cost_x[i]
    costs.total.name = 'cost_total'

    # Classification error
    mr = MisclassificationRate()
    # Expressed as a percentage of the labeled batch.
    self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
    self.error.clean.name = 'error_rate_clean'