def main(save_to, num_epochs):
    """Train a 784-100-10 tanh/softmax MLP on MNIST with gradient descent.

    ``save_to`` is handed to ``Checkpoint``; ``num_epochs`` is handed to
    ``FinishAfter(after_n_epochs=...)``. Returns nothing: the function
    builds the main loop and runs it.
    """
    network = MLP([Tanh(), Softmax()], [784, 100, 10],
                  weights_init=IsotropicGaussian(0.01),
                  biases_init=Constant(0))
    network.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = network.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    # The graph is taken from the plain cross-entropy; the squared-weight
    # penalty is added to the cost afterwards.
    cg = ComputationGraph([cost])
    first_w, second_w = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (first_w ** 2).sum() + .00005 * (second_w ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    def flat_stream(dataset, batch_size):
        # Sequential batches with only the 'features' source flattened.
        scheme = SequentialScheme(dataset.num_examples, batch_size)
        stream = DataStream.default_stream(dataset, iteration_scheme=scheme)
        return Flatten(stream, which_sources=('features', ))

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    test_monitor = DataStreamMonitoring([cost, error_rate],
                                        flat_stream(mnist_test, 500),
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(
        [cost, error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train",
        after_epoch=True)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        test_monitor,
        train_monitor,
        Checkpoint(save_to),
        Printing(),
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        plot_channels = [
            ['test_final_cost',
             'test_misclassificationrate_apply_error_rate'],
            ['train_total_gradient_norm'],
        ]
        extensions.append(Plot('MNIST example', channels=plot_channels))

    main_loop = MainLoop(algorithm,
                         flat_stream(mnist_train, 50),
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def setup_model(configs):
    """Build the LSTM-attention classifier graph and attach its outputs.

    Reads these keys from ``configs``: ``"classifier_dims"``,
    ``"load_pretrained"``, ``"test_model"`` and, when testing,
    ``"get_streams"`` and ``"batch_size"``. ``configs`` is also passed
    whole to ``LSTMAttention``.

    Returns the ``LSTMAttention`` brick with extra attributes set on it:
    ``location``, ``scale``, ``alpha``, ``patch``, ``cost``,
    ``error_rate``, ``probabilities`` and ``monitorings``.
    """
    tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5)
    # shape: T x B x C x X x Y
    input_ = tensor5("features")
    tensor3 = theano.tensor.TensorType(config.floatX, (False,) * 3)
    locs = tensor3("locs")
    # shape: B x Classes
    # NOTE(review): unclear from here which variable the shape comment
    # above describes -- confirm.
    target = T.ivector("targets")

    model = LSTMAttention(configs, weights_init=Glorot(),
                          biases_init=Constant(0))
    model.initialize()

    (h, c, location, scale, alpha, patch, downn_sampled_input,
     conved_part_1, conved_part_2, pre_lstm) = model.apply(input_, locs)

    model.location = location
    model.scale = scale
    # Fix: this used to store `location` a second time instead of `alpha`.
    model.alpha = alpha
    model.patch = patch

    classifier = MLP([Rectifier(), Softmax()],
                     configs["classifier_dims"],
                     weights_init=Glorot(),
                     biases_init=Constant(0))
    classifier.initialize()

    # Classify from the last element along the first axis of `h`.
    probabilities = classifier.apply(h[-1])
    cost = CategoricalCrossEntropy().apply(target, probabilities)
    cost.name = "CE"
    error_rate = MisclassificationRate().apply(target, probabilities)
    error_rate.name = "ER"

    model.cost = cost
    model.error_rate = error_rate
    model.probabilities = probabilities

    if configs["load_pretrained"]:
        blocks_model = Model(model.cost)
        # .npz archives are binary, so open the file in binary mode.
        with open("VGG_CNN_params.npz", "rb") as params_file:
            loaded = np.load(params_file)
            # Names stored in the archive that no model parameter claimed.
            unmatched = list(loaded.keys())
            stored_names = set(unmatched)
            for param in blocks_model.parameters:
                if param.name not in stored_names:
                    continue
                stored_value = loaded[param.name]
                assert param.get_value().shape == stored_value.shape
                param.set_value(stored_value)
                if param.name in unmatched:
                    unmatched.remove(param.name)
        print("the following parameters did not match: " + str(unmatched))

    if configs["test_model"]:
        print("TESTING THE MODEL: CHECK THE INPUT SIZE!")
        cg = ComputationGraph(model.cost)
        test_fn = theano.function(cg.inputs, [model.cost],
                                  on_unused_input="ignore",
                                  allow_input_downcast=True)
        stream = configs["get_streams"](configs["batch_size"])[0]
        data = next(stream.get_epoch_iterator())
        # NOTE(review): the argument order below presumes a particular
        # ordering of cg.inputs -- confirm against the data stream.
        test_fn(data[1], data[0], data[2])
        print("Test passed! ;)")

    model.monitorings = [cost, error_rate]
    return model
def main(save_to, num_epochs, bokeh=False):
    """Train a 784-100-10 tanh/softmax MLP on MNIST.

    ``save_to`` goes to ``Checkpoint``, ``num_epochs`` to ``FinishAfter``.
    When ``bokeh`` is true a ``Plot`` extension is appended as well.
    Returns nothing; the main loop is built and run.
    """
    net = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    net.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = net.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    # Graph of the unpenalised cost; the penalty terms come afterwards.
    cg = ComputationGraph([cost])
    w_hidden, w_out = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (w_hidden ** 2).sum() + .00005 * (w_out ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    test_stream = DataStream(
        mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 500))
    monitored_on_train = [
        cost,
        error_rate,
        aggregation.mean(algorithm.total_gradient_norm),
    ]
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate], test_stream,
                             prefix="test"),
        TrainingDataMonitoring(monitored_on_train, prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing(),
    ]

    if bokeh:
        extensions.append(Plot(
            'MNIST example',
            channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm'],
            ]))

    train_stream = DataStream(
        mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 50))
    main_loop = MainLoop(algorithm, train_stream, model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def apply(self, input_, target):
    """Build an MLP on the flattened input and record its outputs.

    Uses ``self.non_lins``, ``self.dims`` and ``self.name`` to configure
    the MLP. Stores a dict with keys ``'probs'`` and ``'cost'`` in
    ``self.outputs``; nothing is returned.
    """
    network = MLP(self.non_lins,
                  self.dims,
                  weights_init=IsotropicGaussian(0.01),
                  biases_init=Constant(0),
                  name=self.name)
    network.initialize()

    flat_input = T.flatten(input_, outdim=2)
    probs = network.apply(flat_input)
    probs.name = 'probs'

    cost = CategoricalCrossEntropy().apply(target.flatten(), probs)
    cost.name = "CE"

    self.outputs = {'probs': probs, 'cost': cost}
def main(save_to, num_epochs):
    """Train a 784-100-10 tanh/softmax MLP on MNIST.

    ``save_to`` is passed to ``SerializeMainLoop`` and ``num_epochs`` to
    ``FinishAfter``. Returns nothing; the main loop is built and run.
    """
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    # Weights are looked up in the graph of the unpenalised cost.
    cg = ComputationGraph([cost])
    w_first, w_second = VariableFilter(roles=[WEIGHTS])(cg.variables)
    cost = cost + .00005 * (w_first ** 2).sum() + .00005 * (w_second ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    algorithm = GradientDescent(
        cost=cost, step_rule=SteepestDescent(learning_rate=0.1))

    train_stream = DataStream(
        mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 50))
    test_stream = DataStream(
        mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 500))

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate], test_stream,
                             prefix="test"),
        TrainingDataMonitoring(
            [cost, error_rate,
             aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train",
            after_every_epoch=True),
        SerializeMainLoop(save_to),
        Plot('MNIST example',
             channels=[
                 ['test_final_cost',
                  'test_misclassificationrate_apply_error_rate'],
                 ['train_total_gradient_norm'],
             ]),
        Printing(),
    ]

    main_loop = MainLoop(mlp, train_stream, algorithm,
                         extensions=extensions)
    main_loop.run()
def main(save_to, num_epochs, batch_size):
    """Train a 3072-4096-1024-512-10 tanh MLP with Adam.

    ``save_to`` goes to ``Checkpoint``, ``num_epochs`` to ``FinishAfter``
    and ``batch_size`` to both dataset ``get_stream`` calls. Returns
    nothing; the main loop is built and run.
    """
    layer_dims = [3072, 4096, 1024, 512, 10]
    mlp = MLP([Tanh(), Tanh(), Tanh(), Softmax()],
              layer_dims,
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tt.tensor4('features', dtype='float32')
    y = tt.vector('label', dtype='int32')

    # Collapse every example to a row of 3072 values before the MLP.
    probs = mlp.apply(x.reshape((-1, 3072)))
    cost = CategoricalCrossEntropy().apply(y, probs)
    error_rate = MisclassificationRate().apply(y, probs)

    cg = ComputationGraph([cost])
    weight_vars = VariableFilter(roles=[WEIGHT])(cg.variables)
    penalties = [(w ** 2).sum() for w in weight_vars]
    cost = cost + .00005 * sum(penalties)
    cost.name = 'final_cost'

    train_dataset = Cifar10Dataset(
        data_dir='/home/belohlavek/data/cifar10', is_train=True)
    valid_dataset = Cifar10Dataset(
        data_dir='/home/belohlavek/data/cifar10', is_train=False)
    train_stream = train_dataset.get_stream(batch_size)
    valid_stream = valid_dataset.get_stream(batch_size)

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Adam(learning_rate=0.001))

    extensions = []
    extensions.append(Timing())
    extensions.append(LogExtension('/home/belohlavek/ALI/mlp.log'))
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(DataStreamMonitoring([cost, error_rate],
                                           valid_stream,
                                           prefix="test"))
    extensions.append(TrainingDataMonitoring(
        [cost, error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train",
        after_epoch=True))
    extensions.append(Checkpoint(save_to))
    extensions.append(Printing())

    main_loop = MainLoop(algorithm, train_stream, model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def main(save_to, num_epochs, batch_size):
    """Build and run the training loop for a four-layer MLP classifier.

    The network maps 3072 inputs through 4096, 1024 and 512 tanh units to
    a 10-way softmax. ``save_to`` is given to ``Checkpoint``,
    ``num_epochs`` to ``FinishAfter`` and ``batch_size`` to the dataset
    ``get_stream`` calls. Nothing is returned.
    """
    activations = [Tanh(), Tanh(), Tanh(), Softmax()]
    mlp = MLP(activations, [3072, 4096, 1024, 512, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tt.tensor4('features', dtype='float32')
    y = tt.vector('label', dtype='int32')
    flat_x = x.reshape((-1, 3072))
    probs = mlp.apply(flat_x)

    cost = CategoricalCrossEntropy().apply(y, probs)
    error_rate = MisclassificationRate().apply(y, probs)

    # Sum of squared weights, taken over the graph of the plain cost.
    cg = ComputationGraph([cost])
    ws = VariableFilter(roles=[WEIGHT])(cg.variables)
    squared_norms = [(w ** 2).sum() for w in ws]
    cost = cost + .00005 * sum(squared_norms)
    cost.name = 'final_cost'

    data_dir = '/home/belohlavek/data/cifar10'
    train_dataset = Cifar10Dataset(data_dir=data_dir, is_train=True)
    valid_dataset = Cifar10Dataset(data_dir=data_dir, is_train=False)
    train_stream = train_dataset.get_stream(batch_size)
    valid_stream = valid_dataset.get_stream(batch_size)

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Adam(learning_rate=0.001))

    valid_monitor = DataStreamMonitoring([cost, error_rate],
                                         valid_stream,
                                         prefix="test")
    train_monitor = TrainingDataMonitoring(
        [cost, error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train",
        after_epoch=True)

    extensions = [
        Timing(),
        LogExtension('/home/belohlavek/ALI/mlp.log'),
        FinishAfter(after_n_epochs=num_epochs),
        valid_monitor,
        train_monitor,
        Checkpoint(save_to),
        Printing(),
    ]

    MainLoop(algorithm, train_stream, model=Model(cost),
             extensions=extensions).run()
def main(job_id, params, config_file='params.ec'):
    """Train a two-hidden-layer MLP and return a Spearmint loss.

    Parameters
    ----------
    job_id : not used inside this function.
    params : mapping or falsy value. When truthy, ``params['base_lr'][0]``,
        ``params['dropout_ratio'][0]``, ``params['hidden_units'][0]`` and
        ``params['weight_decay'][0]`` override the config file values.
    config_file : name of a file under ``./configs/`` whose
        ``[hyperparams]`` section supplies the settings.

    Returns
    -------
    ``ve + abs(te - ve)`` where ``ve`` and ``te`` are the
    ``'validation_error'`` and ``'error'`` entries of the last epoch row
    of the main-loop log.
    """
    section = 'hyperparams'
    config = ConfigParser.ConfigParser()
    # Fix: close the config file handle instead of leaking it.
    with open('./configs/{}'.format(config_file)) as config_handle:
        config.readfp(config_handle)

    pr = pprint.PrettyPrinter(indent=4)
    pr.pprint(config)

    def option(name, default):
        # Fix: the third positional argument of ConfigParser.get is `raw`,
        # not a default, so the original fallbacks never took effect and a
        # missing option raised NoOptionError. Fall back explicitly here.
        if config.has_option(section, name):
            return config.get(section, name, raw=True)
        return default

    net_name = option('net_name', 'adni')
    struct_name = net_name.split('_')[0]
    max_epoch = int(option('max_iter', 100))
    base_lr = float(option('base_lr', 0.01))
    train_batch = int(option('train_batch', 256))
    valid_batch = int(option('valid_batch', 512))
    # Fix: this used to re-read 'valid_batch'. Honour a 'test_batch' entry
    # when present and otherwise keep the previous value.
    test_batch = int(option('test_batch', valid_batch))
    W_sd = float(option('W_sd', 0.01))
    W_mu = float(option('W_mu', 0.0))
    b_sd = float(option('b_sd', 0.01))
    b_mu = float(option('b_mu', 0.0))
    hidden_units = int(option('hidden_units', 32))
    input_dropout_ratio = float(option('input_dropout_ratio', 0.2))
    dropout_ratio = float(option('dropout_ratio', 0.2))
    weight_decay = float(option('weight_decay', 0.001))
    max_norm = float(option('max_norm', 100.0))
    solver = option('solver_type', 'rmsprop')
    # data_file has no fallback: a missing entry still raises.
    data_file = config.get(section, 'data_file')
    side = option('side', 'b')

    input_dim = input_dims[struct_name]

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        base_rule = AdaGrad(learning_rate=base_lr)
    else:
        base_rule = RMSProp(learning_rate=base_lr)
    solver_type = CompositeRule(
        [base_rule, VariableClipping(threshold=max_norm)])

    if 'b' in side:
        # Both sides: concatenate the two feature matrices column-wise.
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        feature_source = '{}_features'.format(side)
        train = H5PYDataset(data_file, which_set='train',
                            sources=[feature_source, 'targets'])
        valid = H5PYDataset(data_file, which_set='valid',
                            sources=[feature_source, 'targets'])
        test = H5PYDataset(data_file, which_set='test',
                           sources=[feature_source, 'targets'])
        x = tensor.matrix(feature_source)

    y = tensor.lmatrix('targets')

    # Feed-forward net: input, two hidden layers and a softmax output.
    mlp = MLP(
        activations=[
            Rectifier(name='h1'),
            Rectifier(name='h2'),
            Softmax(name='output'),
        ],
        dims=[input_dim[side], hidden_units, hidden_units, 2],
        weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
        biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    mlp.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = mlp.apply(x)

    # Cost to optimize and classification error rate, both computed from
    # the net outputs and the targets.
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Computation graph of the cost, used to find the weights.
    cost_graph = ComputationGraph([cost])
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # Computation graph with l2 regularization.
    cost_graph = ComputationGraph([cost])

    # Apply dropout to the inputs of the linear transformations.
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [var for var in inputs
                      if var.name.startswith('linear_')]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    # NOTE(review): dropout_inputs[1:] come from cost_graph, not from the
    # graph returned by the first apply_dropout call -- confirm that the
    # second call still finds and replaces them.
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning algorithm (notice: we use the dropout cost for learning):
    algo = GradientDescent(step_rule=solver_type,
                           params=dropout_graph.parameters,
                           cost=dropout_cost)

    # Data stream used for training the model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(train.num_examples,
                                            batch_size=train_batch)))
    training_monitor = TrainingDataMonitoring(
        [dropout_cost,
         aggregation.mean(error),
         aggregation.mean(algo.total_gradient_norm)],
        after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(valid.num_examples,
                                            batch_size=valid_batch)))
    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(test.num_examples,
                                            batch_size=test_batch)))
    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)

    plotting = Plot(
        '{}_{}'.format(net_name, side),
        channels=[
            ['dropout_entropy'],
            ['error', 'validation_error'],
        ],
        after_batch=False)

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint(
        './models/{}/{}/{}'.format(struct_name, side, stamp),
        save_separately=['model', 'log'],
        every_n_epochs=1)

    # Home-brewed early stopping for when overfitting is detected.
    # NOTE(review): per the original comment it compares the means of the
    # validation and training error over the previous `epochs` against
    # `threshold`; the class is not visible here -- confirm.
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.05,
                                        epochs=5,
                                        burn_in=100)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(
        data_stream=training_stream,
        model=model,
        algorithm=algo,
        extensions=[
            validation_monitor,
            training_monitor,
            plotting,
            FinishAfter(after_n_epochs=max_epoch),
            early_stopper,
            Printing(),
            ProgressBar(),
            checkpoint,
            test_monitor,
        ])
    main_loop.run()

    last_row = main_loop.log.last_epoch_row
    ve = float(last_row['validation_error'])
    # 'error' is the unprefixed channel written by training_monitor.
    te = float(last_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print('Spearmint Loss: {}'.format(spearmint_loss))
    return spearmint_loss
# Run the recurrent brick on the two halves of `pre_rnn`, split at `h_dim`
# along the last axis.
h1, sd = rnn.apply(pre_rnn[:, :, :h_dim], pre_rnn[:, :, h_dim:], drops, is_for_test)
h1_to_o = Linear(name='h1_to_o', input_dim=h_dim, output_dim=y_dim)
pre_softmax = h1_to_o.apply(h1)
softmax = Softmax()
# Softmax is applied on a 2-D view (rows of `y_dim` values) and the result
# is reshaped back to the original shape.
shape = pre_softmax.shape
softmax_out = softmax.apply(pre_softmax.reshape((-1, y_dim)))
softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'
# comparing only last time-step
cost = CategoricalCrossEntropy().apply(y, softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y, softmax_out[-1])
error_rate.name = 'error_rate'
# Initialization
for brick in (x_to_h1, h1_to_o, rnn):
    brick.weights_init = Glorot()
    brick.biases_init = Constant(0)
    brick.initialize()
# Smoke test: compile the cost and evaluate it on the first training batch.
train_stream = get_stream('train', batch_size, h_dim, False)
data = train_stream.get_epoch_iterator(as_dict=True).next()
cg = ComputationGraph(cost)
f = theano.function(cg.inputs, cost)
# NOTE(review): the positional order below presumes cg.inputs is ordered
# as (y, x, is_for_test, drops) -- confirm.
print f(data['y'], data['x'], data['is_for_test'], data['drops'])
def run(epochs=1, corpus="data/", HIDDEN_DIMS=100, path="./"):
    """Train a context-averaging word model on the Brown dataset.

    Parameters
    ----------
    epochs : number of epochs, passed to ``FinishAfter``.
    corpus : passed to ``BrownDataset``.
    HIDDEN_DIMS : width of the lookup table and of the linear layer input.
    path : string prefix for the files written by ``SaveWeights`` and for
        the pickled computation graph (``'%scg.pickle' % path``).

    Returns nothing. After training, the computation graph is pickled.
    """
    brown = BrownDataset(corpus)
    INPUT_DIMS = brown.get_vocabulary_size()
    OUTPUT_DIMS = brown.get_vocabulary_size()

    # These are theano variables
    x = tensor.lmatrix('context')
    y = tensor.ivector('output')

    # Construct the graph
    input_to_hidden = LookupTable(name='input_to_hidden',
                                  length=INPUT_DIMS,
                                  dim=HIDDEN_DIMS)

    # Look up the embedding of every word in the context, then average
    # over axis 1.
    h = tensor.mean(input_to_hidden.apply(x), axis=1)

    hidden_to_output = Linear(name='hidden_to_output',
                              input_dim=HIDDEN_DIMS,
                              output_dim=OUTPUT_DIMS)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # Initialize with random variables and set the bias vector to 0.
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # And now the cost function
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    mini_batch = SequentialScheme(brown.num_instances(), 512)
    data_stream = DataStream.default_stream(brown,
                                            iteration_scheme=mini_batch)

    # Now we tie up loose ends: construct the training algorithm and
    # define what happens in the main loop.
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    extensions = [
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs),
        Printing(),
        SaveWeights(layers=[input_to_hidden, hidden_to_output],
                    prefixes=['%sfirst' % path, '%ssecond' % path]),
    ]

    logger.info("Starting main loop...")
    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()

    # Fix: close the pickle file deterministically instead of leaking the
    # handle returned by a bare open().
    with open('%scg.pickle' % path, 'wb') as cg_file:
        pickle.dump(cg, cg_file)
def _activation_from_name(activation_str):
    """Return the ``apply`` method of a new activation brick named by a string."""
    # TO DO : leaky relu
    key = activation_str.lower()
    if key == 'rectifier':
        return Rectifier().apply
    if key == 'tanh':
        return Tanh().apply
    if key in ('sigmoid', 'logistic'):
        return Logistic().apply
    if key in ('id', 'identity'):
        return Identity().apply
    # Fix: the name used to be passed as a second Exception argument and was
    # never interpolated into the message.
    raise Exception("unknown activation function : %s" % activation_str)


def build_submodel(input_shape,
                   output_dim,
                   L_dim_conv_layers,
                   L_filter_size,
                   L_pool_size,
                   L_activation_conv,
                   L_dim_full_layers,
                   L_activation_full,
                   L_exo_dropout_conv_layers,
                   L_exo_dropout_full_layers,
                   L_endo_dropout_conv_layers,
                   L_endo_dropout_full_layers,
                   L_border_mode=None,
                   L_filter_step=None,
                   L_pool_step=None):
    """Build a conv + fully-connected classifier graph with dropout.

    Parameters
    ----------
    input_shape : sequence of length 3; the first entry is used as the
        channel count and the other two as the image size.
    output_dim : number of outputs of the final linear layer.
    L_dim_conv_layers, L_filter_size, L_pool_size, L_activation_conv,
    L_border_mode, L_filter_step, L_pool_step : per-conv-layer settings;
        all must have the same length. The last three default to
        ``"valid"``, ``None`` and ``None`` for every layer.
    L_dim_full_layers, L_activation_full : per-fully-connected-layer
        settings of equal length.
    L_exo_dropout_conv_layers, L_exo_dropout_full_layers : rates used to
        shrink layer widths (``dim - int(dim * rate)``). The first conv
        entry must be 0.0; the full list has one more entry than
        ``L_dim_full_layers``.
    L_endo_dropout_conv_layers, L_endo_dropout_full_layers : rates passed
        to ``apply_dropout`` on the inputs of the matching ``layer_<i>``.

    Returns
    -------
    ``(cg, error_rate, cost, D_params, D_kind)`` where ``cg`` is the
    computation graph of the cost after dropout has been applied.
    """
    # TO DO : target size and name of the features
    x = T.tensor4('features')
    y = T.imatrix('targets')

    assert len(input_shape) == 3, "input_shape must be a 3d tensor"
    num_channels = input_shape[0]
    image_size = tuple(input_shape[1:])
    print(image_size)
    print(num_channels)
    prediction = output_dim

    # CONVOLUTION
    output_conv = x
    output_dim = num_channels*np.prod(image_size)
    conv_layers = []
    num_conv = len(L_dim_conv_layers)

    assert num_conv == len(L_filter_size)
    if L_filter_step is None:
        L_filter_step = [None] * num_conv
    assert num_conv == len(L_pool_size)
    if L_pool_step is None:
        L_pool_step = [None] * num_conv
    assert num_conv == len(L_pool_step)
    assert num_conv == len(L_activation_conv)
    if L_border_mode is None:
        L_border_mode = ["valid"] * num_conv
    assert num_conv == len(L_border_mode)
    assert num_conv == len(L_endo_dropout_conv_layers)
    assert num_conv == len(L_exo_dropout_conv_layers)

    # Regarding the batch dropout: the dropout is applied on the filters,
    # which is equivalent to the output dimension, so each layer has to look
    # at the dropout rate of the next layer. That is why the first value of
    # L_exo_dropout_full_layers is needed here. The first conv value has to
    # be 0.0 in this context.
    assert L_exo_dropout_conv_layers[0] == 0.0, "L_exo_dropout_conv_layers[0] has to be 0.0 in this context. There are ways to make it work, of course, but we don't support this with this scripts."

    # Shift the conv rates by one layer and borrow the first full-layer rate.
    L_exo_dropout_conv_layers = L_exo_dropout_conv_layers[1:] + [L_exo_dropout_full_layers[0]]

    if num_conv:
        conv_settings = zip(L_dim_conv_layers, L_filter_size, L_filter_step,
                            L_pool_size, L_pool_step, L_activation_conv,
                            L_border_mode, L_exo_dropout_conv_layers)
        for index, (num_filters, filter_size, filter_step, pool_size,
                    pool_step, activation_str, border_mode,
                    dropout) in enumerate(conv_settings):

            # Convert filter_size, filter_step and pool_size to tuples.
            filter_size = tuple(filter_size)
            if filter_step is None:
                filter_step = (1, 1)
            else:
                filter_step = tuple(filter_step)
            if pool_size is None:
                pool_size = (0, 0)
            else:
                pool_size = tuple(pool_size)

            activation = _activation_from_name(activation_str)

            assert 0.0 <= dropout and dropout < 1.0
            num_filters = num_filters - int(num_filters*dropout)

            print("border_mode : %s" % border_mode)

            # filter_step
            # http://blocks.readthedocs.org/en/latest/api/bricks.html#module-blocks.bricks.conv
            kwargs = {}
            if filter_step != (1, 1):
                # There's a bit of a mix of names because `Convolutional`
                # takes a "step" argument, but `ConvolutionActivation` takes
                # a "conv_step" argument.
                kwargs['conv_step'] = filter_step

            if pool_size[0] == 0 and pool_size[1] == 0:
                layer_conv = ConvolutionalActivation(
                    activation=activation,
                    filter_size=filter_size,
                    num_filters=num_filters,
                    border_mode=border_mode,
                    name="layer_%d" % index,
                    **kwargs)
            else:
                if pool_step is not None:
                    kwargs['pooling_step'] = tuple(pool_step)
                layer_conv = ConvolutionalLayer(
                    activation=activation,
                    filter_size=filter_size,
                    num_filters=num_filters,
                    border_mode=border_mode,
                    pooling_size=pool_size,
                    name="layer_%d" % index,
                    **kwargs)

            conv_layers.append(layer_conv)

        convnet = ConvolutionalSequence(conv_layers,
                                        num_channels=num_channels,
                                        image_size=image_size,
                                        weights_init=Uniform(width=0.1),
                                        biases_init=Constant(0.0),
                                        name="conv_section")
        convnet.push_allocation_config()
        convnet.initialize()
        output_dim = np.prod(convnet.get_dim('output'))
        output_conv = convnet.apply(output_conv)

    output_conv = Flattener().apply(output_conv)

    # FULLY CONNECTED
    output_mlp = output_conv
    full_layers = []
    num_full = len(L_dim_full_layers)

    assert num_full == len(L_activation_full)
    assert num_full + 1 == len(L_endo_dropout_full_layers)
    assert num_full + 1 == len(L_exo_dropout_full_layers)

    # Same next-layer convention as for the conv section, which is why the
    # first value of L_exo_dropout_full_layers is thrown away here.
    L_exo_dropout_full_layers = L_exo_dropout_full_layers[1:]
    pre_dim = output_dim
    print("When constructing the model, the output_dim of the conv section is %d." % output_dim)

    if num_full:
        full_settings = zip(L_dim_full_layers, L_activation_full,
                            L_exo_dropout_full_layers,
                            range(num_conv, num_conv + num_full))
        for (dim, activation_str, dropout, index) in full_settings:

            activation = _activation_from_name(activation_str)

            assert 0.0 <= dropout and dropout < 1.0
            dim = dim - int(dim*dropout)
            print("When constructing the fully-connected section, we apply dropout %f to add an MLP going from pre_dim %d to dim %d." % (dropout, pre_dim, dim))

            layer_full = MLP(activations=[activation],
                             dims=[pre_dim, dim],
                             weights_init=Uniform(width=0.1),
                             biases_init=Constant(0.0),
                             name="layer_%d" % index)
            layer_full.initialize()
            full_layers.append(layer_full)
            pre_dim = dim

        for layer in full_layers:
            output_mlp = layer.apply(output_mlp)

        output_dim = L_dim_full_layers[-1] - int(L_dim_full_layers[-1]*L_exo_dropout_full_layers[-1])

    # COST FUNCTION
    output_layer = Linear(output_dim,
                          prediction,
                          weights_init=Uniform(width=0.1),
                          biases_init=Constant(0.0),
                          name="layer_"+str(num_conv + num_full))
    output_layer.initialize()
    full_layers.append(output_layer)
    y_pred = output_layer.apply(output_mlp)
    y_hat = Softmax().apply(y_pred)

    # SOFTMAX and log likelihood
    y_pred = Softmax().apply(y_pred)
    # Be careful: one version expects the output of a softmax; the other
    # expects just the output of the network.
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_pred)
    cost.name = "cost"

    # Misclassification
    error_rate_brick = MisclassificationRate()
    error_rate = error_rate_brick.apply(y.flatten(), y_hat)
    error_rate.name = "error_rate"

    # put names
    D_params, D_kind = build_params(x, T.matrix(), conv_layers, full_layers)

    # test computation graph
    cg = ComputationGraph(cost)

    # DROPOUT
    L_endo_dropout = L_endo_dropout_conv_layers + L_endo_dropout_full_layers

    cg_dropout = cg
    for (index, drop_rate) in enumerate(L_endo_dropout):
        # Fix: every call used to start again from the pristine `cg`, so a
        # later layer's dropout discarded the earlier ones. Chain on the
        # graph built so far, and look the inputs up in that graph so the
        # variable handed to apply_dropout belongs to it.
        inputs = VariableFilter(roles=[INPUT])(cg_dropout.variables)
        for input_ in inputs:
            m = re.match(r"layer_(\d+)_apply.*", input_.name)
            if not (m and index == int(m.group(1))):
                continue
            if drop_rate < 0.0001:
                print("Skipped applying dropout on %s because the dropout rate was under 0.0001." % input_.name)
            else:
                cg_dropout = apply_dropout(cg_dropout, [input_], drop_rate)
                print("Applied dropout %f on %s." % (drop_rate, input_.name))
            # Only the first matching input of each layer is considered.
            break

    cg = cg_dropout

    return (cg, error_rate, cost, D_params, D_kind)
# Define a cost function to optimize, and a classification error rate: # Also apply the outputs from the net, and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # Need to define the computation graph: graph = ComputationGraph(cost) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(graph.variables) # Add some regularization to this model: lam = 0.001 cost += lam * l2_norm(W) cost.name = 'entropy' # This is the model without dropout, but with l2 reg. model = Model(cost) # Apply dropout to inputs: graph = ComputationGraph(y_hat) inputs = VariableFilter([INPUT])(graph.variables) dropout_graph = apply_dropout(graph, inputs, 0.2) dropout_cost = dropout_graph.outputs[0] dropout_cost.name = 'dropout_entropy' # Learning Algorithm: algo = GradientDescent( # step_rule=Scale(learning_rate=0.1), #step_rule=AdaGrad(),
def main(save_to, num_epochs):
    """Train a 784-100-100-10 MLP on MNIST using a dropout cost.

    ``save_to`` is handed to ``Checkpoint`` and ``num_epochs`` to
    ``FinishAfter``. Training uses the dropout graph; the test monitor
    uses the cost and error rate without dropout. Returns nothing.
    """
    net = MLP([Tanh(), Tanh(), Softmax()], [784, 100, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    net.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = net.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost, error_rate])
    cost.name = 'final_cost'
    test_cost = cost

    # Rate 0.5 on the inputs of every linear transformation but the first,
    # then rate 0.1 on the raw features.
    hidden_inputs = VariableFilter(
        roles=[INPUT],
        bricks=net.linear_transformations[1:])(cg.variables)
    dropout_graph = apply_dropout(cg, hidden_inputs, 0.5)
    dropout_graph = apply_dropout(dropout_graph, [x], 0.1)
    dropout_cost, dropout_error_rate = dropout_graph.outputs

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    def flat_stream(dataset, batch_size):
        # Sequential batches with only the 'features' source flattened.
        scheme = SequentialScheme(dataset.num_examples, batch_size)
        stream = DataStream.default_stream(dataset, iteration_scheme=scheme)
        return Flatten(stream, which_sources=('features',))

    algorithm = GradientDescent(cost=dropout_cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    test_monitor = DataStreamMonitoring([cost, error_rate],
                                        flat_stream(mnist_test, 500),
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(
        [dropout_cost, dropout_error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train",
        after_epoch=True)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        test_monitor,
        train_monitor,
        Checkpoint(save_to),
        Printing(),
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        plot_channels = [
            ['test_final_cost',
             'test_misclassificationrate_apply_error_rate'],
            ['train_total_gradient_norm'],
        ]
        extensions.append(Plot('MNIST example', channels=plot_channels))

    main_loop = MainLoop(algorithm,
                         flat_stream(mnist_train, 50),
                         model=Model(dropout_cost),
                         extensions=extensions)
    main_loop.run()
def create_network(inputs=None, batch=batch_size):
    """Build the PixelCNN graph and return ``(cost, cost_bits_dim)``.

    ``inputs`` defaults to a new 4-D tensor named ``'features'``.
    ``batch`` is passed as ``batch_size`` to the convolutional sequence.
    The two returned variables are named ``'loglikelihood_nat'`` and
    ``'nnl_bits_dim'``.
    """
    if inputs is None:
        inputs = T.tensor4('features')

    x = T.cast(inputs, 'float32')
    if dataset != 'binarized_mnist':
        x = x / 255.

    # PixelCNN architecture: one 'A'-masked layer, then 'B'-masked layers.
    bricks = [ConvolutionalNoFlip(*first_layer, mask='A', name='0'),
              Rectifier()]
    for i in range(n_layer):
        bricks.append(
            ConvolutionalNoFlip(*second_layer, mask='B', name=str(i + 1)))
        bricks.append(Rectifier())
    for offset in (1, 2):
        bricks.append(
            ConvolutionalNoFlip((1, 1), h * n_channel, mask='B',
                                name=str(n_layer + offset)))
        bricks.append(Rectifier())
    bricks.append(
        ConvolutionalNoFlip(*third_layer, mask='B', name=str(n_layer + 3)))

    sequence = ConvolutionalSequence(
        bricks,
        num_channels=n_channel,
        batch_size=batch,
        image_size=(img_dim, img_dim),
        border_mode='half',
        weights_init=IsotropicGaussian(std=0.05, mean=0),
        biases_init=Constant(0.02),
        tied_biases=False)
    sequence.initialize()
    x = sequence.apply(x)

    pixels = img_dim * img_dim
    if MODE == '256ary':
        # Move the 256-way axis last, then flatten to rows of 256 values.
        x = x.reshape((-1, 256, n_channel, img_dim, img_dim))
        x = x.dimshuffle(0, 2, 3, 4, 1)
        x = x.reshape((-1, 256))
        x_hat = Softmax().apply(x)
        inp = T.cast(inputs, 'int64').flatten()
        cost = CategoricalCrossEntropy().apply(inp, x_hat) * img_dim * img_dim
        cost_bits_dim = categorical_crossentropy(log_softmax(x), inp)
    else:
        x_hat = Logistic().apply(x)
        cost = BinaryCrossEntropy().apply(inputs, x_hat) * img_dim * img_dim
        bits = (inputs * T.log2(x_hat)
                + (1.0 - inputs) * T.log2(1.0 - x_hat))
        cost_bits_dim = -bits.mean()

    cost_bits_dim.name = "nnl_bits_dim"
    cost.name = 'loglikelihood_nat'
    return cost, cost_bits_dim
def create_network(inputs=None, batch=batch_size):
    """Build the GatedPixelCNN graph and its two costs.

    Parameters
    ----------
    inputs : symbolic input; a ``T.tensor4('features')`` is created when
        None.
    batch : forwarded as ``batch_size`` to every brick built here.

    Returns
    -------
    (cost, cost_bits_dim) : ``cost`` is named 'loglikelihood_nat' and
        ``cost_bits_dim`` is named "nnl_bits_dim".
    """
    if inputs is None:
        inputs = T.tensor4('features')

    pixels = T.cast(inputs, 'float32')
    if dataset != 'binarized_mnist':
        pixels = pixels / 255.

    spatial = (img_dim, img_dim)
    width = h * n_channel

    # First gated layer: 7x7 filters, no residual connection.
    first_gate = GatedPixelCNN(
        name='gated_layer_0',
        filter_size=7,
        image_size=spatial,
        num_filters=width,
        num_channels=n_channel,
        batch_size=batch,
        weights_init=IsotropicGaussian(std=0.02, mean=0),
        biases_init=Constant(0.02),
        res=False)
    first_gate.initialize()
    x_v, x_h = first_gate.apply(pixels, pixels)

    # Remaining gated layers: 3x3 filters with residual connections.
    for depth in range(1, n_layer + 1):
        gate = GatedPixelCNN(
            name='gated_layer_{}'.format(depth),
            filter_size=3,
            image_size=spatial,
            num_channels=width,
            batch_size=batch,
            weights_init=IsotropicGaussian(std=0.02, mean=0),
            biases_init=Constant(0.02),
            res=True)
        gate.initialize()
        x_v, x_h = gate.apply(x_v, x_h)

    head_layers = [
        Rectifier(),
        ConvolutionalNoFlip((1, 1), width, mask_type='B',
                            name='1x1_conv_1'),
        Rectifier(),
        ConvolutionalNoFlip(*third_layer, mask_type='B',
                            name='output_layer'),
    ]
    head = ConvolutionalSequence(
        head_layers,
        num_channels=width,
        batch_size=batch,
        image_size=spatial,
        border_mode='half',
        weights_init=IsotropicGaussian(std=0.02, mean=0),
        biases_init=Constant(0.02),
        tied_biases=False)
    head.initialize()
    # Only the second output of the gated stack feeds the output head.
    activations = head.apply(x_h)

    if MODE == '256ary':
        # Move the 256-way axis last and flatten everything else to rows.
        logits = activations.reshape(
            (-1, 256, n_channel, img_dim, img_dim))
        logits = logits.dimshuffle(0, 2, 3, 4, 1).reshape((-1, 256))
        x_hat = Softmax().apply(logits)
        labels = T.cast(inputs, 'int64').flatten()
        cost = CategoricalCrossEntropy().apply(labels, x_hat) * img_dim * img_dim
        cost_bits_dim = categorical_crossentropy(log_softmax(logits), labels)
    else:
        x_hat = Logistic().apply(activations)
        cost = BinaryCrossEntropy().apply(inputs, x_hat) * img_dim * img_dim
        per_pixel_bits = (inputs * T.log2(x_hat) +
                          (1.0 - inputs) * T.log2(1.0 - x_hat))
        cost_bits_dim = -per_pixel_bits.mean()

    cost_bits_dim.name = "nnl_bits_dim"
    cost.name = 'loglikelihood_nat'
    return cost, cost_bits_dim
# Excerpt of a tutorial-style script: builds the output layer, the
# regularised cost, initialises the bricks and sets up SGD plus monitoring.
# `h`, `input_to_hidden` and `dst` are defined outside this excerpt.
hidden_to_output = Linear(name="hidden_to_output", input_dim=50, output_dim=10)
y_hat = Softmax().apply(hidden_to_output.apply(h))
y = tensor.lmatrix("targets")
from blocks.bricks.cost import CategoricalCrossEntropy
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
from blocks.filter import VariableFilter
cg = ComputationGraph(cost)
# Unpacking assumes the graph contains exactly two WEIGHT variables.
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
# Add a squared-weight penalty for both weight matrices.
cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = "cost_with_regularization"
from blocks.initialization import IsotropicGaussian, Constant
# Both layers share the same initialisation scheme objects.
input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
input_to_hidden.initialize()
hidden_to_output.initialize()
from blocks.algorithms import GradientDescent, Scale
# `cg` was built from the unregularised cost; only its parameters are used.
algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.1))
from blocks.extensions.monitoring import DataStreamMonitoring
monitor = DataStreamMonitoring(variables=[cost], data_stream=dst, prefix="test")
def main(model_path, recurrent_type):
    """Train a character-level recurrent model on OneBillionWord.

    Parameters
    ----------
    model_path : passed to ``Dump`` as the dump location.
    recurrent_type : 'lstm' or 'simple'; any other value raises
        ``ValueError('Not known RNN type')``.
    """
    # Data pipeline: filter, add targets, batch by 100, pad, transpose.
    corpus = OneBillionWord("training", [99],
                            dictionary=char2code,
                            level="character",
                            preprocess=_lower)
    data_stream = corpus.get_example_stream()
    data_stream = Filter(data_stream, _filter_long)
    data_stream = Mapping(data_stream, _make_target,
                          add_sources=('target',))
    data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(100))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)

    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    target = tensor.lmatrix('target')
    target_mask = tensor.matrix('target_mask')

    dim = 100
    lookup = LookupTable(len(all_chars), dim,
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0.))

    if recurrent_type == 'lstm':
        # NOTE(review): this branch is not wrapped in Bidirectional while
        # the output layer below still expects 2 * dim inputs -- confirm.
        rnn = LSTM(dim / 4, Tanh(),
                   weights_init=IsotropicGaussian(0.01),
                   biases_init=Constant(0.))
    elif recurrent_type == 'simple':
        rnn = Bidirectional(SimpleRecurrent(dim, Tanh()),
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0.))
    else:
        raise ValueError('Not known RNN type')
    rnn.initialize()
    lookup.initialize()

    hidden = rnn.apply(lookup.apply(features), mask=features_mask)

    print(len(all_chars))
    output_layer = Linear(2 * dim, len(all_chars),
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0.))
    output_layer.initialize()
    scores = output_layer.apply(hidden)

    seq_length = scores.shape[0]
    batch_size = scores.shape[1]
    # Softmax over the last axis: flatten to 2D, normalise, restore shape.
    flat_scores = scores.reshape((seq_length * batch_size, -1))
    y_hat = Softmax().apply(flat_scores).reshape(scores.shape)

    mean_xent = CategoricalCrossEntropy().apply(
        target.flatten(), y_hat.reshape((-1, len(all_chars))))
    cost = mean_xent * seq_length * batch_size
    cost.name = 'cost'
    cost_per_character = cost / features_mask.sum()
    cost_per_character.name = 'cost_per_character'

    cg = ComputationGraph([cost, cost_per_character])
    model = Model(cost)
    algorithm = GradientDescent(step_rule=Adam(), cost=cost,
                                params=cg.parameters)

    extensions = [
        TrainingDataMonitoring([cost, cost_per_character],
                               prefix='train', after_batch=True),
        Printing(every_n_batches=40),
        Dump(model_path, every_n_batches=200),
    ]
    main_loop = MainLoop(model=model,
                         algorithm=algorithm,
                         data_stream=data_stream,
                         extensions=extensions)
    main_loop.run()
draw.initialize() #------------------------------------------------------------------------ x = tensor.matrix(u'features') y = tensor.lmatrix(u'targets') #y = theano.tensor.extra_ops.to_one_hot(tensor.lmatrix(u'targets'),2) #probs, h_enc, c_enc, i_dec, h_dec, c_dec, center_y, center_x, delta = draw.reconstruct(x) probs, h_enc, c_enc, center_y, center_x, delta = draw.reconstruct(x) trim_probs = probs[-1,:,:] #Only take information from the last iteration labels = y.flatten() #cost = BinaryCrossEntropy().apply(labels, trim_probs) cost = CategoricalCrossEntropy().apply(y, trim_probs) error_rate = MisclassificationRate().apply(labels, trim_probs) cost.name = "CCE" #------------------------------------------------------------ cg = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(cg.variables) algorithm = GradientDescent( cost=cost, parameters=params, step_rule=CompositeRule([ StepClipping(10.), Adam(learning_rate), ]), on_unused_sources='ignore', #step_rule=RMSProp(learning_rate), #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
def main(save_to, num_epochs):
    """MNIST MLP training script with an in-progress attention experiment.

    Crops multi-scale patches from the input with a soft rectangular
    cropper, stops in the debugger, then builds and runs the usual MNIST
    training loop on those patches.

    Parameters
    ----------
    save_to : passed unchanged to ``Checkpoint``.
    num_epochs : passed to ``FinishAfter(after_n_epochs=...)``.
    """
    import numpy
    import theano.tensor as T

    def flat_stream(dataset, batch_size):
        # Sequential minibatches with the 'features' source flattened.
        scheme = SequentialScheme(dataset.num_examples, batch_size)
        return Flatten(
            DataStream.default_stream(dataset, iteration_scheme=scheme),
            which_sources=('features', ))

    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    # attention --->
    patch_shape = (16, 16)
    image_shape = (784, 100)
    n_spatial_dims = 2
    cropper = SoftRectangularCropper(n_spatial_dims=n_spatial_dims,
                                     patch_shape=patch_shape,
                                     image_shape=image_shape,
                                     kernel=Gaussian())

    batch_size = 10
    base_scales = 1.3**numpy.arange(-7, 6)
    n_patches = len(base_scales)
    # Every patch is centred on the middle of the image.
    locations = (numpy.ones((n_patches, batch_size, 2)) * image_shape /
                 2).astype(numpy.float32)
    scales = numpy.tile(base_scales[:, numpy.newaxis, numpy.newaxis],
                        (1, batch_size, 2)).astype(numpy.float32)

    crops = []
    for location, scale in zip(locations, scales):
        cropped = cropper.apply(x, T.constant(location), T.constant(scale))
        crops.append(cropped[0])
    Tpatches = T.stack(*crops)

    # NOTE(review): `batch` is not defined inside this function; presumably
    # a module-level name -- verify before relying on this code path.
    patches = theano.function([x], Tpatches)(batch['features'])

    # NOTE(review): leftover debugger breakpoint; execution stops here.
    import ipdb as pdb
    pdb.set_trace()

    probs = mlp.apply(tensor.flatten(patches, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    test_monitor = DataStreamMonitoring(
        [cost, error_rate], flat_stream(mnist_test, 500), prefix="test")
    train_monitor = TrainingDataMonitoring(
        [cost, error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train", after_epoch=True)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        test_monitor,
        train_monitor,
        Checkpoint(save_to),
        Printing(),
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        plot_channels = [
            ['test_final_cost',
             'test_misclassificationrate_apply_error_rate'],
            ['train_total_gradient_norm'],
        ]
        extensions.append(Plot('MNIST example', channels=plot_channels))

    main_loop = MainLoop(algorithm,
                         flat_stream(mnist_train, 50),
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
# Excerpt: output layer, cost/accuracy variables and the training algorithm.
# `o` and `l` on entry come from code outside this excerpt.
o = Rectifier().apply(o)
# The new layer's input size is read from the previous brick bound to `l`.
l = Linear(input_dim=l.get_dim("output"), output_dim=10, weights_init=IsotropicGaussian(std=0.01), biases_init=IsotropicGaussian(std=0.01))
l.initialize()
o = l.apply(o)
o = Softmax().apply(o)
Y = T.imatrix(name="targets")
cost = CategoricalCrossEntropy().apply(Y.flatten(), o)
cost.name = "cost"
# One minus the misclassification rate, hence the "accuracy" name.
miss_class = 1.0 - MisclassificationRate().apply(Y.flatten(), o)
miss_class.name = "accuracy"
cg = ComputationGraph(cost)
print cg.shared_variables
# Append a running index to every brick name found in the graph.
# NOTE(review): a brick owning several variables appears several times in
# this list and gets several suffixes appended -- confirm this is intended.
bricks = [get_brick(var) for var in cg.variables if get_brick(var)]
for i, b in enumerate(bricks):
    b.name += str(i)
step_rule = AdaM()
# NOTE(review): no parameters argument is passed here -- confirm the Blocks
# version in use accepts that.
algorithm = GradientDescent(cost=cost, step_rule=step_rule)
def train_net(net, train_stream, test_stream, L1=None, L2=None,
              early_stopping=False, finish=None, dropout=False, jobid=None,
              update=None, duration=None, **ignored):
    """Train ``net`` with SGD/RMSProp and monitor it on ``test_stream``.

    Parameters
    ----------
    net : brick whose ``apply`` maps the image tensor to class
        probabilities; it is initialised here.
    train_stream, test_stream : data streams used for training and for
        monitoring respectively.
    L1, L2 : optional regularisation coefficients; a falsy value disables
        the corresponding penalty.
    early_stopping : when true, stop once the tracked test error stops
        improving.
    finish : optional maximum number of epochs.
    dropout : when true, train on a cost in which dropout (p=0.5) has been
        applied to the weight variables.  Monitoring always uses the
        dropout-free cost.
    jobid : optional prefix for checkpoint file names (defaults to the
        process id).
    update : ``"rmsprop"`` selects RMSProp; otherwise ``Scale(0.1)``.
    duration : optional wall-clock limit in seconds.
    **ignored : extra keyword arguments are accepted and ignored.
    """
    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')
    y_hat = net.apply(x)

    # Cost
    cost_before = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost_before.name = "cost_without_regularization"

    # Error
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = "Misclassification rate"

    # Regularization
    cg = ComputationGraph(cost_before)
    WS = VariableFilter(roles=[WEIGHT])(cg.variables)

    # Previously the dropout graph was built but the optimised cost was
    # still the dropout-free expression, so dropout=True changed nothing.
    # The training cost is now taken from the dropout graph.
    train_cost = cost_before
    if dropout:
        print("Dropout")
        cg = apply_dropout(cg, WS, 0.5)
        train_cost = cg.outputs[0]

    if L1:
        print("L1 with lambda ", L1)
        L1_reg = L1 * sum([abs(W).sum() for W in WS])
        L1_reg.name = "L1 regularization"
        cost_before += L1_reg
        if dropout:
            train_cost = train_cost + L1_reg

    if L2:
        print("L2 with lambda ", L2)
        L2_reg = L2 * sum([(W**2).sum() for W in WS])
        L2_reg.name = "L2 regularization"
        cost_before += L2_reg
        if dropout:
            train_cost = train_cost + L2_reg

    # `cost` is the monitored, dropout-free objective.
    cost = cost_before
    cost.name = 'cost_with_regularization'
    if dropout:
        train_cost.name = 'cost_with_regularization_and_dropout'
    else:
        train_cost = cost

    # Initialization
    print("Initilization")
    net.initialize()

    # Algorithm
    step_rule = Scale(learning_rate=0.1)
    if update is not None:
        if update == "rmsprop":
            print("Using RMSProp")
            step_rule = RMSProp()

    remove_not_finite = RemoveNotFinite(0.9)
    step_rule = CompositeRule([step_rule, remove_not_finite])
    algorithm = GradientDescent(cost=train_cost, parameters=cg.parameters,
                                step_rule=step_rule)

    print("Extensions")
    extensions = []

    # Monitoring
    monitor = DataStreamMonitoring(variables=[cost, error],
                                   data_stream=test_stream, prefix="test")
    extensions.append(monitor)

    def filename(suffix=""):
        # checkpoints/<jobid or pid>_<timestamp>_<suffix>.zip
        prefix = jobid if jobid else str(os.getpid())
        ctime = str(time.time())
        return "checkpoints/" + prefix + "_" + ctime + "_" + suffix + ".zip"

    # Keep the parameters that achieve the best monitored test error.
    notification = "test_" + error.name
    track = TrackTheBest(notification)
    best_notification = track.notification_name
    checkpointbest = SaveBest(best_notification, filename("best"))
    extensions.extend([track, checkpointbest])

    if early_stopping:
        print("Early stopping")
        stopper = FinishIfNoImprovementAfterPlus(best_notification)
        extensions.append(stopper)

    # Other extensions
    if finish is not None:
        print("Force finish ", finish)
        extensions.append(FinishAfter(after_n_epochs=finish))

    if duration is not None:
        print("Stop after ", duration, " seconds")
        extensions.append(FinishAfterTime(duration))

    extensions.extend([Timing(), Printing()])

    # Main loop
    main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                         extensions=extensions)

    print("Main loop start")
    main_loop.run()
def main(save_to, num_epochs):
    """Run the MNIST MLP example with an experimental patch-cropping step.

    Multi-scale patches are extracted from the features with a soft
    rectangular cropper, the debugger is entered, and the standard MNIST
    training loop is then built on top of the patches.

    Parameters
    ----------
    save_to : passed unchanged to ``Checkpoint``.
    num_epochs : passed to ``FinishAfter(after_n_epochs=...)``.
    """
    import numpy
    import theano.tensor as T

    classifier = MLP([Tanh(), Softmax()], [784, 100, 10],
                     weights_init=IsotropicGaussian(0.01),
                     biases_init=Constant(0))
    classifier.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    # attention --->
    patch_shape = (16, 16)
    image_shape = (784, 100)
    n_spatial_dims = 2
    cropper = SoftRectangularCropper(n_spatial_dims=n_spatial_dims,
                                     patch_shape=patch_shape,
                                     image_shape=image_shape,
                                     kernel=Gaussian())

    batch_size = 10
    scale_factors = 1.3 ** numpy.arange(-7, 6)
    n_patches = len(scale_factors)
    centre = numpy.ones((n_patches, batch_size, 2)) * image_shape / 2
    locations = centre.astype(numpy.float32)
    tiled = numpy.tile(scale_factors[:, numpy.newaxis, numpy.newaxis],
                       (1, batch_size, 2))
    scales = tiled.astype(numpy.float32)

    per_scale_patches = [
        cropper.apply(x, T.constant(location), T.constant(scale))[0]
        for location, scale in zip(locations, scales)
    ]
    Tpatches = T.stack(*per_scale_patches)

    # NOTE(review): `batch` is not assigned in this function; presumably it
    # is expected to exist at module level -- verify.
    extract = theano.function([x], Tpatches)
    patches = extract(batch['features'])

    # NOTE(review): leftover debugger breakpoint; execution stops here.
    import ipdb as pdb
    pdb.set_trace()

    probs = classifier.apply(tensor.flatten(patches, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    # Squared-weight penalty on both weight matrices.
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))

    test_stream = Flatten(
        DataStream.default_stream(
            mnist_test,
            iteration_scheme=SequentialScheme(mnist_test.num_examples, 500)),
        which_sources=('features',))

    extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs)]
    extensions.append(
        DataStreamMonitoring([cost, error_rate], test_stream, prefix="test"))
    extensions.append(
        TrainingDataMonitoring(
            [cost, error_rate,
             aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train",
            after_epoch=True))
    extensions.append(Checkpoint(save_to))
    extensions.append(Printing())

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm']]))

    train_stream = Flatten(
        DataStream.default_stream(
            mnist_train,
            iteration_scheme=SequentialScheme(mnist_train.num_examples, 50)),
        which_sources=('features',))

    main_loop = MainLoop(algorithm, train_stream, model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def _create_main_loop(self):
    """Build the RGB-stream graph and print test misclassification rates.

    Despite its name, the visible code constructs no main loop: it compiles
    an evaluation function, iterates over the 'test' dataset for 24
    sub-sampling offsets with a centre crop, once through a 'noflip' stream
    and once through a 'flip' stream, accumulates the network outputs per
    video and prints misclassification rates.

    NOTE(review): two ``pdb.set_trace()`` breakpoints are left in; the
    function stops in the debugger twice before evaluating anything.
    """
    # hyper parameters
    hp = self.params
    batch_size = hp['batch_size']
    biases_init = Constant(0)
    batch_normalize = hp['batch_normalize']

    ### Build fprop
    tensor5 = T.TensorType(config.floatX, (False, ) * 5)
    X = tensor5("images")
    #X = T.tensor4("images")
    y = T.lvector('targets')

    gnet_params = OrderedDict()
    #X_shuffled = X[:, :, :, :, [2, 1, 0]]
    #X_shuffled = gpu_contiguous(X.dimshuffle(0, 1, 4, 2, 3)) * 255
    # Reverse the order of the three entries on the last axis (presumably an
    # RGB -> BGR channel swap -- TODO confirm), move that axis to position 2
    # and rescale by 255.
    X = X[:, :, :, :, [2, 1, 0]]
    X_shuffled = X.dimshuffle((0, 1, 4, 2, 3)) * 255
    # Merge axes 1 and 2 so the tensor becomes 4-dimensional.
    X_r = X_shuffled.reshape(
        (X_shuffled.shape[0], X_shuffled.shape[1] * X_shuffled.shape[2],
         X_shuffled.shape[3], X_shuffled.shape[4]))
    # Subtract a constant per entry of axis 1 (presumably per-channel
    # means -- TODO confirm).
    X_r = X_r - (np.array([104, 117, 123])[None, :, None, None]).astype('float32')

    expressions, input_data, param = stream_layer_exp(inputs=('data', X_r), mode='rgb')
    res = expressions['outloss']
    y_hat = res.flatten(ndim=2)

    # NOTE(review): debugging breakpoint.
    import pdb
    pdb.set_trace()

    ### Build Cost
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cost = T.cast(cost, theano.config.floatX)
    cost.name = 'cross_entropy'

    y_pred = T.argmax(y_hat, axis=1)
    misclass = T.cast(T.mean(T.neq(y_pred, y)), theano.config.floatX)
    misclass.name = 'misclass'

    monitored_channels = []
    # Index 2 of this list (y_hat) is what the evaluation loops accumulate.
    monitored_quantities = [cost, misclass, y_hat, y_pred]
    model = Model(cost)

    training_cg = ComputationGraph(monitored_quantities)
    inference_cg = ComputationGraph(monitored_quantities)

    ### Get evaluation function
    #training_eval = training_cg.get_theano_function(additional_updates=bn_updates)
    training_eval = training_cg.get_theano_function()
    #inference_eval = inference_cg.get_theano_function()

    # Dataset
    test = JpegHDF5Dataset(
        'test',
        #name='jpeg_data_flows.hdf5',
        load_in_memory=True)
    #mean = np.load(os.path.join(os.environ['UCF101'], 'mean.npy'))

    # NOTE(review): debugging breakpoint.
    import pdb
    pdb.set_trace()

    ### Eval
    # From here on `y_hat` and `misclass` are rebound to NumPy values and no
    # longer refer to the symbolic variables above.
    labels = np.zeros(test.num_video_examples)
    y_hat = np.zeros((test.num_video_examples, 101))
    labels_flip = np.zeros(test.num_video_examples)
    y_hat_flip = np.zeros((test.num_video_examples, 101))

    ### Important to shuffle list for batch normalization statistic
    #rng = np.random.RandomState()
    #examples_list = range(test.num_video_examples)
    #import pdb; pdb.set_trace()
    #rng.shuffle(examples_list)

    nb_frames = 1

    # NOTE(review): 24 offsets are evaluated while nb_subsample is 25 --
    # confirm the last offset is skipped on purpose.
    for i in xrange(24):
        scheme = HDF5SeqScheme(test.video_indexes,
                               examples=test.num_video_examples,
                               batch_size=batch_size,
                               f_subsample=i,
                               nb_subsample=25,
                               frames_per_video=nb_frames)
        #for crop in ['upleft', 'upright', 'downleft', 'downright', 'center']:
        for crop in ['center']:
            stream = JpegHDF5Transformer(
                input_size=(240, 320), crop_size=(224, 224),
                #input_size=(256, 342), crop_size=(224, 224),
                crop_type=crop,
                translate_labels=True,
                flip='noflip', nb_frames=nb_frames,
                data_stream=ForceFloatX(
                    DataStream(dataset=test, iteration_scheme=scheme)))
            stream_flip = JpegHDF5Transformer(
                input_size=(240, 320), crop_size=(224, 224),
                #input_size=(256, 342), crop_size=(224, 224),
                crop_type=crop,
                translate_labels=True,
                flip='flip', nb_frames=nb_frames,
                data_stream=ForceFloatX(
                    DataStream(dataset=test, iteration_scheme=scheme)))

            ## Do the evaluation
            # NOTE(review): the 'noflip' stream is accumulated into the
            # *_flip arrays and reported as "flip", while the 'flip' stream
            # below is reported as "noflip".  The merged result is
            # unaffected, but the per-stream printouts look swapped.
            epoch = stream.get_epoch_iterator()
            for j, batch in enumerate(epoch):
                output = training_eval(batch[0], batch[1])
                # import cv2
                # cv2.imshow('img', batch[0][0, 0, :, :, :])
                # cv2.waitKey(160)
                # cv2.destroyAllWindows()
                #import pdb; pdb.set_trace()
                labels_flip[batch_size * j:batch_size * (j + 1)] = batch[1]
                # Outputs are summed across the outer `i` iterations.
                y_hat_flip[batch_size * j:batch_size * (j + 1), :] += output[2]
            preds = y_hat_flip.argmax(axis=1)
            misclass = np.sum(labels_flip != preds) / float(len(preds))
            print i, crop, "flip Misclass:", misclass

            epoch = stream_flip.get_epoch_iterator()
            for j, batch in enumerate(epoch):
                output = training_eval(batch[0], batch[1])
                labels[batch_size * j:batch_size * (j + 1)] = batch[1]
                y_hat[batch_size * j:batch_size * (j + 1), :] += output[2]
            preds = y_hat.argmax(axis=1)
            misclass = np.sum(labels != preds) / float(len(preds))
            print i, crop, "noflip Misclass:", misclass

            # Running result when both streams' outputs are summed.
            y_merge = y_hat + y_hat_flip
            preds = y_merge.argmax(axis=1)
            misclass = np.sum(labels != preds) / float(len(preds))
            print i, crop, "avg Misclass:", misclass

    ### Compute misclass
    y_hat += y_hat_flip
    preds = y_hat.argmax(axis=1)
    misclass = np.sum(labels != preds) / float(len(preds))
    print "Misclass:", misclass
bias=bias, name="lstm")
# Excerpt (begins inside a brick constructor call): applies the LSTM, adds
# the output layer, initialises the bricks, counts parameters and attaches a
# file handler to the logger.  `lstm`, `x_to_h`, `x_transform`, `y`,
# `h_dim`, `o_dim`, `save_path` and `logger` come from outside the excerpt.
h, c = lstm.apply(x_transform)
h_to_o = Linear(name='h_to_o', input_dim=h_dim, output_dim=o_dim)
o = h_to_o.apply(h)
o = NDimensionalSoftmax().apply(o, extra_ndim=1)
# Same initialisation scheme for every brick; each brick gets its own
# Glorot/Constant instances.
for brick in (lstm, x_to_h, h_to_o):
    brick.weights_init = Glorot()
    brick.biases_init = Constant(0)
    brick.initialize()
cost = CategoricalCrossEntropy().apply(y, o)
cost.name = 'CE'
print 'Bulding training process...'
# Count scalar parameters: product of each parameter's shape, summed.
shapes = []
for param in ComputationGraph(cost).parameters:
    # shapes.append((param.name, param.eval().shape))
    shapes.append(np.prod(list(param.eval().shape)))
print "Total number of parameters: " + str(np.sum(shapes))
if not os.path.exists(save_path):
    os.makedirs(save_path)
# Write DEBUG-and-above records to <save_path>/log.txt as well.
log_path = save_path + '/log.txt'
fh = logging.FileHandler(filename=log_path)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
def start(self):
    """Build a 4-hidden-layer rectifier network and run its training loop.

    Stores the symbolic input (``self.x``), the softmax output
    (``self.y_hat_prob``) and the mapped predictions (``self.y_hat39``) on
    the instance, then trains with momentum SGD for up to 2000 epochs.
    """
    x = T.matrix('features', config.floatX)
    y = T.imatrix('targets')
    self.x = x

    DIMS = [108 * 5, 1000, 1000, 1000, 1000, 1943]
    NUMS = [1, 1, 1, 1, 1, 1]
    FUNCS = [Rectifier, Rectifier, Rectifier, Rectifier, Softmax]

    def build_layer(index, layer_input, activation_cls):
        # Linear transform followed by the requested activation brick.
        fan_in = DIMS[index]
        fan_out = DIMS[index + 1]
        linear = Linear(input_dim=fan_in,
                        output_dim=fan_out * NUMS[index + 1],
                        weights_init=IsotropicGaussian(std=fan_in**(-0.5)),
                        biases_init=IsotropicGaussian(std=fan_in**(-0.5)),
                        name='Lin{}'.format(index))
        linear.initialize()
        # Note: this sets `name` on the class object itself.
        activation_cls.name = 'Fun{}'.format(index)
        if activation_cls == SimpleRecurrent:
            activation = activation_cls(
                dim=fan_out,
                activation=Rectifier(),
                weights_init=IsotropicGaussian(
                    std=(fan_in + fan_out)**(-0.5)))
        else:
            activation = activation_cls()
        return activation.apply(linear.apply(layer_input))

    y_hat = x
    for index, activation_cls in enumerate(FUNCS[:len(DIMS) - 1]):
        y_hat = build_layer(index, y_hat, activation_cls)
    self.y_hat_prob = y_hat

    # Cost, then dropout on brick inputs: 0.2 on the first two INPUT
    # variables, 0.5 on the rest except the last two.
    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           y_hat).astype(config.floatX)
    cg = ComputationGraph(cost)
    brick_inputs = VariableFilter(roles=[INPUT])(cg.variables)
    cg = apply_dropout(cg, brick_inputs[0:2], 0.2)
    cg = apply_dropout(cg, brick_inputs[2:-2], 0.5)
    cost = cg.outputs[0]
    cost.name = 'cost'

    # Lookup table mapping each of the 1943 class ids to a reduced label.
    mps = theano.shared(
        np.array([ph2id(state239(t)) for t in range(1943)]))
    z_hat = T.argmax(y_hat, axis=1)
    y39, _ = scan(fn=lambda t: mps[t],
                  outputs_info=None,
                  sequences=[y.flatten()])
    y_hat39, _ = scan(fn=lambda t: mps[t],
                      outputs_info=None,
                      sequences=[z_hat])
    self.y_hat39 = y_hat39

    # Both monitored losses use the same expression; only names differ.
    lost01 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(
        config.floatX)
    lost01.name = '0/1 loss'
    lost23 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(
        config.floatX)
    lost23.name = '2/3 loss'

    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    norms = sum(w.norm(2) for w in weights)
    norms.name = 'norms'

    train_path = pjoin(PATH['fuel'], pfx + '_train.hdf5')
    data = H5PYDataset(train_path,
                       which_set='train',
                       load_in_memory=True,
                       subset=slice(0, 100000))
    data_v = H5PYDataset(pjoin(PATH['fuel'], pfx + '_validate.hdf5'),
                         which_set='validate',
                         load_in_memory=True)
    num = data.num_examples
    data_stream = DataStream(
        data, iteration_scheme=ShuffledScheme(num, batch_size=128))
    data_stream_v = DataStream(
        data_v,
        iteration_scheme=SequentialScheme(data_v.num_examples,
                                          batch_size=128))

    algo = GradientDescent(cost=cost,
                           params=cg.parameters,
                           step_rule=CompositeRule([Momentum(0.002, 0.9)]))

    monitor = DataStreamMonitoring(variables=[cost, lost01, norms],
                                   data_stream=data_stream)
    monitor_v = DataStreamMonitoring(variables=[lost23],
                                     data_stream=data_stream_v)
    plt = Plot('AlpAlpAlp',
               channels=[['0/1 loss', '2/3 loss']],
               after_epoch=True)

    extensions = [
        monitor,
        monitor_v,
        FinishAfter(after_n_epochs=2000),
        Printing(),
        plt,
    ]
    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algo,
                         extensions=extensions)
    main_loop.run()
# Excerpt of a tutorial-style script with the regularisation lines
# commented out.  `h`, `hidden_to_output`, `input_to_hidden`, `tensor` and
# `training_dataset` come from outside the excerpt, and the final call is
# still open where the excerpt ends.
y_hat = Softmax().apply(hidden_to_output.apply(h))
y = tensor.lmatrix('targets')
from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate
# The cost receives the target matrix as is, while the error rate receives
# y.argmax(axis=1) (presumably targets are one-hot rows -- TODO confirm).
cost = CategoricalCrossEntropy().apply(y, y_hat)
error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)
error_rate.name = "error_rate"
# >>> from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
# >>> from blocks.filter import VariableFilter
cg = ComputationGraph(cost)
# >>> W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
# >>> cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
# >>> cost.name = 'cost_with_regularization'
cost.name = 'cost_simple_xentropy'
from blocks.initialization import IsotropicGaussian, Constant
# Both layers share the same initialisation scheme objects.
input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
input_to_hidden.initialize()
hidden_to_output.initialize()
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme, SequentialExampleScheme
# >>> from fuel.transformers import Flatten
# Training stream: sequential minibatches of 20 examples.
data_stream = DataStream.default_stream(
    training_dataset,
    iteration_scheme=SequentialScheme(training_dataset.num_examples, batch_size=20))
data_stream_test = DataStream.default_stream(
def _parse_every_n_batches(argv=None):
    """Return the sync period given on the command line (default 1).

    The positional argument used to be declared with ``nargs=1`` and
    ``default=[1]``; argparse treats such a positional as mandatory, so the
    default was never applied.  ``nargs="?"`` makes it optional while still
    accepting the same invocations as before.

    Parameters
    ----------
    argv : list of str or None
        Arguments to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("every_n_batches", type=int, nargs="?", default=1,
                        help="sync with legion every N batches (default: 1)")
    return parser.parse_args(argv).every_n_batches


def main():
    """Train the Blocks tutorial MLP on MNIST with the legion sync extension.

    The script is the Blocks tutorial, plus a ``SharedParamsRateLimited``
    extension in the main loop that is given both weight matrices and both
    bias vectors.
    """
    every_n_batches = _parse_every_n_batches()
    print("We were asked to sync with legion at every_n_batches = %s" % str(every_n_batches))

    # The rest is a copy paste from the blocks tutorial, except for the
    # inclusion of the sync extension at the creation of the MainLoop object.
    x = tensor.matrix('features')
    input_to_hidden = Linear(name='input_to_hidden', input_dim=784, output_dim=100)
    h = Rectifier().apply(input_to_hidden.apply(x))
    hidden_to_output = Linear(name='hidden_to_output', input_dim=100, output_dim=10)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    y = tensor.lmatrix('targets')
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cg = ComputationGraph(cost)
    # Unpacking assumes exactly two WEIGHT variables in the graph.
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01)
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    mnist = MNIST(("train",))
    data_stream = Flatten(
        DataStream.default_stream(
            mnist,
            iteration_scheme=SequentialScheme(mnist.num_examples, batch_size=256)))

    algorithm = GradientDescent(
        cost=cost,
        params=cg.parameters,
        step_rule=Scale(learning_rate=0.1)
    )

    mnist_test = MNIST(("test",))
    data_stream_test = Flatten(DataStream.default_stream(
        mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, batch_size=1024)))

    monitor = DataStreamMonitoring(variables=[cost], data_stream=data_stream_test, prefix="test")

    # Except for this line: the biases are also handed to the sync extension.
    b1, b2 = VariableFilter(roles=[BIAS])(cg.variables)

    main_loop = MainLoop(
        data_stream=data_stream,
        algorithm=algorithm,
        extensions=[
            monitor,
            FinishAfter(after_n_epochs=500),
            Printing(),
            # And the inclusion of the legion sync module:
            SharedParamsRateLimited(
                params={"W1": W1, "W2": W2, "b1": b1, "b2": b2},
                alpha=.5,
                beta=.5,
                every_n_batches=every_n_batches,
                maximum_rate=0.1)])
    main_loop.run()
# Excerpt of a tutorial-style script with the regularisation lines
# commented out.  `h`, `hidden_to_output`, `input_to_hidden`, `tensor` and
# `training_dataset` come from outside the excerpt.
y_hat = Softmax().apply(hidden_to_output.apply(h))
y = tensor.lmatrix('targets')
from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate
# The cost receives the target matrix as is, while the error rate receives
# y.argmax(axis=1) (presumably targets are one-hot rows -- TODO confirm).
cost = CategoricalCrossEntropy().apply(y, y_hat)
error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)
error_rate.name = "error_rate"
# >>> from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
# >>> from blocks.filter import VariableFilter
cg = ComputationGraph(cost)
# >>> W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
# >>> cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
# >>> cost.name = 'cost_with_regularization'
cost.name = 'cost_simple_xentropy'
from blocks.initialization import IsotropicGaussian, Constant
# Both layers share the same initialisation scheme objects.
input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(
    0.01)
input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
input_to_hidden.initialize()
hidden_to_output.initialize()
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme, SequentialExampleScheme
# >>> from fuel.transformers import Flatten
# Training stream: sequential minibatches of 20 examples.
data_stream = DataStream.default_stream(training_dataset,
                                        iteration_scheme=SequentialScheme(
                                            training_dataset.num_examples,
                                            batch_size=20))
def setup_model(configs):
    """Build the LSTM-attention classifier and attach its outputs to it.

    Parameters
    ----------
    configs : dict
        Reads 'classifier_dims', 'load_pretrained' and 'test_model'; when
        'test_model' is truthy also 'get_streams' and 'batch_size'.  The
        whole dict is passed on to ``LSTMAttention``.

    Returns
    -------
    The ``LSTMAttention`` brick with ``location``, ``scale``, ``alpha``,
    ``patch``, ``cost``, ``error_rate``, ``probabilities`` and
    ``monitorings`` set as attributes.
    """
    tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5)
    # shape: T x B x C x X x Y
    input_ = tensor5('features')
    tensor3 = theano.tensor.TensorType(config.floatX, (False,) * 3)
    locs = tensor3('locs')
    # shape: B x Classes
    target = T.ivector('targets')

    model = LSTMAttention(
        configs,
        weights_init=Glorot(),
        biases_init=Constant(0))
    model.initialize()

    (h, c, location, scale, alpha, patch, downn_sampled_input,
     conved_part_1, conved_part_2, pre_lstm) = model.apply(input_, locs)

    model.location = location
    model.scale = scale
    # Was `model.alpha = location`, a copy-paste slip that exposed the
    # location under the name `alpha` and left `alpha` itself unused.
    model.alpha = alpha
    model.patch = patch

    classifier = MLP(
        [Rectifier(), Softmax()],
        configs['classifier_dims'],
        weights_init=Glorot(),
        biases_init=Constant(0))
    classifier.initialize()

    # Classify from the last step of the hidden-state sequence.
    probabilities = classifier.apply(h[-1])
    cost = CategoricalCrossEntropy().apply(target, probabilities)
    cost.name = 'CE'
    error_rate = MisclassificationRate().apply(target, probabilities)
    error_rate.name = 'ER'
    model.cost = cost
    model.error_rate = error_rate
    model.probabilities = probabilities

    if configs['load_pretrained']:
        blocks_model = Model(model.cost)
        all_params = blocks_model.parameters
        # .npz archives are binary; open accordingly and keep the file open
        # while arrays are read from the archive.
        with open('VGG_CNN_params.npz', 'rb') as params_file:
            loaded = np.load(params_file)
            # Names still unmatched; entries are removed as they are used.
            all_conv_params = list(loaded.keys())
            available = set(all_conv_params)
            for param in all_params:
                if param.name in available:
                    value = loaded[param.name]
                    assert param.get_value().shape == value.shape
                    param.set_value(value)
                    all_conv_params.remove(param.name)
        print("the following parameters did not match: " +
              str(all_conv_params))

    if configs['test_model']:
        print("TESTING THE MODEL: CHECK THE INPUT SIZE!")
        cg = ComputationGraph(model.cost)
        test_fn = theano.function(cg.inputs, [model.cost],
                                  on_unused_input='ignore',
                                  allow_input_downcast=True)
        first_stream = configs['get_streams'](configs['batch_size'])[0]
        data = next(first_stream.get_epoch_iterator())
        # Argument order follows the original call; it must match cg.inputs.
        test_fn(data[1], data[0], data[2])
        print("Test passed! ;)")

    model.monitorings = [cost, error_rate]

    return model
# Excerpt of a tutorial-style script: output layer, regularised cost, an
# alternative MLP brick, initialisation and the MNIST training set.
# `h`, `x`, `input_to_hidden`, `tensor`, `Linear`, `Softmax` and `Rectifier`
# come from outside the excerpt.
hidden_to_output = Linear(name='hidden_to_output', input_dim=100, output_dim=10)
y_hat = Softmax().apply(hidden_to_output.apply(h))
y = tensor.lmatrix('targets')
from blocks.bricks.cost import CategoricalCrossEntropy
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
from blocks.filter import VariableFilter
cg = ComputationGraph(cost)
# Unpacking assumes the graph contains exactly two WEIGHT variables.
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
# Add a squared-weight penalty for both weight matrices.
cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost.name = 'cost_with_regularization'
from blocks.bricks import MLP
# Note: `mlp` is bound to the result of apply(x), not to the MLP brick.
mlp = MLP(activations=[Rectifier(), Softmax()], dims=[784, 100, 10]).apply(x)
from blocks.initialization import IsotropicGaussian, Constant
# Both layers share the same initialisation scheme objects.
input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
input_to_hidden.initialize()
hidden_to_output.initialize()
from fuel.datasets import MNIST
mnist = MNIST(("train",))
def shroom_mlp(shrooms_train, shrooms_test, num_epochs, hidden_dims,
               activation_function):
    """Train a one-hidden-layer softmax classifier and monitor it on a test set.

    Args:
        shrooms_train: dataset used to build the training stream (batches of 128).
        shrooms_test: dataset used to build the monitoring stream (batches of 1000).
        num_epochs: number of epochs after which training finishes.
        hidden_dims: width of the hidden layer.
        activation_function: brick whose ``apply`` is used as the hidden
            non-linearity.

    Returns:
        Always 0, once the main loop has finished.
    """

    def flat_sequential_stream(dataset, batch_size):
        # Flattened default stream that walks the dataset in order.
        scheme = SequentialScheme(dataset.num_examples, batch_size=batch_size)
        return Flatten(
            DataStream.default_stream(dataset, iteration_scheme=scheme))

    # Symbolic inputs.
    features = tensor.matrix('features')
    targets = tensor.lmatrix('targets')

    # 117 inputs -> hidden_dims -> 2 outputs.
    hidden_layer = Linear(name='input_to_hidden', input_dim=117,
                          output_dim=hidden_dims)
    hidden = activation_function.apply(hidden_layer.apply(features))
    output_layer = Linear(name='hidden_to_output', input_dim=hidden_dims,
                          output_dim=2)
    predictions = Softmax().apply(output_layer.apply(hidden))

    # Random Gaussian weights and zero biases; both layers share the very
    # same initializer objects.
    weights = IsotropicGaussian(0.01)
    zero_bias = Constant(0)
    for layer in (hidden_layer, output_layer):
        layer.weights_init = weights
    for layer in (hidden_layer, output_layer):
        layer.biases_init = zero_bias
    hidden_layer.initialize()
    output_layer.initialize()

    # Cross-entropy plus an L2 penalty on both weight matrices. The graph is
    # captured before the penalty is added.
    cost = CategoricalCrossEntropy().apply(targets, predictions)
    cg = ComputationGraph(cost)
    first_weights, second_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    penalty = 0.01 * (first_weights ** 2).sum()
    cost = cost + penalty + 0.01 * (second_weights ** 2).sum()
    cost.name = 'cost_with_regularization'

    # Targets are reduced with argmax along axis 1 before being compared.
    error_rate = MisclassificationRate().apply(targets.argmax(axis=1),
                                               predictions)

    # Mini-batch streams over the corpus.
    data_stream = flat_sequential_stream(shrooms_train, 128)
    test_data_stream = flat_sequential_stream(shrooms_test, 1000)

    test_monitor = DataStreamMonitoring(variables=[cost, error_rate],
                                        data_stream=test_data_stream,
                                        prefix="test")
    weight_plots = PlotWeights(after_epoch=True,
                               folder="results_logistic_40_interpolation",
                               computation_graph=cg,
                               folder_per_layer=True,
                               dpi=150)
    extensions = [
        ProgressBar(),
        weight_plots,
        FinishAfter(after_n_epochs=num_epochs),
        test_monitor,
        Printing(),
    ]

    # Tie up the loose ends: plain SGD on the regularized cost, then run.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    main = MainLoop(data_stream=data_stream, algorithm=algorithm,
                    extensions=extensions)
    main.run()
    return 0
hidden = tensor.mean(w1.apply(Xs), axis=1) y_hat = Softmax().apply(w2.apply(hidden)) w1.weights_init = w2.weights_init = IsotropicGaussian(0.01) w1.biases_init = w2.biases_init = Constant(0) w1.initialize() w2.initialize() cost = CategoricalCrossEntropy().apply(y, y_hat) cg = ComputationGraph(cost) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + 0.005 * (W1**2).sum() + 0.005 * (W2**2).sum() cost.name = "loss" # # the actual training of the model # main = MainLoop( data_stream=DataStream.default_stream(dataset, iteration_scheme=SequentialScheme( dataset.num_instances, batch_size=512)), algorithm=GradientDescent(cost=cost, parameters=cg.parameters, step_rule=AdaGrad()), extensions=[ ProgressBar(), #FinishAfter(after_n_epochs=10),
def main(job_id, params):
    """Train a dropout-regularized MLP and return the loss reported to Spearmint.

    Hyperparameters are read from the ``hyperparams`` section of ``./params``.
    A non-empty ``params`` overrides ``base_lr``, ``dropout_ratio``,
    ``hidden_units`` and ``weight_decay``.

    Args:
        job_id: not used inside this function.
        params: mapping from hyperparameter name to a sequence whose first
            element is the value, or a falsy value to keep the file settings.

    Returns:
        ``ve + abs(te - ve)``, where ``ve`` and ``te`` are the
        'validation_error' and 'error' entries of the last epoch row of the
        main-loop log.
    """
    config = ConfigParser.ConfigParser()
    # Close the file handle instead of leaking it.
    with open('./params') as params_file:
        config.readfp(params_file)

    def _option(name, default):
        # ConfigParser.get() has no "default" parameter: its third positional
        # argument is `raw`. The original calls therefore never applied their
        # defaults and raised NoOptionError for a missing option. raw=True
        # keeps the (accidental) no-interpolation behaviour of those calls.
        try:
            return config.get('hyperparams', name, raw=True)
        except ConfigParser.NoOptionError:
            return default

    max_epoch = int(_option('max_iter', 100))
    base_lr = float(_option('base_lr', 0.01))
    train_batch = int(_option('train_batch', 256))
    valid_batch = int(_option('valid_batch', 512))
    # The original read 'valid_batch' here; an explicit 'test_batch' now wins
    # and the validation batch size remains the fallback.
    test_batch = int(_option('test_batch', valid_batch))
    W_sd = float(_option('W_sd', 0.01))
    W_mu = float(_option('W_mu', 0.0))
    b_sd = float(_option('b_sd', 0.01))
    b_mu = float(_option('b_mu', 0.0))
    hidden_units = int(_option('hidden_units', 32))
    input_dropout_ratio = float(_option('input_dropout_ratio', 0.2))
    dropout_ratio = float(_option('dropout_ratio', 0.2))
    weight_decay = float(_option('weight_decay', 0.001))
    max_norm = float(_option('max_norm', 100.0))
    solver = _option('solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    side = _option('side', 'b')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        # Cast so layer sizes and the penalty factor are plain Python numbers.
        hidden_units = int(params['hidden_units'][0])
        weight_decay = float(params['weight_decay'][0])

    # Any solver name that does not contain 'adagrad' selects RMSProp.
    base_rule = AdaGrad if 'adagrad' in solver else RMSProp
    solver_type = CompositeRule([
        base_rule(learning_rate=base_lr),
        VariableClipping(threshold=max_norm),
    ])

    input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427}

    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        feature_source = '{}_features'.format(side)
        sources = [feature_source, 'targets']
        train = H5PYDataset(data_file, which_set='train', sources=sources)
        valid = H5PYDataset(data_file, which_set='valid', sources=sources)
        test = H5PYDataset(data_file, which_set='test', sources=sources)
        x = tensor.matrix(feature_source)

    y = tensor.lmatrix('targets')

    # Feed-forward net with two hidden layers and a softmax output.
    mlp = MLP(activations=[
        Rectifier(name='h1'),
        Rectifier(name='h2'),
        Softmax(name='output'),
    ],
              dims=[input_dim[side], hidden_units, hidden_units, 2],
              weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
              biases_init=IsotropicGaussian(b_sd, b_mu))
    mlp.initialize()

    # y_hat is the output of the network for input x.
    y_hat = mlp.apply(x)

    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # The Model is built from the cost before the penalty and dropout are added.
    model = Model(cost)

    # Weight variables of the un-regularized cost graph.
    cost_graph = ComputationGraph([cost])
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add L2 regularization:
    cost = cost + weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # Computation graph including the L2 term.
    cost_graph = ComputationGraph([cost])

    # Dropout on the inputs of the linear bricks: the first matching input
    # gets input_dropout_ratio, the remaining ones dropout_ratio.
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        var for var in inputs if var.name.startswith('linear_')
    ]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning algorithm (the dropout cost is the one being optimized):
    algo = GradientDescent(step_rule=solver_type,
                           params=dropout_graph.parameters,
                           cost=dropout_cost)

    # Data stream used for training:
    training_stream = Flatten(
        DataStream.default_stream(dataset=train,
                                  iteration_scheme=ShuffledScheme(
                                      train.num_examples,
                                      batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([
        dropout_cost,
        aggregation.mean(error),
        aggregation.mean(algo.total_gradient_norm)
    ],
                                              after_batch=True)

    # The 'valid' set is evaluated after every epoch:
    validation_stream = Flatten(
        DataStream.default_stream(dataset=valid,
                                  iteration_scheme=ShuffledScheme(
                                      valid.num_examples,
                                      batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                              data_stream=validation_stream,
                                              prefix='validation',
                                              after_epoch=True)

    # The 'test' set is evaluated once, after training:
    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(test.num_examples,
                                            batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot('AdniNet_{}'.format(side),
                    channels=[
                        ['dropout_entropy', 'validation_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)

    # Checkpoint used to save model and log, stamped with the start time:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}net/{}'.format(side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed extension for early stopping on overfitting.
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.1,
                                        epochs=5,
                                        burn_in=100)

    # The main loop trains the network and outputs reports.
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             early_stopper,
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()

    last_row = main_loop.log.last_epoch_row
    ve = float(last_row['validation_error'])
    te = float(last_row['error'])
    # Validation error plus the absolute gap to the 'error' channel.
    spearmint_loss = ve + abs(te - ve)
    print('Spearmint Loss: {}'.format(spearmint_loss))
    return spearmint_loss
# `o` and `l` are defined before this excerpt.
o = Rectifier().apply(o)
# Final linear layer: from the previous brick's "output" dimension to 10
# units. Weights and biases are both drawn from a Gaussian with std 0.01.
l = Linear(input_dim=l.get_dim("output"), output_dim=10, weights_init=IsotropicGaussian(std=0.01), biases_init=IsotropicGaussian(std=0.01))
l.initialize()
o = l.apply(o)
o = Softmax().apply(o)

Y = T.imatrix(name="targets")
cost = CategoricalCrossEntropy().apply(Y.flatten(), o)
cost.name = "cost"
# One minus the misclassification rate, hence the "accuracy" name despite
# the variable being called miss_class.
miss_class = 1.0 - MisclassificationRate().apply(Y.flatten(), o)
miss_class.name = "accuracy"

cg = ComputationGraph(cost)
print cg.shared_variables
# Append a running index to the name of the brick attached to each graph
# variable. The list holds one entry per variable, so a brick that owns
# several variables is renamed once per occurrence.
bricks = [get_brick(var) for var in cg.variables if get_brick(var)]
for i, b in enumerate(bricks):
    b.name += str(i)

step_rule = AdaM()
# No parameter list is passed to GradientDescent here.
algorithm = GradientDescent(cost=cost, step_rule=step_rule)
print "Loading data"
x_to_h1 = Linear(name='x_to_h1', input_dim=x_dim, output_dim=h_dim) pre_rnn = x_to_h1.apply(x) rnn = SimpleRecurrent(activation=Rectifier(), dim=h_dim, name="rnn") h1 = rnn.apply(pre_rnn) h1_to_o = Linear(name='h1_to_o', input_dim=h_dim, output_dim=o_dim) pre_softmax = h1_to_o.apply(h1) softmax = Softmax() shape = pre_softmax.shape softmax_out = softmax.apply(pre_softmax.reshape((-1, o_dim))) softmax_out = softmax_out.reshape(shape) softmax_out.name = 'softmax_out' # comparing only last time-step cost = CategoricalCrossEntropy().apply(y[-1, :, 0], softmax_out[-1]) cost.name = 'CrossEntropy' error_rate = MisclassificationRate().apply(y[-1, :, 0], softmax_out[-1]) error_rate.name = 'error_rate' # Initialization for brick in (x_to_h1, h1_to_o): brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0) brick.initialize() rnn.weights_init = Identity() rnn.biases_init = Constant(0) rnn.initialize() print 'Bulding training process...' algorithm = GradientDescent(cost=cost, parameters=ComputationGraph(cost).parameters,
def main(save_to, num_epochs):
    """Train a 784-100-10 MLP on MNIST, then load the training log into pandas.

    Args:
        save_to: destination passed to the Checkpoint extension.
        num_epochs: number of epochs after which training finishes.
    """
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    # The graph is captured before the L2 penalty is added to the cost.
    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  # Test-set monitoring in sequential batches of 500.
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  # Checkpoint after every batch, with the log saved separately.
                  Checkpoint(save_to, save_separately=['log'],
                             after_batch=True),
                  Printing()]

    # Live plotting is only added when the optional extras are installed.
    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm']]))

    # Training stream: sequential batches of 50 examples.
    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)

    main_loop.run()

    # NOTE(review): the original indentation of the statements below was
    # lost; they are assumed to belong to main() - confirm.
    # Presumably 'mnist_log.pkl' is the separately saved log written by the
    # Checkpoint above when save_to is 'mnist.pkl' - verify against the caller.
    # NOTE(review): the pickle is opened in text mode; 'rb' would be safer.
    import cPickle
    import pandas
    with open('mnist_log.pkl') as f:
        log = cPickle.load(f)
    # Rows of the frame are indexed by the keys of the log.
    data_frame = pandas.DataFrame.from_dict(log, orient='index')
mlp.initialize() # Setting up the cost function from blocks.bricks.cost import CategoricalCrossEntropy cost = CategoricalCrossEntropy().apply(T.flatten(), Y) from blocks.roles import WEIGHT from blocks.graph import ComputationGraph from blocks.filter import VariableFilter cg = ComputationGraph(cost) print(VariableFilter(roles=[WEIGHT])(cg.variables)) W1, W2, W3 = VariableFilter(roles=[WEIGHT])(cg.variables) # cost with L2 regularization cost = cost + 0.005 * (W2 ** 2).sum() + 0.005 * (W3 ** 2).sum() cost.name = 'cost_with_regularization' #print(cg.variables) #print(VariableFilter(roles=[WEIGHT])(cg.variables)) # Use Blocks to train this network from blocks.algorithms import GradientDescent, Scale from blocks.extensions import Printing from blocks.extensions.monitoring import TrainingDataMonitoring from blocks.main_loop import MainLoop algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.1)) # We want to monitor the cost as we train
def start(self):
    """Build the feed-forward classifier, then train it with monitoring and plotting.

    Sets ``self.x`` (input variable), ``self.y_hat_prob`` (softmax output) and
    ``self.y_hat39`` (arg-max prediction passed through the ``mps`` lookup
    table), then runs the main loop until ``FinishAfter(after_n_epochs=2000)``
    ends it.

    Relies on the module-level names ``PATH``, ``pfx``, ``pjoin``, ``ph2id``
    and ``state239``.
    """
    x = T.matrix('features', config.floatX)
    y = T.imatrix('targets')

    self.x = x

    # Layer widths, per-layer output multipliers and activation classes.
    DIMS = [108 * 5, 1000, 1000, 1000, 1000, 1943]
    NUMS = [1, 1, 1, 1, 1, 1]
    # Maxout(num_pieces=5) and SimpleRecurrent were tried here as well.
    FUNCS = [
        Rectifier,
        Rectifier,
        Rectifier,
        Rectifier,
        Softmax,
    ]

    def lllistool(i, inp, func):
        # Apply linear layer i followed by an instance of `func` to `inp`.
        scale = DIMS[i] ** (-0.5)
        linear = Linear(input_dim=DIMS[i],
                        output_dim=DIMS[i + 1] * NUMS[i + 1],
                        weights_init=IsotropicGaussian(std=scale),
                        biases_init=IsotropicGaussian(std=scale),
                        name='Lin{}'.format(i))
        linear.initialize()
        # The original assigned `func.name` on the class itself, which
        # mutated a shared class and never named the instance; the name is
        # now given to the brick that is actually created.
        brick_name = 'Fun{}'.format(i)
        if func is SimpleRecurrent:
            gong = func(dim=DIMS[i + 1],
                        activation=Rectifier(),
                        weights_init=IsotropicGaussian(
                            std=(DIMS[i] + DIMS[i + 1]) ** (-0.5)),
                        name=brick_name)
        else:
            gong = func(name=brick_name)
        return gong.apply(linear.apply(inp))

    oup = x
    for i, func in enumerate(FUNCS):
        oup = lllistool(i, oup, func)

    y_hat = oup
    self.y_hat_prob = y_hat

    cost = CategoricalCrossEntropy().apply(
        y.flatten(), y_hat).astype(config.floatX)

    cg = ComputationGraph(cost)
    # Dropout: 0.2 on the first two INPUT-role variables, 0.5 on all the
    # others except the last two.
    ips = VariableFilter(roles=[INPUT])(cg.variables)
    cg = apply_dropout(cg, ips[0:2], 0.2)
    cg = apply_dropout(cg, ips[2:-2], 0.5)
    cost = cg.outputs[0]
    cost.name = 'cost'

    # Lookup table from each of the 1943 output indices to a reduced label id.
    mps = theano.shared(
        np.array([ph2id(state239(t)) for t in range(1943)]))
    z_hat = T.argmax(y_hat, axis=1)

    # Integer indexing does the same per-element lookup as the original
    # scan() loops, without building a scan op.
    y39 = mps[y.flatten()]
    y_hat39 = mps[z_hat]

    self.y_hat39 = y_hat39

    # Fraction of mismatching labels. T.mean() avoids dividing two integer
    # tensors, which is integer division under Python 2 with Theano's default
    # int_division setting.
    lost01 = T.mean(T.neq(y_hat39, y39)).astype(config.floatX)
    lost01.name = '0/1 loss'
    # Same quantity under a second name; it is monitored on the validation
    # stream below.
    lost23 = T.mean(T.neq(y_hat39, y39)).astype(config.floatX)
    lost23.name = '2/3 loss'

    # Sum of the L2 norms of all weight variables, for monitoring only.
    Ws = VariableFilter(roles=[WEIGHT])(cg.variables)
    norms = sum(w.norm(2) for w in Ws)
    norms.name = 'norms'

    # Only the first 100000 training examples are loaded.
    path = pjoin(PATH['fuel'], pfx + '_train.hdf5')
    data = H5PYDataset(path, which_set='train', load_in_memory=True,
                       subset=slice(0, 100000))
    data_v = H5PYDataset(pjoin(PATH['fuel'], pfx + '_validate.hdf5'),
                         which_set='validate', load_in_memory=True)
    num = data.num_examples
    data_stream = DataStream(data, iteration_scheme=ShuffledScheme(
        num, batch_size=128))
    data_stream_v = DataStream(data_v, iteration_scheme=SequentialScheme(
        data_v.num_examples, batch_size=128))

    algo = GradientDescent(cost=cost, params=cg.parameters,
                           step_rule=CompositeRule([Momentum(0.002, 0.9)]))

    monitor = DataStreamMonitoring(
        variables=[cost, lost01, norms],
        data_stream=data_stream)
    monitor_v = DataStreamMonitoring(
        variables=[lost23],
        data_stream=data_stream_v)
    plt = Plot('AlpAlpAlp', channels=[['0/1 loss', '2/3 loss']],
               after_epoch=True)

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algo,
                         extensions=[monitor, monitor_v,
                                     FinishAfter(after_n_epochs=2000),
                                     Printing(), plt])

    main_loop.run()
statistics_list=[(M1,S1,a1), (M2,S2,a2), (M3,S3,a3), (M4,S4,a4), (M5,S5,a5)] # initialize_variables # for variable (M,S) in variables: # compute M and S in the whole data. if normalization == 'bn2': for m,s,var in statistics_list: var.tag.aggregation_scheme = MeanAndVariance(var, var.shape[0], axis = 0) init_mn, init_var = DatasetEvaluator([var]).evaluate(stream_train)[var.name] m.set_value(init_mn.astype(floatX)) s.set_value(sqrt(init_var).astype(floatX)) cost = CategoricalCrossEntropy().apply(y.flatten(), probs) cost.name = 'cost' error_rate = MisclassificationRate().apply(y.flatten(), probs) error_rate.name = 'error_rate' cg = ComputationGraph([cost]) parameters = cg.parameters # add gradient descent to M,S if normalization == 'bn2': for m,s,var in statistics_list: parameters.extend([m,s]) algorithm = GradientDescent( cost=cost, parameters=parameters, step_rule=Adam(0.01)) #update the M and S with batch statistics
def main(job_id, params):
    """Train a joint MLP on top of two pre-trained sub-networks and return the Spearmint loss.

    Hyperparameters are read from the ``hyperparams`` section of ``./params``.
    A non-empty ``params`` overrides ``base_lr``, ``dropout_ratio``,
    ``hidden_units`` and ``weight_decay``.

    Args:
        job_id: not used inside this function.
        params: mapping from hyperparameter name to a sequence whose first
            element is the value, or a falsy value to keep the file settings.

    Returns:
        ``ve + abs(te - ve)``, where ``ve`` and ``te`` are the
        'validation_error' and 'error' entries of the last epoch row of the
        main-loop log.
    """
    config = ConfigParser.ConfigParser()
    # Close the file handle instead of leaking it.
    with open('./params') as params_file:
        config.readfp(params_file)

    def _option(name, default):
        # ConfigParser.get() has no "default" parameter: its third positional
        # argument is `raw`. The original calls therefore never applied their
        # defaults and raised NoOptionError for a missing option. raw=True
        # keeps the (accidental) no-interpolation behaviour of those calls.
        try:
            return config.get('hyperparams', name, raw=True)
        except ConfigParser.NoOptionError:
            return default

    max_epoch = int(_option('max_iter', 100))
    base_lr = float(_option('base_lr', 0.01))
    train_batch = int(_option('train_batch', 256))
    valid_batch = int(_option('valid_batch', 512))
    # The original read 'valid_batch' here; an explicit 'test_batch' now wins
    # and the validation batch size remains the fallback.
    test_batch = int(_option('test_batch', valid_batch))
    hidden_units = int(_option('hidden_units', 16))
    W_sd = float(_option('W_sd', 0.01))
    W_mu = float(_option('W_mu', 0.0))
    b_sd = float(_option('b_sd', 0.01))
    b_mu = float(_option('b_mu', 0.0))
    dropout_ratio = float(_option('dropout_ratio', 0.2))
    weight_decay = float(_option('weight_decay', 0.001))
    max_norm = float(_option('max_norm', 100.0))
    solver = _option('solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    fine_tune = config.getboolean('hyperparams', 'fine_tune')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        # Cast so layer sizes and the penalty factor are plain Python numbers.
        hidden_units = int(params['hidden_units'][0])
        weight_decay = float(params['weight_decay'][0])

    # Any solver name that does not contain 'adagrad' selects RMSProp.
    base_rule = AdaGrad if 'adagrad' in solver else RMSProp
    solver_type = CompositeRule([
        base_rule(learning_rate=base_lr),
        VariableClipping(threshold=max_norm),
    ])

    # Saved main loops of the pre-trained right and left networks.
    rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13'
    ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45'

    train = H5PYDataset(data_file, which_set='train')
    valid = H5PYDataset(data_file, which_set='valid')
    test = H5PYDataset(data_file, which_set='test')

    l_x = tensor.matrix('l_features')
    r_x = tensor.matrix('r_features')
    y = tensor.lmatrix('targets')

    lnet = load(ln_file).model.get_top_bricks()[0]
    rnet = load(rn_file).model.get_top_bricks()[0]

    # Pre-trained layers: inputs -> hidden_1 -> hidden_2.
    # Prefix the child names with the side they belong to.
    for side, net in zip(['l', 'r'], [lnet, rnet]):
        for child in net.children:
            child.name = side + '_' + child.name

    # Only the first four children of each network are reused.
    ll1, lr1, ll2, lr2 = lnet.children[:4]
    rl1, rr1, rl2, rr2 = rnet.children[:4]

    l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x))))
    r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x))))

    input_dim = ll2.output_dim + rl2.output_dim

    # hidden_2 -> hidden_3 -> hidden_4 -> softmax output
    output_mlp = MLP(
        activations=[
            Rectifier(name='h3'),
            Rectifier(name='h4'),
            Softmax(name='output'),
        ],
        dims=[
            input_dim,
            hidden_units,
            hidden_units,
            2,
        ],
        weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
        # The original reused W_sd/W_mu here and left b_sd/b_mu unused.
        biases_init=IsotropicGaussian(std=b_sd, mean=b_mu))
    output_mlp.initialize()

    # Concatenate the outputs of the two hidden subnets into a single
    # variable that feeds the joint layers.
    merge = tensor.concatenate([l_h, r_h], axis=1)
    y_hat = output_mlp.apply(merge)

    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # The Model is built from the cost before the penalty and dropout are added.
    model = Model(cost)

    # Weight variables of the un-regularized cost graph.
    cost_graph = ComputationGraph([cost])
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add L2 regularization:
    cost = cost + weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # Computation graph including the L2 term.
    cost_graph = ComputationGraph([cost])

    # Dropout on brick inputs whose name starts with 'linear_': 0.2 on the
    # first one, dropout_ratio on the rest.
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        var for var in inputs if var.name.startswith('linear_')
    ]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Without fine-tuning of the l/r models, only the parameters of the joint
    # layers are updated.
    if fine_tune:
        params_to_update = dropout_graph.parameters
    else:
        # A VariableFilter is applied to a list of variables; the original
        # passed the ComputationGraph object itself.
        params_to_update = VariableFilter(
            [PARAMETER], bricks=output_mlp.children)(cost_graph.variables)

    # Learning algorithm:
    algo = GradientDescent(step_rule=solver_type,
                           params=params_to_update,
                           cost=dropout_cost)

    # Data stream used for training:
    training_stream = Flatten(
        DataStream.default_stream(dataset=train,
                                  iteration_scheme=ShuffledScheme(
                                      train.num_examples,
                                      batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([
        dropout_cost,
        aggregation.mean(error),
        aggregation.mean(algo.total_gradient_norm)
    ],
                                              after_batch=True)

    # The 'valid' set is evaluated after every epoch:
    validation_stream = Flatten(
        DataStream.default_stream(dataset=valid,
                                  iteration_scheme=ShuffledScheme(
                                      valid.num_examples,
                                      batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                              data_stream=validation_stream,
                                              prefix='validation',
                                              after_epoch=True)

    # The 'test' set is evaluated once, after training:
    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(test.num_examples,
                                            batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot(
        'AdniNet_LeftRight',
        channels=[
            ['dropout_entropy'],
            ['error', 'validation_error'],
        ],
    )

    # Checkpoint used to save model and log, stamped with the start time:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}'.format(stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # NOTE(review): FinishIfNoImprovementAfter is given the monitored channel
    # name directly and no best-value tracking extension is added here -
    # confirm that early stopping actually triggers.
    early_stopper = FinishIfNoImprovementAfter(
        notification_name='validation_error', epochs=1)

    # The main loop trains the network and outputs reports.
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             early_stopper,
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()

    last_row = main_loop.log.last_epoch_row
    ve = float(last_row['validation_error'])
    te = float(last_row['error'])
    # Validation error plus the absolute gap to the 'error' channel.
    spearmint_loss = ve + abs(te - ve)
    print('Spearmint Loss: {}'.format(spearmint_loss))
    return spearmint_loss
def main(job_id, params):
    """Train a joint MLP on top of two pre-trained sub-networks and return the Spearmint loss.

    Hyperparameters are read from the ``hyperparams`` section of ``./params``.
    A non-empty ``params`` overrides ``base_lr``, ``dropout_ratio``,
    ``hidden_units`` and ``weight_decay``.

    Args:
        job_id: not used inside this function.
        params: mapping from hyperparameter name to a sequence whose first
            element is the value, or a falsy value to keep the file settings.

    Returns:
        ``ve + abs(te - ve)``, where ``ve`` and ``te`` are the
        'validation_error' and 'error' entries of the last epoch row of the
        main-loop log.
    """
    config = ConfigParser.ConfigParser()
    # Close the file handle instead of leaking it.
    with open('./params') as params_file:
        config.readfp(params_file)

    def _option(name, default):
        # ConfigParser.get() has no "default" parameter: its third positional
        # argument is `raw`. The original calls therefore never applied their
        # defaults and raised NoOptionError for a missing option. raw=True
        # keeps the (accidental) no-interpolation behaviour of those calls.
        try:
            return config.get('hyperparams', name, raw=True)
        except ConfigParser.NoOptionError:
            return default

    max_epoch = int(_option('max_iter', 100))
    base_lr = float(_option('base_lr', 0.01))
    train_batch = int(_option('train_batch', 256))
    valid_batch = int(_option('valid_batch', 512))
    # The original read 'valid_batch' here; an explicit 'test_batch' now wins
    # and the validation batch size remains the fallback.
    test_batch = int(_option('test_batch', valid_batch))
    hidden_units = int(_option('hidden_units', 16))
    W_sd = float(_option('W_sd', 0.01))
    W_mu = float(_option('W_mu', 0.0))
    b_sd = float(_option('b_sd', 0.01))
    b_mu = float(_option('b_mu', 0.0))
    dropout_ratio = float(_option('dropout_ratio', 0.2))
    weight_decay = float(_option('weight_decay', 0.001))
    max_norm = float(_option('max_norm', 100.0))
    solver = _option('solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    fine_tune = config.getboolean('hyperparams', 'fine_tune')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        # Cast so layer sizes and the penalty factor are plain Python numbers.
        hidden_units = int(params['hidden_units'][0])
        weight_decay = float(params['weight_decay'][0])

    # Any solver name that does not contain 'adagrad' selects RMSProp.
    base_rule = AdaGrad if 'adagrad' in solver else RMSProp
    solver_type = CompositeRule([base_rule(learning_rate=base_lr),
                                 VariableClipping(threshold=max_norm)])

    # Saved main loops of the pre-trained right and left networks.
    rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13'
    ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45'

    train = H5PYDataset(data_file, which_set='train')
    valid = H5PYDataset(data_file, which_set='valid')
    test = H5PYDataset(data_file, which_set='test')

    l_x = tensor.matrix('l_features')
    r_x = tensor.matrix('r_features')
    y = tensor.lmatrix('targets')

    lnet = load(ln_file).model.get_top_bricks()[0]
    rnet = load(rn_file).model.get_top_bricks()[0]

    # Pre-trained layers: inputs -> hidden_1 -> hidden_2.
    # Prefix the child names with the side they belong to.
    for side, net in zip(['l', 'r'], [lnet, rnet]):
        for child in net.children:
            child.name = side + '_' + child.name

    # Only the first four children of each network are reused.
    ll1, lr1, ll2, lr2 = lnet.children[:4]
    rl1, rr1, rl2, rr2 = rnet.children[:4]

    l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x))))
    r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x))))

    input_dim = ll2.output_dim + rl2.output_dim

    # hidden_2 -> hidden_3 -> hidden_4 -> softmax output
    output_mlp = MLP(
        activations=[
            Rectifier(name='h3'),
            Rectifier(name='h4'),
            Softmax(name='output'),
        ],
        dims=[
            input_dim,
            hidden_units,
            hidden_units,
            2,
        ],
        weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
        # The original reused W_sd/W_mu here and left b_sd/b_mu unused.
        biases_init=IsotropicGaussian(std=b_sd, mean=b_mu))
    output_mlp.initialize()

    # Concatenate the outputs of the two hidden subnets into a single
    # variable that feeds the joint layers.
    merge = tensor.concatenate([l_h, r_h], axis=1)
    y_hat = output_mlp.apply(merge)

    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # The Model is built from the cost before the penalty and dropout are added.
    model = Model(cost)

    # Weight variables of the un-regularized cost graph.
    cost_graph = ComputationGraph([cost])
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add L2 regularization:
    cost = cost + weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # Computation graph including the L2 term.
    cost_graph = ComputationGraph([cost])

    # Dropout on brick inputs whose name starts with 'linear_': 0.2 on the
    # first one, dropout_ratio on the rest.
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [var for var in inputs
                      if var.name.startswith('linear_')]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Without fine-tuning of the l/r models, only the parameters of the joint
    # layers are updated.
    if fine_tune:
        params_to_update = dropout_graph.parameters
    else:
        # A VariableFilter is applied to a list of variables; the original
        # passed the ComputationGraph object itself.
        params_to_update = VariableFilter(
            [PARAMETER], bricks=output_mlp.children)(cost_graph.variables)

    # Learning algorithm:
    algo = GradientDescent(
        step_rule=solver_type,
        params=params_to_update,
        cost=dropout_cost)

    # Data stream used for training:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(
                train.num_examples,
                batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring(
        [dropout_cost,
         aggregation.mean(error),
         aggregation.mean(algo.total_gradient_norm)],
        after_batch=True)

    # The 'valid' set is evaluated after every epoch:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(
                valid.num_examples,
                batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    # The 'test' set is evaluated once, after training:
    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(
                test.num_examples,
                batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)

    plotting = Plot('AdniNet_LeftRight',
                    channels=[
                        ['dropout_entropy'],
                        ['error', 'validation_error'],
                    ],
                    )

    # Checkpoint used to save model and log, stamped with the start time:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}'.format(stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # NOTE(review): FinishIfNoImprovementAfter is given the monitored channel
    # name directly and no best-value tracking extension is added here -
    # confirm that early stopping actually triggers.
    early_stopper = FinishIfNoImprovementAfter(
        notification_name='validation_error', epochs=1)

    # The main loop trains the network and outputs reports.
    main_loop = MainLoop(
        data_stream=training_stream,
        model=model,
        algorithm=algo,
        extensions=[
            validation_monitor,
            training_monitor,
            plotting,
            FinishAfter(after_n_epochs=max_epoch),
            early_stopper,
            Printing(),
            ProgressBar(),
            checkpoint,
            test_monitor,
        ])
    main_loop.run()

    last_row = main_loop.log.last_epoch_row
    ve = float(last_row['validation_error'])
    te = float(last_row['error'])
    # Validation error plus the absolute gap to the 'error' channel.
    spearmint_loss = ve + abs(te - ve)
    print('Spearmint Loss: {}'.format(spearmint_loss))
    return spearmint_loss
def create_model(self, symbols_num = 500):
    """Build the symbolic graph of the attention-based reader.

    The context and the question are embedded with one shared lookup table
    and encoded by two bidirectional encoders created through
    ``self.create_bidi_encoder``.  The question representation is matched
    against every encoded context position (batched dot product followed by
    a masked softmax); the resulting attention weights are used to average
    the context word embeddings, and that average is scored against the
    embeddings of the candidate answers.

    Intermediate variables are passed through ``decorate``, a helper defined
    elsewhere in the project (presumably a debugging hook -- confirm there).

    :param symbols_num: number of rows of the embedding lookup table.
    :return: the tuple ``(cost, accuracy, mem_attention_bt, y_hat,
        attention_cost_weight, cost_prediction, cost_attention, context,
        candidates_bi, candidates_bi_mask, y, context_mask, x, x_mask)``.
        ``attention_cost_weight`` and ``cost_attention`` are always ``None``
        and ``cost`` is the very same variable as ``cost_prediction``.
    """
    hidden_states = self.args.encoder_hidden_dims
    embedding_dims = self.args.source_embeddings_dim

    # batch X input symbols
    context = tt.lmatrix('context')
    context_mask = tt.matrix('context_mask')
    context_mask = decorate(context_mask, "context_mask",level=1)

    # batch X output symbols
    x = tt.lmatrix('question')
    x_mask = tt.matrix('question_mask')

    # answer ix for each example in the batch
    y = tt.lmatrix('answer')

    # candidate answer words for each example, batch X candidate words
    # (10 per each example)
    candidates_bi = tt.lmatrix("candidates")
    candidates_bi_mask = tt.matrix("candidates_mask")

    # TODO y can contain long sequences, here we use just the first symbol
    # of each answer (that is possibly longer); this has to be adjusted when
    # the response can be a sequence and not only a symbol
    y = decorate(y, "output")
    y = y[:,0]

    ###################
    # create model parts
    ###################
    lookup = LookupTable(symbols_num, embedding_dims, weights_init=Uniform(width=0.2))
    context_encoder = self.create_bidi_encoder("context_encoder", embedding_dims, hidden_states)
    question_encoder = self.create_bidi_encoder("question_encoder", embedding_dims, hidden_states)

    # inits
    lookup.initialize()

    ###################
    # wire the model together
    ###################
    context = decorate(context, "CONTEXT",1)
    context_embedding_tbf = lookup.apply(context.T)

    # NOTE(review): ``.T`` on a 3-D variable reverses all of its axes, and the
    # mask is passed untransposed -- confirm that this is the layout
    # create_bidi_encoder expects.
    memory_encoded_btf = context_encoder.apply(context_embedding_tbf.T,context_mask).dimshuffle(1,0,2)
    memory_encoded_btf.name = "memory_encoded_btf"
    memory_encoded_btf = decorate(memory_encoded_btf,"MEM ENC")

    # batch X features
    x = decorate(x,"X")
    x_embedded_btf = lookup.apply(x.T)
    x_embedded_btf = decorate(x_embedded_btf,"QUESTION EMB")
    x_encoded_btf = question_encoder.apply(x_embedded_btf.T, x_mask).dimshuffle(1,0,2)

    # extract forward rnn that is the first in bidir encoder
    x_encoded_btf = decorate(x_encoded_btf,"QUESTION ENC")
    x_forward_encoded_bf = x_encoded_btf[:,-1,0:hidden_states]
    x_backward_encoded_bf = x_encoded_btf[:,0,hidden_states:hidden_states*2]
    query_representation_bf = tt.concatenate([x_forward_encoded_bf,x_backward_encoded_bf],axis=1)

    # bidirectional representation of question is used as the search key
    search_key = query_representation_bf
    search_key = decorate(search_key,"SEARCH KEY")

    mem_attention_pre = tt.batched_dot(search_key, memory_encoded_btf.dimshuffle(0,2,1))
    mem_attention_pre = decorate(mem_attention_pre,"ATT presoftmax")

    # use masking on attention, this might be unnecessary but we do it just
    # to be sure
    mem_attention_pre_masked_bt = tt.mul(mem_attention_pre,context_mask)
    mem_attention_pre_masked_bt = decorate(mem_attention_pre_masked_bt,"ATT presoftmax masked")

    mem_attention_bt = SoftmaxWithMask(name="memory_query_softmax").apply(mem_attention_pre_masked_bt,context_mask)
    mem_attention_bt = decorate(mem_attention_bt,"ATT",level=2)

    # compute weighted attention over original word vectors
    att_weighted_responses_bf = theano.tensor.batched_dot(mem_attention_bt, context_embedding_tbf.dimshuffle(1,0,2))

    # compare desired response to all candidate responses
    # select relevant candidate answer words
    candidates_embeddings_bfi = lookup.apply(candidates_bi).dimshuffle(0,2,1)

    # convert it to output symbol probabilities
    y_hat_presoft = tt.batched_dot(att_weighted_responses_bf, candidates_embeddings_bfi)
    y_hat = SoftmaxWithMask(name="output_softmax").apply(y_hat_presoft,candidates_bi_mask)
    y_hat.name = "y_hat"
    y_hat = decorate(y_hat,"y_hat",level=2)

    # the correct answer is always the first among the candidates, so we can
    # use zeros as index of ground truth
    y = y.zeros_like()

    # cost associated with prediction error
    cost_prediction = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost_prediction.name = "cost_prediction"

    # ``cost`` is an alias of ``cost_prediction``: the rename below therefore
    # also changes the name carried by ``cost_prediction``.
    cost = cost_prediction
    attention_cost_weight = None
    cost_attention = None
    cost.name = "cost"

    predicted_response_index = tt.argmax(y_hat,axis=1)
    accuracy = tt.eq(y,predicted_response_index).mean()
    accuracy.name = "accuracy"

    return cost, accuracy, mem_attention_bt, y_hat, attention_cost_weight, cost_prediction, cost_attention, context, candidates_bi, candidates_bi_mask, y, context_mask, x, x_mask
def main(num_epochs=100):
    """Train a SimpleRecurrent language model on 'inspirational.txt'.

    The model predicts every token of a sentence from the preceding ones
    (lookup-table embedding -> SimpleRecurrent -> Linear -> softmax).  Besides
    training, a sampling function is compiled that draws 10 sequences with the
    Gumbel-max trick and is handed to the ``PrintSamples`` extension.

    :param num_epochs: number of epochs after which training stops.
    """
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')

    x_int = x.astype(dtype='int32').T
    train_dataset = TextFile('inspirational.txt')
    # Order the sentences by length before batching.
    train_dataset.indexables[0] = numpy.array(sorted(
        train_dataset.indexables[0], key=len
    ))

    n_voc = len(train_dataset.dict.keys())

    # Empirical distribution of the first symbol of every sentence, used to
    # draw the first sampled symbol.  Occurrences are *counted* per word id;
    # summing the matching ids instead would weight each word by its own
    # index and always give word 0 a zero probability.
    first_symbols = numpy.asarray(
        [s[0] for s in train_dataset.indexables[
            train_dataset.sources.index('features')]],
        dtype='int64'
    )
    init_probs = numpy.bincount(
        first_symbols, minlength=n_voc
    )[:n_voc].astype(theano.config.floatX)
    init_probs = init_probs / init_probs.sum()

    n_h = 100
    linear_embedding = LookupTable(
        length=n_voc,
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    rnn = SimpleRecurrent(
        dim=n_h,
        activation=Tanh(),
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=n_voc,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    # Inputs are all steps but the last, targets all steps but the first;
    # the mask of the target steps is applied to both.
    embedding = (linear_embedding.apply(x_int[:-1])
                 * tensor.shape_padright(m.T[1:]))
    rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:])
    probs = softmax(
        sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0]
    )

    # Only the non-padded positions contribute to the cost and error.
    idx_mask = m.T[1:].nonzero()
    cost = CategoricalCrossEntropy().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    cost.name = 'cost'
    misclassification = MisclassificationRate().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=Adam()
    )

    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=train_dataset.num_examples,
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))

    # Sampler: argmax(log p + Gumbel noise) draws from the categorical p.
    batch_size = 10
    length = 30
    trng = MRG_RandomStreams(18032015)
    u = trng.uniform(size=(length, batch_size, n_voc))
    gumbel_noise = -tensor.log(-tensor.log(u))
    init_samples = (tensor.log(init_probs).dimshuffle(('x', 0)) +
                    gumbel_noise[0]).argmax(axis=-1)
    init_states = rnn.initial_state('states', batch_size)

    def sampling_step(g_noise, states, samples_step):
        """Advance the RNN by one step and sample the next symbols."""
        embedding_step = linear_embedding.apply(samples_step)
        next_states = rnn.apply(inputs=embedding_step,
                                states=states,
                                iterate=False)
        probs_step = softmax(score_layer.apply(next_states))
        next_samples = (tensor.log(probs_step) +
                        g_noise).argmax(axis=-1)

        return next_states, next_samples

    [_, samples], _ = theano.scan(
        fn=sampling_step,
        sequences=[gumbel_noise[1:]],
        outputs_info=[init_states, init_samples]
    )

    # NOTE(review): this reaches into the graph behind the scan output
    # (presumably to also get the initial samples) -- confirm.
    sampling = theano.function([], samples.owner.inputs[0].T)

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('Language modelling example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())
    extensions.append(PrintSamples(sampler=sampling,
                                   voc=train_dataset.inv_dict))

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
def start(self):
    """Train the frame classifier, then write the two prediction files.

    Steps performed, in order:

    1. build a stack of ``Linear`` layers (sizes in ``DIMS``, activations in
       ``FUNCS``) on top of the 'features' input, reshaped into ``BNUM``
       parallel sequences;
    2. train it for 2000 epochs with momentum SGD on the categorical
       cross-entropy, monitoring frame error and edit distance after mapping
       labels through ``ph48239``;
    3. pickle the computation graph to 'zzz.pkl';
    4. run the network over the test features and write the per-frame
       predictions to 'hw1_outz.txt' and the trimmed per-utterance phone
       sequences to 'hw2_outz.txt'.
    """
    xx = T.matrix('features', config.floatX)
    yy = T.imatrix('targets')
    # Drop the trailing rows that do not fill BNUM equally long sequences,
    # then lay the batch out as (step, sequence, feature).
    zm = BNUM*(xx.shape[0]//BNUM)
    x = xx[:zm].reshape((BNUM, zm//BNUM, xx.shape[1])).dimshuffle(1, 0, 2)
    y = yy[:zm].reshape((BNUM, zm//BNUM)).dimshuffle(1, 0)

    DIMS = [108*5, 200, 200, 200, LABEL]
    # Output-size multiplier of each Linear layer (raised to 4 for LSTM).
    NUMS = [1, 1, 1, 1, 1]
    # Activation / recurrent brick class after each Linear layer;
    # None means "no activation" (the raw output layer).
    FUNCS = [
        Rectifier,
        Rectifier,
        Rectifier,
        None,
    ]

    def lllistool(i, inp, func):
        """Apply layer ``i`` (Linear followed by ``func``) to ``inp``.

        ``func`` is a brick class or ``None``.  For ``LSTM`` the entry
        ``NUMS[i+1]`` is multiplied by 4 in place before the Linear brick
        is created.  Returns the layer output; for ``SequenceGenerator``
        the result is ``None``.
        """
        if func == LSTM:
            NUMS[i+1] *= 4
        sdim = DIMS[i]
        if func == SimpleRecurrent or func == LSTM:
            sdim = DIMS[i] + DIMS[i+1]
        l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1],
                   weights_init=IsotropicGaussian(std=sdim**(-0.5)),
                   biases_init=IsotropicGaussian(std=sdim**(-0.5)),
                   name='Lin{}'.format(i))
        l.initialize()
        if func == SimpleRecurrent:
            gong = func(dim=DIMS[i+1], activation=Rectifier(), weights_init=IsotropicGaussian(std=sdim**(-0.5)))
            gong.initialize()
            ret = gong.apply(l.apply(inp))
        elif func == LSTM:
            gong = func(dim=DIMS[i+1], activation=Tanh(), weights_init=IsotropicGaussian(std=sdim**(-0.5)))
            gong.initialize()
            print(inp)
            ret, _ = gong.apply(
                l.apply(inp),
                T.zeros((inp.shape[1], DIMS[i+1])),
                T.zeros((inp.shape[1], DIMS[i+1])),
            )
        elif func == SequenceGenerator:
            gong = func(
                readout=None,
                transition=SimpleRecurrent(dim=100, activation=Rectifier(), weights_init=IsotropicGaussian(std=0.1)))
            ret = None
        elif func is None:
            ret = l.apply(inp)
        else:
            gong = func()
            ret = gong.apply(l.apply(inp))
        return ret

    oup = x
    for i in range(len(DIMS)-1):
        oup = lllistool(i, oup, FUNCS[i])
    y_hat = oup

    # Flattened views of targets and outputs; the "dsf" variants swap the
    # first two axes before flattening.
    y_rsp = y.reshape((y.shape[0]*y.shape[1],))
    y_dsf_rsp = y.dimshuffle(1, 0).reshape((y.shape[0]*y.shape[1],))
    yh_rsp = y_hat.reshape((y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
    yh_dsf_rsp = y_hat.dimshuffle(1, 0, 2).reshape((y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
    sfmx = Softmax().apply(yh_rsp)

    cost = CategoricalCrossEntropy().apply(y_rsp, sfmx)
    cost = cost.astype(config.floatX)
    cg = ComputationGraph(cost)
    orig_cg = cg
    ips = VariableFilter(roles=[INPUT])(cg.variables)
    ops = VariableFilter(roles=[OUTPUT])(cg.variables)
    cost.name = 'cost'

    # Lookup table sending each of the 48 label ids through ph48239.
    mps = theano.shared(np.array([ph2id(ph48239(id2ph(t))) for t in range(48)]))
    z_hat = T.argmax(yh_dsf_rsp, axis=1)
    # Currently unused: the edit distances below use the argmax decoding.
    z_hat_hat = CTC_Decode()(y_hat)
    y39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[y_dsf_rsp])
    y_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[z_hat])
    y_hat_hat39 = y_hat39

    # The "01" channels are monitored on the training stream and the "23"
    # channels on the validation stream; the expressions are identical.
    lost01 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
    lost01.name = '0/1 loss'
    lost23 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
    lost23.name = '2/3 loss'

    edit01 = EditDistance()(y39, y_hat_hat39).astype(config.floatX)
    edit01.name = '0/1 edit'
    edit23 = EditDistance()(y39, y_hat_hat39).astype(config.floatX)
    edit23.name = '2/3 edit'

    Ws = cg.parameters
    print(list(Ws))
    norms = sum(w.norm(2) for w in Ws)
    norms = norms.astype(config.floatX)
    norms.name = 'norms'

    path = pjoin(PATH['fuel'], 'train_train.hdf5')
    data = H5PYDataset(path, which_set='train', load_in_memory=True, subset=slice(0, 100000))
    data_v = H5PYDataset(pjoin(PATH['fuel'], 'train_validate.hdf5'), which_set='validate', load_in_memory=True)
    num = data.num_examples
    data_stream = DataStream(data, iteration_scheme=ShuffledScheme(
        num, batch_size=SLEN*BNUM))
    data_stream_v = DataStream(data_v, iteration_scheme=SequentialScheme(
        data_v.num_examples, batch_size=SLEN*BNUM))
    algo = GradientDescent(cost=cost, params=Ws, step_rule=CompositeRule([
        Momentum(0.005, 0.9)
    ]))
    monitor = DataStreamMonitoring(
        variables=[cost, lost01, edit01, norms],
        data_stream=data_stream)
    monitor_v = DataStreamMonitoring(
        variables=[lost23, edit23],
        data_stream=data_stream_v)
    plt = Plot('AlpYap', channels=[['0/1 loss', '2/3 loss'], ['0/1 edit', '2/3 edit']], after_epoch=True)

    main_loop = MainLoop(data_stream = data_stream,
                         algorithm=algo,
                         extensions=[monitor, monitor_v, FinishAfter(after_n_epochs=2000), Printing(), plt])
    main_loop.run()

    # Context manager so the file is closed even if pickling fails.
    with open('zzz.pkl', 'wb') as pfile:
        pickle.dump(orig_cg, pfile)

    ################
    test_feat = np.load(pjoin(PATH['numpy'], 'train_test_features.npy')).astype(config.floatX)
    func = theano.function([xx], y_hat.astype(config.floatX))
    test_hat = []
    # The test features are processed in 19 chunks of 10000 rows.
    for i in range(19):
        tmp = func(test_feat[i*10000:(i+1)*10000])
        tmp = tmp.transpose((1, 0, 2)).reshape((tmp.shape[0]*tmp.shape[1], tmp.shape[2]))
        test_hat.append(tmp)
    test_hat = np.concatenate(test_hat, axis=0)
    # NOTE(review): two all-zero rows are appended, presumably to make up for
    # frames dropped by the BNUM truncation above -- confirm the count.
    test_hat = np.concatenate((test_hat, np.zeros((2, LABEL))), axis=0)

    # The first positional argument of T.tensor3 is the variable name, so the
    # dtype has to be passed by keyword.
    alpha = T.tensor3('alpha', dtype=config.floatX)
    beta = alpha.argmax(axis=2)
    func2 = theano.function([alpha], beta)

    lens = []
    tags = []
    with shelve.open(SHELVE['test']) as f:
        names = f['names']
        for n in names:
            lens.append(len(f[n]))
            for i in range(lens[-1]):
                tags.append(n+'_'+str(i+1))

    seq = []
    seq2 = []
    nowcnt = 0
    # Cut the flat prediction matrix back into one slice per utterance.
    for i in lens:
        nxt = nowcnt + i
        cur_hat = test_hat[nowcnt:nxt].reshape((i, 1, LABEL)).astype(config.floatX)
        nowcnt = nxt
        fc2 = func2(cur_hat).flatten()
        fc3 = []
        fc4 = []
        for j in fc2:
            fc3.append(ph48239(id2ph(j)))
            fc4.append(ph2c(ph48239(id2ph(j))))
        seq.append(fc3)
        seq2.append(''.join(trim(fc4)))
    seq_flat = np.concatenate(seq)

    with open('hw1_outz.txt', 'w') as f:
        f.write('id,prediction\n')
        for t, i in zip(tags, seq_flat):
            f.write(t+','+i+'\n')

    with open('hw2_outz.txt', 'w') as f:
        f.write('id,phone_sequence\n')
        for n, i in zip(names, seq2):
            f.write(n+','+i+'\n')
def train_net(net, train_stream, test_stream, L1 = None, L2=None, early_stopping=False,
              finish=None, dropout=False, jobid=None, update=None,
              duration= None,
              **ignored):
    """Train ``net`` on ``train_stream`` and monitor it on ``test_stream``.

    :param net: brick exposing ``apply`` and ``initialize``; applied to a
        4-D 'image_features' input.
    :param train_stream: data stream driving the main loop.
    :param test_stream: data stream used for monitoring and for tracking the
        best misclassification rate.
    :param L1: weight of the L1 penalty on all WEIGHT variables; falsy
        disables it.
    :param L2: weight of the L2 penalty on all WEIGHT variables; falsy
        disables it.
    :param early_stopping: add ``FinishIfNoImprovementAfterPlus`` on the
        tracked best test error.
    :param finish: stop after this many epochs when not ``None``.
    :param dropout: apply dropout (p=0.5) to the WEIGHT variables of the
        training cost.  The monitored cost stays dropout-free.
    :param jobid: prefix of the checkpoint file name; the process id is used
        when it is falsy.
    :param update: ``"rmsprop"`` selects RMSProp, anything else keeps
        ``Scale(learning_rate=0.1)``.
    :param duration: passed to ``FinishAfterTime`` when not ``None``.
    :param ignored: extra keyword arguments are accepted and discarded.
    """
    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')
    y_hat = net.apply(x)

    #Cost
    cost_before = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost_before.name = "cost_without_regularization"

    #Error
    #Taken from brodesf
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = "Misclassification rate"

    #Regularization
    cg = ComputationGraph(cost_before)
    WS = VariableFilter(roles=[WEIGHT])(cg.variables)

    if dropout:
        print("Dropout")
        cg = apply_dropout(cg, WS, 0.5)

    penalties = []
    if L1:
        print("L1 with lambda ",L1)
        L1_reg = L1 * sum([abs(W).sum() for W in WS])
        L1_reg.name = "L1 regularization"
        penalties.append(L1_reg)

    if L2:
        print("L2 with lambda ",L2)
        L2_reg = L2 * sum([(W ** 2).sum() for W in WS])
        L2_reg.name = "L2 regularization"
        penalties.append(L2_reg)

    # Monitored cost: never contains dropout.
    cost = cost_before
    for penalty in penalties:
        cost = cost + penalty
    cost.name = 'cost_with_regularization'

    # apply_dropout returns a *new* graph, so the cost that is differentiated
    # must be taken from that graph; training on ``cost`` would silently
    # ignore the dropout.
    if dropout:
        train_cost = cg.outputs[0]
        for penalty in penalties:
            train_cost = train_cost + penalty
        train_cost.name = 'dropout_cost_with_regularization'
    else:
        train_cost = cost

    #Initialization
    print("Initilization")
    net.initialize()

    #Algorithm
    step_rule = Scale(learning_rate=0.1)
    if update == "rmsprop":
        print("Using RMSProp")
        step_rule = RMSProp()

    remove_not_finite = RemoveNotFinite(0.9)
    step_rule = CompositeRule([step_rule, remove_not_finite])
    algorithm = GradientDescent(cost=train_cost, parameters=cg.parameters, step_rule=step_rule)

    print("Extensions")
    extensions = []

    #Monitoring
    monitor = DataStreamMonitoring(variables=[cost, error], data_stream=test_stream, prefix="test")
    extensions.append(monitor)

    def filename(suffix=""):
        """Return a checkpoint path unique to this job and moment."""
        prefix = jobid if jobid else str(os.getpid())
        ctime = str(time.time())
        return "checkpoints/" + prefix + "_" + ctime + "_" + suffix + ".zip"

    # Keep only the parameters that achieved the best monitored test error.
    notification = "test_"+error.name
    track = TrackTheBest(notification)
    best_notification = track.notification_name
    checkpointbest = SaveBest(best_notification, filename("best"))
    extensions.extend([track, checkpointbest])

    if early_stopping:
        print("Early stopping")
        stopper = FinishIfNoImprovementAfterPlus(best_notification)
        extensions.append(stopper)

    #Other extensions
    if finish is not None:
        print("Force finish ", finish)
        extensions.append(FinishAfter(after_n_epochs=finish))

    if duration is not None:
        print("Stop after " , duration, " seconds")
        extensions.append(FinishAfterTime(duration))

    extensions.extend([
        Timing(),
        Printing()
    ])

    #Main loop
    main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions)

    print("Main loop start")
    main_loop.run()
hidden = tensor.mean(w1.apply(Xs), axis=1) y_hat = Softmax().apply(w2.apply(hidden)) w1.weights_init = w2.weights_init = IsotropicGaussian(0.01) w1.biases_init = w2.biases_init = Constant(0) w1.initialize() w2.initialize() cost = CategoricalCrossEntropy().apply(y, y_hat) cg = ComputationGraph(cost) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum() cost.name = "loss" # # the actual training of the model # main = MainLoop(data_stream = DataStream.default_stream( dataset, iteration_scheme=SequentialScheme(dataset.num_instances, batch_size=512)), algorithm = GradientDescent( cost = cost, parameters = cg.parameters, step_rule = AdaGrad()), extensions = [ ProgressBar(), #FinishAfter(after_n_epochs=10),
def create_network(inputs=None, batch=batch_size):
    """Build the gated PixelCNN graph and return its two cost variables.

    The input passes through one 7x7 ``GatedPixelCNN`` layer, ``n_layer``
    residual 3x3 ``GatedPixelCNN`` layers and a two-layer 1x1/output
    convolutional head.  Depending on the module-level ``MODE`` the output is
    read either as a 256-way softmax per sub-pixel or as a logistic.

    :param inputs: 4-D input variable; a fresh 'features' tensor4 is created
        when omitted.
    :param batch: batch size handed to every convolutional brick.
    :return: ``(cost, cost_bits_dim)``, named 'loglikelihood_nat' and
        'nnl_bits_dim' respectively.
    """
    if inputs is None:
        inputs = T.tensor4('features')

    pixels = T.cast(inputs, 'float32')
    if dataset != 'binarized_mnist':
        # Every dataset except binarized MNIST is divided by 255.
        pixels = pixels / 255.

    spatial = (img_dim, img_dim)
    width = h * n_channel

    # First gated layer: 7x7 filters, no residual connection.
    layer = GatedPixelCNN(
        name='gated_layer_0',
        filter_size=7,
        image_size=spatial,
        num_filters=width,
        num_channels=n_channel,
        batch_size=batch,
        weights_init=IsotropicGaussian(std=0.02, mean=0),
        biases_init=Constant(0.02),
        res=False
    )
    layer.initialize()
    v_stack, h_stack = layer.apply(pixels, pixels)

    # Remaining gated layers: 3x3 filters with residual connections.
    for depth in range(1, n_layer + 1):
        layer = GatedPixelCNN(
            name='gated_layer_{}'.format(depth),
            filter_size=3,
            image_size=spatial,
            num_channels=width,
            batch_size=batch,
            weights_init=IsotropicGaussian(std=0.02, mean=0),
            biases_init=Constant(0.02),
            res=True
        )
        layer.initialize()
        v_stack, h_stack = layer.apply(v_stack, h_stack)

    # Output head applied to the horizontal stack.
    head_layers = [
        Rectifier(),
        ConvolutionalNoFlip((1, 1), width, mask_type='B', name='1x1_conv_1'),
        Rectifier(),
        ConvolutionalNoFlip(*third_layer, mask_type='B', name='output_layer'),
    ]
    head = ConvolutionalSequence(
        head_layers,
        num_channels=width,
        batch_size=batch,
        image_size=spatial,
        border_mode='half',
        weights_init=IsotropicGaussian(std=0.02, mean=0),
        biases_init=Constant(0.02),
        tied_biases=False
    )
    head.initialize()
    scores = head.apply(h_stack)

    if MODE != '256ary':
        x_hat = Logistic().apply(scores)
        cost = BinaryCrossEntropy().apply(inputs, x_hat) * img_dim * img_dim
        bits = inputs * T.log2(x_hat) + (1.0 - inputs) * T.log2(1.0 - x_hat)
        cost_bits_dim = -bits.mean()
    else:
        # Move the 256 intensity scores to the last axis, one row per
        # sub-pixel.
        scores = scores.reshape((-1, 256, n_channel, img_dim, img_dim))
        scores = scores.dimshuffle(0, 2, 3, 4, 1)
        scores = scores.reshape((-1, 256))
        x_hat = Softmax().apply(scores)
        targets = T.cast(inputs, 'int64').flatten()
        cost = CategoricalCrossEntropy().apply(targets, x_hat) * img_dim * img_dim
        cost_bits_dim = categorical_crossentropy(log_softmax(scores), targets)

    cost_bits_dim.name = "nnl_bits_dim"
    cost.name = 'loglikelihood_nat'
    return cost, cost_bits_dim
def create_model(self, symbols_num = 500):
    """Assemble the attention reader and return its symbolic variables.

    Suffixes of variable names give the axis order: ``b`` example within the
    batch, ``t`` word within the document/question, ``f`` embedding feature,
    ``i`` candidate answer.

    :param symbols_num: number of rows of the word-embedding lookup table.
    :return: ``(cost, accuracy, mem_attention_bt, y_hat, context_bt,
        candidates_bi, candidates_bi_mask, y, context_mask_bt, question_bt,
        question_mask_bt)``.
    """

    # --- scan helpers used when candidate scores are summed attention ---

    def sum_prob_of_word(word_ix, sentence_ixs, sentence_attention_probs):
        # Attention mass on every position where the context holds word_ix.
        positions = tt.eq(sentence_ixs, word_ix).nonzero()[0]
        return sentence_attention_probs[positions].sum()

    def sum_probs_single_sentence(candidate_indices_i, sentence_ixs_t, sentence_attention_probs_t):
        # One example: a score per candidate word.
        per_candidate, _ = theano.scan(
            fn=sum_prob_of_word,
            sequences=[candidate_indices_i],
            non_sequences=[sentence_ixs_t, sentence_attention_probs_t])
        return per_candidate

    def sum_probs_batch(candidate_indices_bt, sentence_ixs_bt, sentence_attention_probs_bt):
        # Whole batch: loop over the examples.
        per_example, _ = theano.scan(
            fn=sum_probs_single_sentence,
            sequences=[candidate_indices_bt, sentence_ixs_bt, sentence_attention_probs_bt],
            non_sequences=None)
        return per_example

    # --- hyperparameters ---
    # Size of the encoder hidden state in each direction.
    enc_dim = self.args.encoder_hidden_dims
    # Size of the word-embedding space.
    emb_dim = self.args.source_embeddings_dim

    # --- inputs coming from the data stream ---
    context_bt = tt.lmatrix('context')
    # Mask telling real context symbols from the padding at the end.
    context_mask_bt = tt.matrix('context_mask')
    question_bt = tt.lmatrix('question')
    question_mask_bt = tt.matrix('question_mask')
    # Answers arrive as a 2-D matrix; only the first column is kept.
    y = tt.lmatrix('answer')[:, 0]
    # Candidates among which the answer is chosen.
    candidates_bi = tt.lmatrix("candidates")
    candidates_bi_mask = tt.matrix("candidates_mask")

    # --- components ---
    lookup = LookupTable(symbols_num, emb_dim, weights_init=Uniform(width=0.2))
    context_encoder = self.create_bidi_encoder("context_encoder", emb_dim, enc_dim)
    question_encoder = self.create_bidi_encoder("question_encoder", emb_dim, enc_dim)
    # The lookup table is the only component initialized here.
    lookup.initialize()

    # --- read the context document ---
    context_embedding_tbf = lookup.apply(context_bt.T)
    memory_encoded_btf = context_encoder.apply(
        context_embedding_tbf, context_mask_bt.T).dimshuffle(1, 0, 2)
    memory_encoded_btf.name = "memory_encoded_btf"

    # --- read the question the same way ---
    x_embedded_tbf = lookup.apply(question_bt.T)
    x_encoded_btf = question_encoder.apply(
        x_embedded_tbf, question_mask_bt.T).dimshuffle(1, 0, 2)

    # Query = forward half at the last position + backward half at the first.
    forward_last_bf = x_encoded_btf[:, -1, 0:enc_dim]
    backward_first_bf = x_encoded_btf[:, 0, enc_dim:enc_dim * 2]
    query_representation_bf = tt.concatenate(
        [forward_last_bf, backward_first_bf], axis=1)

    # --- attention over context positions ---
    mem_attention_presoft_bt = tt.batched_dot(
        query_representation_bf, memory_encoded_btf.dimshuffle(0, 2, 1))
    # TODO is this pre-masking necessary?
    mem_attention_presoft_masked_bt = tt.mul(mem_attention_presoft_bt, context_mask_bt)
    mem_attention_bt = SoftmaxWithMask(name="memory_query_softmax").apply(
        mem_attention_presoft_masked_bt, context_mask_bt)

    # --- candidate scores ---
    if not self.args.weighted_att:
        # Sum the attention of each candidate word across the whole context
        # document; this is the key innovation of the model.
        # TODO: Get rid of sentence-by-sentence processing?
        # TODO: Rewrite into matrix notation instead of scans?
        y_hat = sum_probs_batch(candidates_bi, context_bt, mem_attention_bt)
    else:
        # Attention-weighted average of the original word vectors ...
        att_weighted_responses_bf = theano.tensor.batched_dot(
            mem_attention_bt, context_embedding_tbf.dimshuffle(1, 0, 2))
        # ... compared with the embedding of every candidate answer ...
        candidates_embeddings_bfi = lookup.apply(candidates_bi).dimshuffle(0, 2, 1)
        y_hat_presoft = tt.batched_dot(att_weighted_responses_bf, candidates_embeddings_bfi)
        # ... and normalized into output symbol probabilities.
        y_hat = SoftmaxWithMask(name="output_softmax").apply(
            y_hat_presoft, candidates_bi_mask)
    y_hat.name = "y_hat"

    # By convention the ground truth is always candidate 0.
    y = y.zeros_like()

    # Cross entropy is the training objective.
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost.name = "cost"

    predicted_response_index = tt.argmax(y_hat, axis=1)
    accuracy = tt.eq(y, predicted_response_index).mean()
    accuracy.name = "accuracy"

    return cost, accuracy, mem_attention_bt, y_hat, context_bt, candidates_bi, candidates_bi_mask, y, context_mask_bt, question_bt, question_mask_bt
def _create_main_loop(self): # hyper parameters hp = self.params batch_size = hp['batch_size'] biases_init = Constant(0) batch_normalize = hp['batch_normalize'] ### Build fprop tensor5 = T.TensorType(config.floatX, (False,)*5) X = tensor5("images") #X = T.tensor4("images") y = T.lvector('targets') gnet_params = OrderedDict() #X_shuffled = X[:, :, :, :, [2, 1, 0]] #X_shuffled = gpu_contiguous(X.dimshuffle(0, 1, 4, 2, 3)) * 255 X = X[:, :, :, :, [2, 1, 0]] X_shuffled = X.dimshuffle((0, 1, 4, 2, 3)) * 255 X_r = X_shuffled.reshape((X_shuffled.shape[0], X_shuffled.shape[1]*X_shuffled.shape[2], X_shuffled.shape[3], X_shuffled.shape[4])) X_r = X_r - (np.array([104, 117, 123])[None, :, None, None]).astype('float32') expressions, input_data, param = stream_layer_exp(inputs = ('data', X_r), mode='rgb') res = expressions['outloss'] y_hat = res.flatten(ndim=2) import pdb; pdb.set_trace() ### Build Cost cost = CategoricalCrossEntropy().apply(y, y_hat) cost = T.cast(cost, theano.config.floatX) cost.name = 'cross_entropy' y_pred = T.argmax(y_hat, axis=1) misclass = T.cast(T.mean(T.neq(y_pred, y)), theano.config.floatX) misclass.name = 'misclass' monitored_channels = [] monitored_quantities = [cost, misclass, y_hat, y_pred] model = Model(cost) training_cg = ComputationGraph(monitored_quantities) inference_cg = ComputationGraph(monitored_quantities) ### Get evaluation function #training_eval = training_cg.get_theano_function(additional_updates=bn_updates) training_eval = training_cg.get_theano_function() #inference_eval = inference_cg.get_theano_function() # Dataset test = JpegHDF5Dataset('test', #name='jpeg_data_flows.hdf5', load_in_memory=True) #mean = np.load(os.path.join(os.environ['UCF101'], 'mean.npy')) import pdb; pdb.set_trace() ### Eval labels = np.zeros(test.num_video_examples) y_hat = np.zeros((test.num_video_examples, 101)) labels_flip = np.zeros(test.num_video_examples) y_hat_flip = np.zeros((test.num_video_examples, 101)) ### Important to shuffle list for batch 
normalization statistic #rng = np.random.RandomState() #examples_list = range(test.num_video_examples) #import pdb; pdb.set_trace() #rng.shuffle(examples_list) nb_frames=1 for i in xrange(24): scheme = HDF5SeqScheme(test.video_indexes, examples=test.num_video_examples, batch_size=batch_size, f_subsample=i, nb_subsample=25, frames_per_video=nb_frames) #for crop in ['upleft', 'upright', 'downleft', 'downright', 'center']: for crop in ['center']: stream = JpegHDF5Transformer( input_size=(240, 320), crop_size=(224, 224), #input_size=(256, 342), crop_size=(224, 224), crop_type=crop, translate_labels = True, flip='noflip', nb_frames = nb_frames, data_stream=ForceFloatX(DataStream( dataset=test, iteration_scheme=scheme))) stream_flip = JpegHDF5Transformer( input_size=(240, 320), crop_size=(224, 224), #input_size=(256, 342), crop_size=(224, 224), crop_type=crop, translate_labels = True, flip='flip', nb_frames = nb_frames, data_stream=ForceFloatX(DataStream( dataset=test, iteration_scheme=scheme))) ## Do the evaluation epoch = stream.get_epoch_iterator() for j, batch in enumerate(epoch): output = training_eval(batch[0], batch[1]) # import cv2 # cv2.imshow('img', batch[0][0, 0, :, :, :]) # cv2.waitKey(160) # cv2.destroyAllWindows() #import pdb; pdb.set_trace() labels_flip[batch_size*j:batch_size*(j+1)] = batch[1] y_hat_flip[batch_size*j:batch_size*(j+1), :] += output[2] preds = y_hat_flip.argmax(axis=1) misclass = np.sum(labels_flip != preds) / float(len(preds)) print i, crop, "flip Misclass:", misclass epoch = stream_flip.get_epoch_iterator() for j, batch in enumerate(epoch): output = training_eval(batch[0], batch[1]) labels[batch_size*j:batch_size*(j+1)] = batch[1] y_hat[batch_size*j:batch_size*(j+1), :] += output[2] preds = y_hat.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) print i, crop, "noflip Misclass:", misclass y_merge = y_hat + y_hat_flip preds = y_merge.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) print i, crop, 
"avg Misclass:", misclass ### Compute misclass y_hat += y_hat_flip preds = y_hat.argmax(axis=1) misclass = np.sum(labels != preds) / float(len(preds)) print "Misclass:", misclass
def main(save_to, num_epochs, flag, ksize):
    """Train a simple recurrent classifier on MNIST and keep the best model.

    The network is a linear input projection (784 -> 100), a
    ``SimpleRecurrent`` layer with ``Tanh`` activation, and a
    Rectifier/Softmax MLP (100 -> 100 -> 10) applied to the last entry of
    the recurrent output along its first axis.

    The MNIST "train" set is split into 50000 training and 10000
    validation examples.  Validation results are reported under the
    ``valid`` prefix and drive ``DataStreamMonitoringAndSaving``; test
    results are reported under the ``test`` prefix.

    Parameters
    ----------
    save_to : str
        Suffix of the saved model file; the path passed to the saving
        extension is ``'best_' + save_to + '.pkl'``.
    num_epochs : int
        Number of epochs after which training stops.
    flag, ksize
        Forwarded unchanged to ``_meanize`` together with the number of
        steps (20); their meaning is defined by that helper.
    """
    batch_size = 128
    dim = 100
    n_steps = 20

    # Bricks: input projection, recurrent layer and output classifier.
    i2h1 = MLP([Identity()], [784, dim],
               biases_init=Constant(0.),
               weights_init=IsotropicGaussian(.001))
    h2o1 = MLP([Rectifier(), Softmax()], [dim, dim, 10],
               biases_init=Constant(0.),
               weights_init=IsotropicGaussian(.001))
    rec1 = SimpleRecurrent(dim=dim, activation=Tanh(),
                           weights_init=Orthogonal())
    i2h1.initialize()
    h2o1.initialize()
    rec1.initialize()

    # Symbolic graph.
    x = tensor.tensor3('features')
    y = tensor.lmatrix('targets')
    preproc = i2h1.apply(x)
    h1 = rec1.apply(preproc)
    # Only the last entry of the recurrent output feeds the classifier.
    probs = tensor.flatten(h2o1.apply(h1[-1]), outdim=2)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)
    cost.name = 'final_cost'
    error_rate.name = 'error_rate'
    cg = ComputationGraph([cost, error_rate])

    def make_stream(dataset, num_examples):
        # Sequential mini-batches, flattened, then passed through the
        # mapping produced by ``_meanize``.
        batches = DataStream(
            dataset,
            iteration_scheme=SequentialScheme(num_examples, batch_size))
        return Mapping(Flatten(batches), _meanize(n_steps, flag, ksize))

    mnist_train = MNIST("train", subset=slice(0, 50000))
    mnist_valid = MNIST("train", subset=slice(50000, 60000))
    mnist_test = MNIST("test")
    trainstream = make_stream(mnist_train, 50000)
    validstream = make_stream(mnist_valid, 10000)
    teststream = make_stream(mnist_test, 10000)

    algorithm = GradientDescent(
        cost=cost, params=cg.parameters,
        step_rule=CompositeRule([Adam(), StepClipping(100)]))

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        # Test-set figures are only reported, never used for selection.
        DataStreamMonitoring(
            [cost, error_rate], teststream, prefix="test"),
        # The saving extension is fed the validation split so that the
        # test set is not used to choose the saved model.
        # NOTE(review): presumably this saves the listed bricks whenever
        # the monitored error rate improves -- confirm against the
        # definition of DataStreamMonitoringAndSaving.
        DataStreamMonitoringAndSaving(
            [cost, error_rate], validstream, [i2h1, h2o1, rec1],
            'best_' + save_to + '.pkl',
            cost_name=error_rate.name,
            after_epoch=True,
            prefix='valid'),
        TrainingDataMonitoring(
            [cost, aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train", after_epoch=True),
        Printing(),
    ]

    main_loop = MainLoop(algorithm, trainstream, extensions=extensions)
    main_loop.run()
def main():
    """Train a 784-100-10 Tanh/Softmax network on MNIST for 20 epochs.

    Builds the model from two ``Linear`` bricks, trains it with
    ``GradientDescent`` on batches of 256 training examples, and monitors
    cost and accuracy ("correct_rate") on the training batches and on the
    test set (batches of 1024).  A ``Plot`` extension is configured with
    the train and test accuracy channels.
    """
    print("Build the network")

    # Forward pass: features -> tanh hidden layer -> softmax output.
    features = tensor.matrix('features')
    input_to_hidden = Linear(name='input_to_hidden',
                             input_dim=784, output_dim=100)
    hidden = Tanh().apply(input_to_hidden.apply(features))
    hidden_to_output = Linear(name='hidden_to_output',
                              input_dim=100, output_dim=10)
    predictions = Softmax().apply(hidden_to_output.apply(hidden))

    # Cost and accuracy against the integer targets.
    targets = tensor.lmatrix('targets')
    cost = CategoricalCrossEntropy().apply(targets.flatten(), predictions)
    misclassified = MisclassificationRate().apply(targets.flatten(),
                                                  predictions)
    correct_rate = 1 - misclassified
    correct_rate.name = 'correct_rate'
    print(type(correct_rate))
    cost.name = 'cost'
    cg = ComputationGraph(cost)

    # Both linear bricks share the same initialization schemes.
    layers = (input_to_hidden, hidden_to_output)
    shared_weights_init = IsotropicGaussian(0.01)
    for layer in layers:
        layer.weights_init = shared_weights_init
    shared_biases_init = Constant(0)
    for layer in layers:
        layer.biases_init = shared_biases_init
    for layer in layers:
        layer.initialize()

    print("Prepare the data.")
    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    # Training batches, visited sequentially.
    train_scheme = SequentialScheme(mnist_train.num_examples, batch_size=256)
    data_stream_train = DataStream(mnist_train,
                                   iteration_scheme=train_scheme)

    # Optimizer: the two step rules are applied as a composite.
    step_rule = CompositeRule([Scale(0.9), StepSwitcher(0.05, 0.1)])
    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    # Monitoring on the test set (every epoch) and on the training
    # batches (every batch).
    test_scheme = SequentialScheme(mnist_test.num_examples, batch_size=1024)
    data_stream_test = DataStream(mnist_test, iteration_scheme=test_scheme)
    test_monitor = DataStreamMonitoring(
        variables=[cost, correct_rate],
        data_stream=data_stream_test,
        prefix="test",
        after_every_epoch=True)
    train_monitor = TrainingDataMonitoring(
        variables=[cost, correct_rate, algorithm.total_step_norm],
        prefix='train',
        after_every_batch=True)

    # Plot of train vs. test accuracy, updated after every batch.
    plot = Plot(document='new',
                channels=[['train_correct_rate', 'test_correct_rate']],
                start_server=True,
                after_every_batch=True)

    print("Start training")
    extensions = [
        plot,
        test_monitor,
        train_monitor,
        FinishAfter(after_n_epochs=20),
        Printing(),
    ]
    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=data_stream_train,
                         extensions=extensions)
    main_loop.run()