def main():
    # Autoencoder: 784 -> 500 -> 20 -> 500 -> 784, trained with Adam on a
    # binary cross-entropy reconstruction loss.
    x = tensor.matrix("features")

    input_to_hidden1 = get_typical_layer(x, 784, 500)
    #hidden1_to_hidden2 = get_typical_layer(input_to_hidden1, 500, 300)
    hidden1_to_latent = get_typical_layer(input_to_hidden1, 500, 20)

    latent_to_hidden2 = get_typical_layer(hidden1_to_latent, 20, 500)
    #hidden3_to_hidden4 = get_typical_layer(latent_to_hidden3, 300, 500)
    hidden2_to_output = get_typical_layer(latent_to_hidden2, 500, 784, Logistic())
    hidden2_to_output.name = "last_before_output"

    from blocks.bricks.cost import SquaredError, AbsoluteError, BinaryCrossEntropy
    from blocks.graph import ComputationGraph
    from blocks.algorithms import Adam, GradientDescent, Scale
    from blocks.roles import WEIGHT
    from blocks.filter import VariableFilter  # needed for the WEIGHT filter below

    cost = BinaryCrossEntropy(name="error").apply(x, hidden2_to_output)
    cg = ComputationGraph(cost)
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    #cost += 0.0001 * tensor.sum(map(lambda x: (x**2).sum(), weights))
    #cost.name = "regularized error"

    gd = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam())

    from blocks.main_loop import MainLoop
    from blocks.extensions import FinishAfter, Printing, ProgressBar
    from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring

    monitor = TrainingDataMonitoring([cost], after_epoch=True)
    main_loop = MainLoop(data_stream=get_data_stream(), algorithm=gd,
                         extensions=[monitor, FinishAfter(after_n_epochs=5),
                                     ProgressBar(), Printing()])
    main_loop.run()
    showcase(cg, "last_before_output")

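The helpers get_typical_layer, get_data_stream, and showcase are defined elsewhere in this project. Judging from how it is called above (input variable, input/output dims, optional activation brick), get_typical_layer presumably wraps a Linear brick plus an activation; a hypothetical sketch, not the original implementation:

# Hypothetical sketch of get_typical_layer: Linear + activation, with
# Rectifier assumed as the default activation.
from blocks.bricks import Linear, Rectifier
from blocks.initialization import IsotropicGaussian, Constant

def get_typical_layer(input_, input_dim, output_dim, activation=None):
    linear = Linear(input_dim=input_dim, output_dim=output_dim,
                    weights_init=IsotropicGaussian(0.01),
                    biases_init=Constant(0))
    linear.initialize()
    activation = Rectifier() if activation is None else activation
    return activation.apply(linear.apply(input_))
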
def __init__(self, params, feature_source, input_dim):
    # Classifier graph for a single feature source, trained with
    # batch normalization.
    super(MaxoutMLP, self).__init__(params)
    self.x = tensor.matrix(feature_source, dtype='float32')
    self.y = tensor.matrix('genres', dtype='int32')
    mlp = MLPGenreClassifier(input_dim, self.params['n_classes'],
                             self.params['hidden_size'],
                             self.params['init_ranges'])
    mlp.initialize()
    with batch_normalization(mlp):
        self.y_hat = mlp.apply(self.x)
    self.cost = BinaryCrossEntropy().apply(self.y, self.y_hat)

def __init__(self, params):
    # Mixture-of-experts classifier over visual (VGG) and textual features.
    super(MoETrainer, self).__init__(params)
    self.x_v = tensor.matrix('vgg_features', dtype='float32')
    self.x_t = tensor.matrix('features', dtype='float32')
    self.y = tensor.matrix('genres', dtype='int32')
    model = MoEClassifier(params['visual_dim'], params['textual_dim'],
                          params['n_classes'], params['hidden_size'],
                          params['init_ranges'])
    model.initialize()
    with batch_normalization(model):
        self.y_hat = model.apply(self.x_v, self.x_t)
    self.cost = BinaryCrossEntropy().apply(self.y, self.y_hat)

def __init__(self, params):
    # Early-fusion baseline: concatenate visual and textual features and
    # feed the result to a single MLP classifier.
    super(ConcatenateTrainer, self).__init__(params)
    x_v = tensor.matrix('vgg_features', dtype='float32')
    x_t = tensor.matrix('features', dtype='float32')
    self.x = tensor.concatenate([x_v, x_t], axis=1)
    self.y = tensor.matrix('genres', dtype='int32')
    input_dim = params['visual_dim'] + params['textual_dim']
    mlp = MLPGenreClassifier(input_dim, self.params['n_classes'],
                             self.params['hidden_size'],
                             self.params['init_ranges'])
    mlp.initialize()
    with batch_normalization(mlp):
        self.y_hat = mlp.apply(self.x)
    self.cost = BinaryCrossEntropy().apply(self.y, self.y_hat)

def create_training_computation_graphs():
    x = tensor.tensor4('features')
    y = tensor.imatrix('targets')

    convnet, mlp = create_model_bricks()
    y_hat = mlp.apply(convnet.apply(x).flatten(ndim=2))
    cost = BinaryCrossEntropy().apply(y, y_hat)
    accuracy = 1 - tensor.neq(y > 0.5, y_hat > 0.5).mean()
    cg = ComputationGraph([cost, accuracy])

    # Create a graph which uses batch statistics for batch normalization
    # as well as dropout on selected variables
    bn_cg = apply_batch_normalization(cg)
    bricks_to_drop = ([convnet.layers[i] for i in (5, 11, 17)] +
                      [mlp.application_methods[1].brick])
    variables_to_drop = VariableFilter(
        roles=[OUTPUT], bricks=bricks_to_drop)(bn_cg.variables)
    bn_dropout_cg = apply_dropout(bn_cg, variables_to_drop, 0.5)

    return cg, bn_dropout_cg

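A hedged usage sketch for the pair of graphs returned above: training typically runs on the batch-norm/dropout graph's cost, while moving averages of the population statistics (used by the clean inference graph) are maintained on the side. This is an illustration under those assumptions, not the original training script:

from blocks.algorithms import GradientDescent, Adam
from blocks.graph import get_batch_normalization_updates

cg, bn_dropout_cg = create_training_computation_graphs()

# Optimize the cost of the stochastic (batch-norm + dropout) graph...
algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0],
                            parameters=bn_dropout_cg.parameters,
                            step_rule=Adam())

# ...while updating the population statistics consumed by the clean
# graph `cg` with an exponential moving average.
pop_updates = get_batch_normalization_updates(bn_dropout_cg)
algorithm.add_updates([(p, 0.05 * m + 0.95 * p) for p, m in pop_updates])
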
def build_autoencoder(features, labels_num, labels_cat):
    mlp_bottom = MLP(
        activations=[Rectifier(), Rectifier(), Rectifier(), Rectifier(), Rectifier()],
        dims=[24033, 5000, 1000, 100, 1000, 5000],
        weights_init=IsotropicGaussian(),
        biases_init=Constant(1))
    mlp_bottom.initialize()

    mlp_top = build_top_mlp()
    mlp_top.push_initialization_config()
    mlp_top.initialize()

    #a = mlp_bottom.apply(features)
    #b = mlp_top.apply(a)

    # Construct the feedforward sequence
    ss_seq = Sequence([mlp_bottom.apply, mlp_top.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    [outputs_numerical, outputs_categorical] = ss_seq.apply(features)
    cost = (SquaredError().apply(labels_num, outputs_numerical) +
            BinaryCrossEntropy().apply(labels_cat, outputs_categorical))

    cg = ComputationGraph(cost)
    #cg_dropout0 = apply_dropout(cg, [VariableFilter(roles=[INPUT])(cg.variables)[1]], .2)
    #cg_dropout1 = apply_dropout(cg, [VariableFilter(roles=[OUTPUT])(cg.variables)[1],
    #                                 VariableFilter(roles=[OUTPUT])(cg.variables)[3]], .2)
    #cost_dropout1 = cg_dropout1.outputs[0]

    return cost, cg.parameters

def main(name, dataset, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim, oldmodel):
    image_size, data_train, data_valid, data_test = datasets.get_data(dataset)

    train_stream = Flatten(DataStream(
        data_train,
        iteration_scheme=SequentialScheme(data_train.num_examples, batch_size)))
    valid_stream = Flatten(DataStream(
        data_valid,
        iteration_scheme=SequentialScheme(data_valid.num_examples, batch_size)))
    test_stream = Flatten(DataStream(
        data_test,
        iteration_scheme=SequentialScheme(data_test.num_examples, batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * read_N ** 2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"

    #----------------------------------------------------------------------
    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.
            E.g.:
                0.1   -> 11
                0.01  -> 12
                0.001 -> 13
                0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = time.strftime("%Y%m%d-%H%M%S") + "-" + name
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100)
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------
    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(train_monitors, prefix="train", after_epoch=True),
            #DataStreamMonitoring(monitors, valid_stream,
            #                     ## updates=scan_updates,
            #                     prefix="valid"),
            DataStreamMonitoring(monitors, test_stream,
                                 ## updates=scan_updates,
                                 prefix="test"),
            Checkpoint(name, before_training=False, after_epoch=True,
                       save_separately=['log', 'model']),
            #Checkpoint(image_size=image_size, save_subdir=subdir,
            #           path=pickle_file, before_training=False,
            #           after_epoch=True, save_separately=['log', 'model']),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()

def main(name, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim):
    if name is None:
        tag = "watt" if attention else "woatt"
        name = "%s-t%d-enc%d-dec%d-z%d" % (tag, n_iter, enc_dim, dec_dim, z_dim)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------

    x_dim = 28 * 28
    img_height, img_width = (28, 28)

    rnninits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    prior_mu = T.zeros([z_dim])
    prior_log_sigma = T.zeros([z_dim])

    if attention:
        read_N = 4
        write_N = 6
        read_dim = 2 * read_N ** 2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Tanh()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    for brick in [reader, writer, encoder, decoder,
                  encoder_mlp, decoder_mlp, q_sampler]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    # This is one iteration
    def one_iteration(c, h_enc, c_enc, z_mean, z_log_sigma, z,
                      h_dec, c_dec, x):
        x_hat = x - T.nnet.sigmoid(c)
        r = reader.apply(x, x_hat, h_dec)
        i_enc = encoder_mlp.apply(T.concatenate([r, h_dec], axis=1))
        h_enc, c_enc = encoder.apply(states=h_enc, cells=c_enc,
                                     inputs=i_enc, iterate=False)
        z_mean, z_log_sigma, z = q_sampler.apply(h_enc)
        i_dec = decoder_mlp.apply(z)
        h_dec, c_dec = decoder.apply(states=h_dec, cells=c_dec,
                                     inputs=i_dec, iterate=False)
        c = c + writer.apply(h_dec)
        return c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec

    outputs_info = [
        T.zeros([batch_size, x_dim]),    # c
        T.zeros([batch_size, enc_dim]),  # h_enc
        T.zeros([batch_size, enc_dim]),  # c_enc
        T.zeros([batch_size, z_dim]),    # z_mean
        T.zeros([batch_size, z_dim]),    # z_log_sigma
        T.zeros([batch_size, z_dim]),    # z
        T.zeros([batch_size, dec_dim]),  # h_dec
        T.zeros([batch_size, dec_dim]),  # c_dec
    ]

    outputs, scan_updates = theano.scan(fn=one_iteration,
                                        sequences=[],
                                        outputs_info=outputs_info,
                                        non_sequences=[x],
                                        n_steps=n_iter)

    c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec = outputs

    kl_terms = (
        prior_log_sigma - z_log_sigma
        + 0.5 * (tensor.exp(2 * z_log_sigma) + (z_mean - prior_mu) ** 2)
        / tensor.exp(2 * prior_log_sigma)
        - 0.5
    ).sum(axis=-1)

    x_recons = T.nnet.sigmoid(c[-1, :, :])
    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            #StepClipping(3.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t, :, :])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t, recons_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------
    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=None,
        data_stream=ForceFloatX(DataStream(
            mnist_train,
            iteration_scheme=SequentialScheme(mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(DataStream(
                    mnist_test,
                    iteration_scheme=SequentialScheme(mnist_test.num_examples, batch_size))),
                updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(train_monitors, prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()

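For reference, the kl_terms expression computed in the scan above is the closed-form KL divergence between the diagonal-Gaussian posterior $Q = \mathcal{N}(\mu_q, \operatorname{diag}(\sigma_q^2))$ produced by the Qsampler and the prior $P = \mathcal{N}(\mu_p, \operatorname{diag}(\sigma_p^2))$, summed over latent dimensions:

$$D_{\mathrm{KL}}(Q \,\|\, P) = \sum_i \left( \log \sigma_{p,i} - \log \sigma_{q,i} + \frac{\sigma_{q,i}^2 + (\mu_{q,i} - \mu_{p,i})^2}{2 \sigma_{p,i}^2} - \frac{1}{2} \right)$$

The code stores log-sigmas, so variances enter as $\sigma^2 = \exp(2 \log \sigma)$; the per-step terms are averaged over the batch and summed over the n_iter steps to form the KL part of nll_bound.
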
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    dataset_train = IterableDataset(
        generate_data(max_seq_length, batch_size, num_batches))
    dataset_test = IterableDataset(
        generate_data(max_seq_length, batch_size, 100))

    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(1, lstm_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost],
                                        data_stream=stream_test, prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train",
                                           after_epoch=True)

    main_loop = MainLoop(algorithm, stream_train,
                         extensions=[test_monitor, train_monitor,
                                     FinishAfter(after_n_epochs=num_epochs),
                                     Printing(), ProgressBar()])
    main_loop.run()

    print('Learned weights:')
    for layer in (x_to_h, lstm, h_to_o):
        print("Layer '%s':" % layer.name)
        for param in layer.parameters:
            print(param.name, ': ', param.get_value())
        print()

    return main_loop

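The generate_data helper is not shown in this snippet. A minimal sketch of what it could look like, assuming a toy task where each sequence is labeled by whether its mean exceeds 0.5 (both the task and the shapes are assumptions, chosen to satisfy the IterableDataset interface and the T.tensor3('x') / T.matrix('y') variables above):

# Hypothetical generate_data: for each of num_batches batches, 'x' is shaped
# (time, batch, 1) and 'y' is shaped (batch, 1).
from collections import OrderedDict
import numpy as np

def generate_data(max_seq_length, batch_size, num_batches):
    x = np.random.rand(num_batches, max_seq_length, batch_size, 1).astype('float32')
    y = (x.mean(axis=1) > 0.5).astype('float32')
    return OrderedDict([('x', x), ('y', y)])
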
        #4096, 128,
        #4096,
    ],  # (snippet truncated: the enclosing MLP definition begins above)
    weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
    biases_init=Constant(W_b))

# Don't forget to initialize params:
autoencoder.initialize()

# Reconstruction layer:
x_hat = Linear(input_dim=128,
               output_dim=input_dim[side]).apply(autoencoder.apply(x))

# Define a cost function to optimize, and a classification error rate.
# Also apply the outputs from the net and corresponding targets:
cost = BinaryCrossEntropy().apply(x, x_hat)

# This is the model: before applying dropout
autoencoder = Model(cost)

# Need to define the computation graph for the cost func:
cost_graph = ComputationGraph([cost])

# This returns a list of weight vectors for each layer
W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

# Add some regularization to this model:
cost += weight_decay * l2_norm(W)
cost.name = 'entropy'

# computational graph with l2 reg

from hinton import hinton

x = tensor.matrix('x')
y = tensor.lmatrix('y')

mlp = MLP(activations=[Logistic(), Softmax()],
          dims=[117, 55, 2],
          weights_init=IsotropicGaussian(),
          biases_init=Constant(0.01))
mlp.initialize()

y_hat = mlp.apply(x)

cost = BinaryCrossEntropy().apply(y, y_hat)
cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.001 * abs(W1).sum() + 0.001 * abs(W2).sum()
cost.name = 'cost'

error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)
error_rate.name = 'error_rate'

algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.1))

train_set = H5PYDataset('mushrooms.hdf5', which_sets=('train',))

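The snippet stops right after loading the train split. One plausible continuation (an assumption, not part of the original) wires the dataset into a stream and a main loop:

from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from blocks.main_loop import MainLoop
from blocks.extensions import FinishAfter, Printing
from blocks.extensions.monitoring import TrainingDataMonitoring

# Stream minibatches from the H5PY train split and train for a few epochs.
train_stream = DataStream.default_stream(
    train_set,
    iteration_scheme=SequentialScheme(train_set.num_examples, batch_size=128))

main_loop = MainLoop(
    algorithm, train_stream,
    extensions=[TrainingDataMonitoring([cost, error_rate], after_epoch=True),
                FinishAfter(after_n_epochs=10),
                Printing()])
main_loop.run()
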
def main(name, epochs, batch_size, learning_rate):
    if name is None:
        name = "att-rw"

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print()

    #------------------------------------------------------------------------

    img_height, img_width = 28, 28

    read_N = 12
    write_N = 14

    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }

    x_dim = img_height * img_width

    reader = ZoomableAttentionWindow(img_height, img_width, read_N)
    writer = ZoomableAttentionWindow(img_height, img_width, write_N)

    # Parameterize the attention reader and writer
    mlpr = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="RMLP", **inits)
    mlpw = MLP(activations=[Tanh(), Identity()],
               dims=[x_dim, 50, 5],
               name="WMLP", **inits)

    # MLP between the reader and writer
    mlp = MLP(activations=[Tanh(), Identity()],
              dims=[read_N ** 2, 300, write_N ** 2],
              name="MLP", **inits)

    for brick in [mlpr, mlpw, mlp]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    hr = mlpr.apply(x)
    hw = mlpw.apply(x)

    center_y, center_x, delta, sigma, gamma = reader.nn2att(hr)
    r = reader.read(x, center_y, center_x, delta, sigma)

    h = mlp.apply(r)

    center_y, center_x, delta, sigma, gamma = writer.nn2att(hw)
    c = writer.write(h, center_y, center_x, delta, sigma) / gamma
    x_recons = T.nnet.sigmoid(c)

    cost = BinaryCrossEntropy().apply(x, x_recons)
    cost.name = "cost"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            RemoveNotFinite(),
            Adam(learning_rate),
            StepClipping(3.),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    #for v in [center_y, center_x, log_delta, log_sigma, log_gamma]:
    #    v_mean = v.mean()
    #    v_mean.name = v.name
    #    monitors += [v_mean]
    #    monitors += [aggregation.mean(v)]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["cost"],
    ]

    #------------------------------------------------------------
    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(DataStream(
            mnist_train,
            iteration_scheme=SequentialScheme(mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(DataStream(
                    mnist_test,
                    iteration_scheme=SequentialScheme(mnist_test.num_examples, batch_size))),
                prefix="test"),
            TrainingDataMonitoring(train_monitors, prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            #Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()

def create_network(inputs=None, batch=batch_size):
    if inputs is None:
        inputs = T.tensor4('features')
    x = T.cast(inputs, 'float32')
    x = x / 255. if dataset != 'binarized_mnist' else x

    # GatedPixelCNN
    gated = GatedPixelCNN(name='gated_layer_0',
                          filter_size=7,
                          image_size=(img_dim, img_dim),
                          num_filters=h * n_channel,
                          num_channels=n_channel,
                          batch_size=batch,
                          weights_init=IsotropicGaussian(std=0.02, mean=0),
                          biases_init=Constant(0.02),
                          res=False)
    gated.initialize()
    x_v, x_h = gated.apply(x, x)

    for i in range(n_layer):
        gated = GatedPixelCNN(name='gated_layer_{}'.format(i + 1),
                              filter_size=3,
                              image_size=(img_dim, img_dim),
                              num_channels=h * n_channel,
                              batch_size=batch,
                              weights_init=IsotropicGaussian(std=0.02, mean=0),
                              biases_init=Constant(0.02),
                              res=True)
        gated.initialize()
        x_v, x_h = gated.apply(x_v, x_h)

    conv_list = []
    conv_list.extend([Rectifier(),
                      ConvolutionalNoFlip((1, 1), h * n_channel,
                                          mask_type='B', name='1x1_conv_1')])
    #conv_list.extend([Rectifier(), ConvolutionalNoFlip((1,1), h*n_channel,
    #                                                   mask='B', name='1x1_conv_2')])
    conv_list.extend([Rectifier(),
                      ConvolutionalNoFlip(*third_layer, mask_type='B',
                                          name='output_layer')])

    sequence = ConvolutionalSequence(conv_list,
                                     num_channels=h * n_channel,
                                     batch_size=batch,
                                     image_size=(img_dim, img_dim),
                                     border_mode='half',
                                     weights_init=IsotropicGaussian(std=0.02, mean=0),
                                     biases_init=Constant(0.02),
                                     tied_biases=False)
    sequence.initialize()
    x = sequence.apply(x_h)

    if MODE == '256ary':
        x = x.reshape((-1, 256, n_channel, img_dim, img_dim)).dimshuffle(0, 2, 3, 4, 1)
        x = x.reshape((-1, 256))
        x_hat = Softmax().apply(x)
        inp = T.cast(inputs, 'int64').flatten()
        cost = CategoricalCrossEntropy().apply(inp, x_hat) * img_dim * img_dim
        cost_bits_dim = categorical_crossentropy(log_softmax(x), inp)
    else:
        x_hat = Logistic().apply(x)
        cost = BinaryCrossEntropy().apply(inputs, x_hat) * img_dim * img_dim
        #cost = T.nnet.binary_crossentropy(x_hat, inputs)
        #cost = cost.sum() / inputs.shape[0]
        cost_bits_dim = -(inputs * T.log2(x_hat) +
                          (1.0 - inputs) * T.log2(1.0 - x_hat)).mean()

    cost_bits_dim.name = "nnl_bits_dim"
    cost.name = 'loglikelihood_nat'

    return cost, cost_bits_dim

def create_network(inputs=None, batch=batch_size):
    if inputs is None:
        inputs = T.tensor4('features')
    x = T.cast(inputs, 'float32')
    x = x / 255. if dataset != 'binarized_mnist' else x

    # PixelCNN architecture
    conv_list = [ConvolutionalNoFlip(*first_layer, mask='A', name='0'),
                 Rectifier()]
    for i in range(n_layer):
        conv_list.extend([ConvolutionalNoFlip(*second_layer, mask='B',
                                              name=str(i + 1)),
                          Rectifier()])

    conv_list.extend([ConvolutionalNoFlip((1, 1), h * n_channel, mask='B',
                                          name=str(n_layer + 1)),
                      Rectifier()])
    conv_list.extend([ConvolutionalNoFlip((1, 1), h * n_channel, mask='B',
                                          name=str(n_layer + 2)),
                      Rectifier()])
    conv_list.extend([ConvolutionalNoFlip(*third_layer, mask='B',
                                          name=str(n_layer + 3))])

    sequence = ConvolutionalSequence(conv_list,
                                     num_channels=n_channel,
                                     batch_size=batch,
                                     image_size=(img_dim, img_dim),
                                     border_mode='half',
                                     weights_init=IsotropicGaussian(std=0.05, mean=0),
                                     biases_init=Constant(0.02),
                                     tied_biases=False)
    sequence.initialize()
    x = sequence.apply(x)

    if MODE == '256ary':
        x = x.reshape((-1, 256, n_channel, img_dim, img_dim)).dimshuffle(0, 2, 3, 4, 1)
        x = x.reshape((-1, 256))
        x_hat = Softmax().apply(x)
        inp = T.cast(inputs, 'int64').flatten()
        cost = CategoricalCrossEntropy().apply(inp, x_hat) * img_dim * img_dim
        cost_bits_dim = categorical_crossentropy(log_softmax(x), inp)
    else:
        x_hat = Logistic().apply(x)
        cost = BinaryCrossEntropy().apply(inputs, x_hat) * img_dim * img_dim
        #cost = T.nnet.binary_crossentropy(x_hat, inputs)
        #cost = cost.sum() / inputs.shape[0]
        cost_bits_dim = -(inputs * T.log2(x_hat) +
                          (1.0 - inputs) * T.log2(1.0 - x_hat)).mean()

    cost_bits_dim.name = "nnl_bits_dim"
    cost.name = 'loglikelihood_nat'

    return cost, cost_bits_dim

def train(self):
    x = self.sharedBatch['x']
    x.name = 'x_myinput'
    x_mask = self.sharedBatch['x_mask']
    x_mask.name = 'x_mask_myinput'
    y = self.sharedBatch['y']
    y.name = 'y_myinput'
    if self.usePro:
        proportion = self.sharedBatch['pro']
        proportion.name = 'pro'

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(self.input_dimx1, self.dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(self.dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(self.dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform, mask=x_mask)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    if self.usePro:
        cost = BinaryCrossEntropyProp().apply(y, y_hat, proportion)
    else:
        cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    self.f = theano.function(inputs=[], outputs=y_hat)
    self.lastH = theano.function(inputs=[], outputs=h[-1])
    self.cg = ComputationGraph(cost)
    m = Model(cost)

    algorithm = GradientDescent(cost=cost,
                                parameters=self.cg.parameters,
                                step_rule=RMSProp(learning_rate=0.01),
                                on_unused_sources='ignore')

    valid_monitor = DataStreamMonitoringShared(variables=[cost],
                                               data_stream=self.stream_valid_int,
                                               prefix="valid",
                                               sharedBatch=self.sharedBatch,
                                               sharedData=self.sharedData)
    train_monitor = TrainingDataMonitoring(variables=[cost],
                                           prefix="train",
                                           after_epoch=True)

    sharedVarMonitor = SwitchSharedReferences(self.sharedBatch, self.sharedData)
    tBest = self.track_best('valid_cost', self.cg)
    self.tracker = tBest[0]
    extensions = [sharedVarMonitor, valid_monitor] + tBest

    if self.debug:
        extensions.append(Printing())

    self.algorithm = algorithm
    self.extensions = extensions
    self.model = m
    self.mainloop = MainLoop(self.algorithm, self.stream_train_int,
                             extensions=self.extensions, model=self.model)
    self.main_loop(True)

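BinaryCrossEntropyProp is a project-specific brick not shown here. A hypothetical sketch consistent with the three-argument apply call above, assuming the 'proportion' acts as a per-example weight on the standard binary cross-entropy (the weighting scheme is an assumption):

from theano import tensor
from blocks.bricks.base import application
from blocks.bricks.cost import Cost

class BinaryCrossEntropyProp(Cost):
    """Binary cross-entropy scaled by a per-example weight ('proportion')."""
    @application(outputs=["cost"])
    def apply(self, y, y_hat, proportion):
        # standard BCE per element, weighted before averaging
        bce = tensor.nnet.binary_crossentropy(y_hat, y)
        return (proportion * bce).mean()
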
def main(name, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim):
    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation.
            E.g.:
                0.1   -> 11
                0.01  -> 12
                0.001 -> 13
                0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    if name is None:
        tag = "watt" if attention else "woatt"
        lr_str = lr_tag(learning_rate)
        name = "%s-t%d-enc%d-dec%d-z%d-lr%s" % (
            tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------

    x_dim = 28 * 28
    img_height, img_width = (28, 28)

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention:
        read_N = 4
        write_N = 7
        read_dim = 2 * read_N ** 2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Tanh()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100)
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(3.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    """
    for t in range(n_iter):
        kl_term_t = kl_terms[t,:].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t,:,:])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t, recons_term_t]
    """
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------
    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(DataStream(
            mnist_train,
            iteration_scheme=SequentialScheme(mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(DataStream(
                    mnist_test,
                    iteration_scheme=SequentialScheme(mnist_test.num_examples, batch_size))),
                ## updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(train_monitors, prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()

def train(self):
    #print(self.sharedBatch.keys())
    x = self.sharedBatch['x']
    x.name = 'x_myinput'
    xmask = self.sharedBatch['xmask']
    xmask.name = 'xmask_myinput'
    xmini = self.sharedBatch['xmini']
    xmini.name = 'xmini_myinput'
    xmini_mask = self.sharedBatch['xmini_mask']
    xmini_mask.name = 'xmini_mask_myinput'
    y = self.sharedBatch['y']
    y.name = 'y_myinput'
    xattr = self.sharedBatch['xattr']
    xattr.name = 'xattr_myinput'

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(self.input_dimx, self.dim, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    xmini_to_h = Linear(self.input_dimxmini, self.mini_dim * 4,
                        name='xmini_to_h',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))
    comb_to_h = Linear(self.input_dimxattr + self.summary_dim,
                       self.input_dimxattr + self.summary_dim,
                       name='comb_to_h',
                       weights_init=IsotropicGaussian(),
                       biases_init=Constant(0.0))

    lstmwmini = LSTMwMini(dim=self.dim, mini_dim=self.mini_dim,
                          summary_dim=self.summary_dim)

    mlp = MLP(activations=[Rectifier()],
              dims=[self.summary_dim + self.input_dimxattr, self.summary_dim],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(0.0))

    h_to_o = Linear(self.summary_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    xmini_transform = xmini_to_h.apply(xmini)

    h, c = lstmwmini.apply(x=x_transform, xmini=xmini_transform,
                           xmask=xmask, xmini_mask=xmini_mask)

    attr_and_rnn = T.concatenate([xattr, h[-1]], axis=1)
    #self.f = theano.function(inputs=[], outputs=attr_and_rnn)
    #print(self.summary_dim)
    #print(self.input_dimx)
    #print("self.f === ")
    #print(self.f())
    #print(self.f().shape)
    #print("====")

    comb_transform = comb_to_h.apply(attr_and_rnn)
    mlp_transform = mlp.apply(comb_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    #y_hat = h_to_o.apply(h[-1])
    y_hat = h_to_o.apply(mlp_transform)
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstmwmini.initialize()
    x_to_h.initialize()
    xmini_to_h.initialize()
    comb_to_h.initialize()
    mlp.initialize()
    h_to_o.initialize()

    self.f = theano.function(inputs=[], outputs=y_hat)
    #print("self.f === ")
    #print(self.f())
    #print(self.f().shape)
    #print("====")

    self.cg = ComputationGraph(cost)
    m = Model(cost)

    algorithm = GradientDescent(cost=cost,
                                parameters=self.cg.parameters,
                                step_rule=RMSProp(learning_rate=0.01),
                                on_unused_sources='ignore')

    valid_monitor = DataStreamMonitoringShared(variables=[cost],
                                               data_stream=self.stream_valid_int,
                                               prefix="valid",
                                               sharedBatch=self.sharedBatch,
                                               sharedData=self.sharedData)
    train_monitor = TrainingDataMonitoring(variables=[cost],
                                           prefix="train",
                                           after_epoch=True)

    sharedVarMonitor = SwitchSharedReferences(self.sharedBatch, self.sharedData)
    tBest = self.track_best('valid_cost', self.cg)
    self.tracker = tBest[0]
    extensions = [sharedVarMonitor, valid_monitor] + tBest

    if self.debug:
        extensions.append(Printing())

    self.algorithm = algorithm
    self.extensions = extensions
    self.model = m
    self.mainloop = MainLoop(self.algorithm, self.stream_train_int,
                             extensions=self.extensions, model=self.model)
    self.main_loop(True)