def demo_compare_dtp_optimizers(
        hidden_sizes=[240],
        n_epochs=10,
        minibatch_size=20,
        n_tests=20,
        hidden_activation='tanh',
        ):

    dataset = get_mnist_dataset(flat=True).to_onehot()

    if is_test_mode():
        dataset = dataset.shorten(200)
        n_epochs = 1
        n_tests = 2

    def make_dtp_net(optimizer_constructor, output_fcn):
        return DifferenceTargetMLP.from_initializer(
            input_size=dataset.input_size,
            output_size=dataset.target_size,
            hidden_sizes=hidden_sizes,
            optimizer_constructor=optimizer_constructor,
            input_activation='sigm',
            hidden_activation=hidden_activation,
            output_activation=output_fcn,
            w_init_mag=0.01,
            noise=1,
            ).compile()

    learning_curves = compare_predictors(
        dataset=dataset,
        online_predictors={
            'SGD-0.001-softmax': make_dtp_net(lambda: SimpleGradientDescent(0.001), output_fcn='softmax'),
            'AdaMax-0.001-softmax': make_dtp_net(lambda: AdaMax(0.001), output_fcn='softmax'),
            'RMSProp-0.001-softmax': make_dtp_net(lambda: RMSProp(0.001), output_fcn='softmax'),
            'SGD-0.001-sigm': make_dtp_net(lambda: SimpleGradientDescent(0.001), output_fcn='sigm'),
            'AdaMax-0.001-sigm': make_dtp_net(lambda: AdaMax(0.001), output_fcn='sigm'),
            'RMSProp-0.001-sigm': make_dtp_net(lambda: RMSProp(0.001), output_fcn='sigm'),
            },
        minibatch_size=minibatch_size,
        test_epochs=sqrtspace(0, n_epochs, n_tests),
        evaluation_function=percent_argmax_correct,
        )

    plot_learning_curves(learning_curves)
def test_variational_autoencoder():
    """
    Just test that after training, samples are closer to the test data than they are before training.
    """
    dataset = get_synthetic_clusters_dataset()

    rng = np.random.RandomState(1234)
    model = VariationalAutoencoder(
        pq_pair=EncoderDecoderNetworks(
            x_dim=dataset.input_shape[0],
            z_dim=2,
            encoder_hidden_sizes=[],
            decoder_hidden_sizes=[],
            w_init=lambda n_in, n_out: 0.01 * rng.randn(n_in, n_out),
            ),
        optimizer=AdaMax(alpha=0.1),
        rng=rng,
        )

    train_fcn = model.train.compile()
    gen_fcn = model.sample.compile()

    initial_mcm = mean_closest_match(gen_fcn(100), dataset.test_set.input, 'L1')

    for minibatch in minibatch_iterate(dataset.training_set.input, minibatch_size=10, n_epochs=1):
        train_fcn(minibatch)

    final_mcm = mean_closest_match(gen_fcn(100), dataset.test_set.input, 'L1')
    assert final_mcm < initial_mcm / 2
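
# The assertion above reads mean_closest_match as "the mean, over generated samples, of the L1
# distance to the nearest test point".  The numpy sketch below illustrates that reading; it is an
# assumption for clarity, not the actual implementation of mean_closest_match used in the test.
def _mean_closest_match_sketch(samples, data, metric='L1'):
    assert metric == 'L1'
    # Pairwise L1 distances between each sample and each data point: shape (n_samples, n_data)
    distances = np.abs(samples[:, None, :] - data[None, :, :]).sum(axis=2)
    # For each sample, the distance to its closest match in the data, averaged over samples.
    return distances.min(axis=1).mean()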
def mnist_adamax_showdown(hidden_size=300, n_epochs=10, n_tests=20):

    dataset = get_mnist_dataset()

    if is_test_mode():
        dataset = dataset.shorten(200)
        n_epochs = 0.1
        n_tests = 3

    make_mlp = lambda optimizer: GradientBasedPredictor(
        function=MultiLayerPerceptron.from_init(
            layer_sizes=[dataset.input_size, hidden_size, dataset.n_categories],
            hidden_activation='sig',
            output_activation='lin',
            w_init=0.01,
            rng=5,
            ),
        cost_function=softmax_negative_log_likelihood,
        optimizer=optimizer,
        ).compile()

    return compare_predictors(
        dataset=dataset,
        online_predictors={
            'sgd': make_mlp(SimpleGradientDescent(eta=0.1)),
            'adamax': make_mlp(AdaMax(alpha=1e-3)),
            },
        minibatch_size=20,
        test_epochs=sqrtspace(0, n_epochs, n_tests),
        evaluation_function=percent_argmax_correct,
        )
def demo_rbm_mnist(
        vis_activation='bernoulli',
        hid_activation='bernoulli',
        n_hidden=500,
        plot=True,
        eta=0.01,
        optimizer='sgd',
        w_init_mag=0.001,
        minibatch_size=9,
        persistent=False,
        n_epochs=100,
        plot_interval=100,
        ):
    """
    In this demo we train an RBM on the MNIST input data (labels are ignored).  We plot the state of a
    Markov chain that is being simultaneously sampled from the RBM, and the parameters of the RBM.

    What you see:

    A plot will appear with 6 subplots.  The subplots are as follows:
    hidden-neg-chain: The activity of the hidden layer for each of the persistent CD chains used for drawing negative samples.
    visible-neg-chain: The probabilities of the visible activations corresponding to the state of hidden-neg-chain.
    w: A subset of the weight vectors, reshaped to the shape of the input.
    b: The bias of the hidden units.
    b_rev: The bias of the visible units.
    visible-sample: The probabilities of the visible samples drawn from an independent free-sampling chain (outside the training function).

    As learning progresses, visible-neg-chain and visible-sample should increasingly resemble the data.
    """
    with EnableOmniscence():
        # EnableOmniscence allows us to plot internal variables (by referencing the .locals() attribute
        # of a symbolic function... see plot_fcn below)

        if is_test_mode():
            n_epochs = 0.01

        data = get_mnist_dataset(flat=True).training_set.input

        rbm = simple_rbm(
            visible_layer=StochasticNonlinearity(vis_activation),
            bridge=FullyConnectedBridge(w=w_init_mag*np.random.randn(28*28, n_hidden).astype(theano.config.floatX), b=0, b_rev=0),
            hidden_layer=StochasticNonlinearity(hid_activation),
            )

        optimizer = \
            SimpleGradientDescent(eta=eta) if optimizer == 'sgd' else \
            AdaMax(alpha=eta) if optimizer == 'adamax' else \
            bad_value(optimizer)

        train_function = rbm.get_training_fcn(n_gibbs=1, persistent=persistent, optimizer=optimizer).compile()

        def plot_fcn():
            lv = train_function.locals()
            dbplot(lv['wake_visible'].reshape((-1, 28, 28)), 'visible-pos-chain')
            dbplot(lv['sleep_visible'].reshape((-1, 28, 28)), 'visible-neg-chain')

        for i, visible_data in enumerate(minibatch_iterate(data, minibatch_size=minibatch_size, n_epochs=n_epochs)):
            train_function(visible_data)
            if plot and i % plot_interval == 0:
                plot_fcn()
def demo_lstm_novelist(
        book='bible',
        n_hidden=400,
        verse_duration=20,
        generation_duration=200,
        generate_every=200,
        max_len=None,
        hidden_layer_type='tanh',
        n_epochs=1,
        seed=None,
        ):
    """
    An LSTM-Autoencoder learns the Bible, and can spontaneously produce biblical-ish verses.

    :param book: Name of the book to learn from (default 'bible').
    :param n_hidden: Number of hidden/memory units in LSTM
    :param verse_duration: Number of Backprop-Through-Time steps to do.
    :param generation_duration: Number of characters to generate with each sample.
    :param generate_every: Generate every N training iterations
    :param max_len: Truncate the text to this length.
    :param hidden_layer_type: Type of hidden layer (default 'tanh').
    :param n_epochs: Number of passes through the bible to make.
    :param seed: Random Seed
    :return:
    """
    if is_test_mode():
        n_hidden = 10
        verse_duration = 7
        generation_duration = 5
        max_len = 40

    rng = np.random.RandomState(seed)

    text = read_book(book, max_characters=max_len)

    onehot_text, decode_key = text_to_onehot(text)
    n_char = onehot_text.shape[1]

    the_prophet = AutoencodingLSTM(n_input=n_char, n_hidden=n_hidden,
        initializer_fcn=lambda shape: 0.01*rng.randn(*shape), hidden_layer_type=hidden_layer_type)

    training_fcn = the_prophet.get_training_function(optimizer=AdaMax(alpha=0.01), update_states=True).compile(add_test_values=True)
    generating_fcn = the_prophet.get_generation_function(stochastic=True).compile(add_test_values=True)

    printer = TextWrappingPrinter(newline_every=100)

    def prime_and_generate(n_steps, primer=''):
        onehot_primer, _ = text_to_onehot(primer, decode_key)
        onehot_gen, = generating_fcn(onehot_primer, n_steps)
        gen = onehot_to_text(onehot_gen, decode_key)
        return '%s%s' % (primer, gen)

    prime_and_generate(generation_duration, 'In the beginning, ')

    for i, verse in enumerate(minibatch_iterate(onehot_text, minibatch_size=verse_duration, n_epochs=n_epochs)):
        if i % generate_every == 0:
            printer.write('[iter %s]%s' % (i, prime_and_generate(n_steps=generation_duration), ))
        training_fcn(verse)

    printer.write('[iter %s]%s' % (i, prime_and_generate(n_steps=generation_duration), ))
def backprop_vs_difference_target_prop(hidden_sizes=[240], n_epochs=10, minibatch_size=20, n_tests=20):

    dataset = get_mnist_dataset(flat=True)
    dataset = dataset.process_with(targets_processor=lambda (x, ): (OneHotEncoding(10)(x).astype(int), ))

    if is_test_mode():
        dataset = dataset.shorten(200)
        n_epochs = 0.1
        n_tests = 3

    set_default_figure_size(12, 9)

    return compare_predictors(
        dataset=dataset,
        online_predictors={
            'backprop-mlp': GradientBasedPredictor(
                function=MultiLayerPerceptron.from_init(
                    layer_sizes=[dataset.input_size] + hidden_sizes + [dataset.n_categories],
                    hidden_activation='tanh',
                    output_activation='sig',
                    w_init=0.01,
                    rng=5,
                    ),
                cost_function=mean_squared_error,
                optimizer=AdaMax(0.01),
                ).compile(),
            'difference-target-prop-mlp': DifferenceTargetMLP.from_initializer(
                input_size=dataset.input_size,
                output_size=dataset.target_size,
                hidden_sizes=hidden_sizes,
                optimizer_constructor=lambda: AdaMax(0.01),
                w_init=0.01,
                noise=1,
                ).compile(),
            },
        minibatch_size=minibatch_size,
        test_epochs=sqrtspace(0, n_epochs, n_tests),
        evaluation_function=percent_argmax_correct,
        )
def __init__(self, pq_pair, optimizer=AdaMax(alpha=0.01), rng=None):
    """
    :param pq_pair: An IVariationalPair object
    :param optimizer: An IGradientOptimizer object
    :param rng: A random number generator, or seed.
    """
    self.rng = get_theano_rng(rng)
    self.pq_pair = pq_pair
    self.optimizer = optimizer
def test_convnet_serialization():

    cifar10 = get_cifar_10_dataset(normalize_inputs=True, n_training_samples=50, n_test_samples=50)
    test_epochs = [0, 1, 2]

    assert cifar10.input_shape == (3, 32, 32)

    net = ConvNet.from_init(
        input_shape=cifar10.input_shape,
        w_init=0.01,
        specifiers=[
            ConvInitSpec(n_maps=24, filter_size=(3, 3), mode='same'),
            NonlinearitySpec('relu'),
            PoolerSpec(region=2, stride=2, mode='max'),  # (16x16)
            ConvInitSpec(n_maps=48, filter_size=(3, 3), mode='same'),
            NonlinearitySpec('relu'),
            PoolerSpec(region=2, stride=2, mode='max'),  # (8x8)
            ConvInitSpec(n_maps=96, filter_size=(3, 3), mode='same'),
            NonlinearitySpec('relu'),
            PoolerSpec(region=2, stride=2, mode='max'),  # (4x4)
            ConvInitSpec(n_maps=192, filter_size=(4, 4), mode='valid'),  # (1x1)
            NonlinearitySpec('relu'),
            ConvInitSpec(n_maps=10, filter_size=(1, 1), mode='valid'),
            NonlinearitySpec('softmax'),
            ],
        )

    predictor = GradientBasedPredictor(
        function=net,
        cost_function=negative_log_likelihood_dangerous,
        optimizer=AdaMax(),
        )

    assess_online_symbolic_predictor(
        predictor=predictor,
        dataset=cifar10,
        evaluation_function=percent_argmax_correct,
        test_epochs=test_epochs,
        minibatch_size=20,
        add_test_values=False,
        )

    results_1 = net.compile()(cifar10.test_set.input)

    savable = net.to_spec()
    serialized = pickle.dumps(savable)
    deserialized = pickle.loads(serialized)
    net_2 = ConvNet.from_init(deserialized, input_shape=cifar10.input_shape, rng=None)

    results_2 = net_2.compile()(cifar10.test_set.input)

    assert np.array_equal(results_1, results_2)
def demo_gan_mnist(n_epochs=20, minibatch_size=20, n_discriminator_steps=1, noise_dim=10, plot_period=100, rng=1234):
    """
    Train a Generative Adversarial Network on MNIST data, showing generated samples as training progresses.

    :param n_epochs: Number of epochs to train
    :param minibatch_size: Size of the minibatch to feed in each training iteration
    :param n_discriminator_steps: Number of discriminator training steps per generator training step
    :param noise_dim: Dimensionality of the latent space (from which random samples are pulled)
    :param plot_period: Plot every N training iterations
    :param rng: Random number generator or seed
    """
    net = GenerativeAdversarialNetwork(
        discriminator=MultiLayerPerceptron.from_init(w_init=0.01, layer_sizes=[784, 100, 1], hidden_activation='relu', output_activation='sig', rng=rng),
        generator=MultiLayerPerceptron.from_init(w_init=0.1, layer_sizes=[noise_dim, 200, 784], hidden_activation='relu', output_activation='sig', rng=rng),
        noise_dim=noise_dim,
        optimizer=AdaMax(0.001),
        rng=rng,
        )

    data = get_mnist_dataset(flat=True).training_set.input

    f_train_discriminator = net.train_discriminator.compile()
    f_train_generator = net.train_generator.compile()
    f_generate = net.generate.compile()

    for i, minibatch in enumerate(minibatch_iterate(data, n_epochs=n_epochs, minibatch_size=minibatch_size)):
        f_train_discriminator(minibatch)
        print 'Trained Discriminator'
        if i % n_discriminator_steps == n_discriminator_steps - 1:
            f_train_generator(n_samples=minibatch_size)
            print 'Trained Generator'
        if i % plot_period == 0:
            samples = f_generate(n_samples=minibatch_size)
            dbplot(minibatch.reshape(-1, 28, 28), "Real")
            dbplot(samples.reshape(-1, 28, 28), "Counterfeit")
            print 'Disp'
def demo_variational_autoencoder(minibatch_size=100, n_epochs=2000, plot_interval=100, seed=None):
    """
    Train a Variational Autoencoder on MNIST and look at the samples it generates.

    :param minibatch_size: Number of elements in the minibatch
    :param n_epochs: Number of passes through dataset
    :param plot_interval: Plot every x iterations
    :param seed: Random seed
    """
    data = get_mnist_dataset(flat=True).training_set.input

    if is_test_mode():
        n_epochs = 1
        minibatch_size = 10
        data = data[:100]

    rng = get_rng(seed)

    model = VariationalAutoencoder(
        pq_pair=EncoderDecoderNetworks(
            x_dim=data.shape[1],
            z_dim=20,
            encoder_hidden_sizes=[200],
            decoder_hidden_sizes=[200],
            w_init=lambda n_in, n_out: 0.01 * np.random.randn(n_in, n_out),
            x_distribution='bernoulli',
            z_distribution='gaussian',
            hidden_activation='softplus',
            ),
        optimizer=AdaMax(alpha=0.003),
        rng=rng,
        )

    training_fcn = model.train.compile()
    sampling_fcn = model.sample.compile()

    for i, minibatch in enumerate(minibatch_iterate(data, minibatch_size=minibatch_size, n_epochs=n_epochs)):

        training_fcn(minibatch)

        if i % plot_interval == 0:
            print 'Epoch %s' % (i * minibatch_size / float(len(data)), )
            samples = sampling_fcn(25).reshape(5, 5, 28, 28)
            dbplot(samples, 'Samples from Model')
            dbplot(model.pq_pair.p_net.parameters[-2].get_value()[:25].reshape(-1, 28, 28), 'dec')
            dbplot(model.pq_pair.q_net.parameters[0].get_value().T[:25].reshape(-1, 28, 28), 'enc')
def test_autoencoding_lstm(width=8, seed=1234):

    data = get_bounce_data(width=width)
    encoder = OneHotEncoding(n_classes=width, dtype=theano.config.floatX)
    onehot_data = encoder(data)
    rng = np.random.RandomState(seed)

    # Input size must match the one-hot width of the bounce data.
    aelstm = AutoencodingLSTM(n_input=width, n_hidden=50, initializer_fcn=lambda shape: 0.01 * rng.randn(*shape))

    gen_fcn = aelstm.get_generation_function(maintain_state=True, rng=rng).compile(add_test_values=True)
    train_fcn = aelstm.get_training_function(update_states=True, optimizer=AdaMax(alpha=0.1)).compile(add_test_values=True)

    def prime_and_gen(primer, n_steps):
        onehot_primer = encoder(np.array(primer))
        onehot_generated, = gen_fcn(onehot_primer, n_steps)
        generated = encoder.inverse(onehot_generated)
        return generated

    initial_seq = prime_and_gen([0, 1, 2, 3, 4], 11)
    print initial_seq

    # Test empty, one-length primers
    prime_and_gen([], 2)
    prime_and_gen([0], 2)

    print 'Training....'
    for d in minibatch_iterate(onehot_data, minibatch_size=3, n_epochs=400):
        train_fcn(d)
    print 'Done.'

    final_seq = prime_and_gen([0, 1, 2, 3, 4], 11)
    assert np.array_equal(final_seq, [5, 6, 7, 6, 5, 4, 3, 2, 1, 0, 1]), 'Bzzzz! It was %s' % (final_seq, )

    # Assert state is maintained
    seq = prime_and_gen([], 3)
    assert np.array_equal(seq, [2, 3, 4]), 'Bzzzz! It was %s' % (seq, )
    seq = prime_and_gen([5], 3)
    assert np.array_equal(seq, [6, 7, 6]), 'Bzzzz! It was %s' % (seq, )

    # Assert training does not interrupt generation state.
    train_fcn(d)
    seq = prime_and_gen([], 3)
    assert np.array_equal(seq, [5, 4, 3]), 'Bzzzz! It was %s' % (seq, )
def get_training_function(self, cost_func=mean_xe, optimizer=AdaMax(alpha=1e-3), update_states=True):
    """
    Get the symbolic function that will be used to train the AutoencodingLSTM.

    :param cost_func: Function that takes actual outputs, target outputs and returns a cost.
    :param optimizer: Optimizer: takes cost, parameters, returns updates.
    :param update_states: If True, the hidden state is maintained between calls to the training
        function.  This makes sense if your data is coming in sequentially.
    :return:
    """
    @symbolic_updater
    def training_fcn(inputs):
        hidden_reps = self.lstm.multi_step(inputs, update_states=update_states)
        outputs = self.output_activation(hidden_reps.dot(self.w_hz) + self.b_z)
        cost = cost_func(actual=outputs[:-1], target=inputs[1:])
        optimizer(cost=cost, parameters=self.parameters)

    return training_fcn
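
# A minimal sketch of plugging a custom cost into get_training_function.  It assumes only what the
# docstring above states: cost_func receives symbolic (actual, target) arrays and returns a scalar
# cost expression.  The squared-error cost and the alpha value are illustrative choices, not taken
# from the library; `aelstm` stands for any AutoencodingLSTM instance.
def _example_custom_cost_training_fcn(aelstm):
    def mean_squared_error_cost(actual, target):
        # Works on symbolic Theano tensors: elementwise squared difference, averaged to a scalar.
        return ((actual - target) ** 2).mean()
    return aelstm.get_training_function(
        cost_func=mean_squared_error_cost,
        optimizer=AdaMax(alpha=1e-3),
        update_states=True,
        ).compile()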
        input_activation='relu',
        hidden_activation='relu',
        output_activation='relu',
        optimizer_constructor=lambda: RMSProp(learning_rate=0.001),
        ),
    description="DTP with an entirely RELU network, using RMSprop as an optimizer",
    conclusion="RMSProp and RELU do not mix at all!"
    )

register_experiment(
    name='all-relu-dtp-adamax',
    function=lambda: demo_run_dtp_on_mnist(
        input_activation='relu',
        hidden_activation='relu',
        output_activation='relu',
        optimizer_constructor=lambda: AdaMax(alpha=0.001),
        ),
    description="DTP with an entirely RELU network, using AdaMax as an optimizer",
    conclusion="AdaMax and RELU do not mix well either! (not as horrible as RMSProp though)"
    )

register_experiment(
    name='all-relu-LinDTP',
    function=lambda: demo_run_dtp_on_mnist(
        input_activation='relu',
        hidden_activation='relu',
        output_activation='relu',
        optimizer_constructor=lambda: GradientDescent(eta=0.01),
        n_epochs=30,
def demo_simple_vae_on_mnist(
        minibatch_size=100,
        n_epochs=2000,
        plot_interval=100,
        calculation_interval=500,
        z_dim=2,
        hidden_sizes=[400, 200],
        learning_rate=0.003,
        hidden_activation='softplus',
        binary_x=True,
        w_init_mag=0.01,
        gaussian_min_var=None,
        manifold_grid_size=11,
        manifold_grid_span=2,
        seed=None,
        ):
    """
    Train a Variational Autoencoder on MNIST and look at the samples it generates.
    """
    dataset = get_mnist_dataset(flat=True)
    training_data = dataset.training_set.input
    test_data = dataset.test_set.input

    if is_test_mode():
        n_epochs = 1
        minibatch_size = 10
        training_data = training_data[:100]
        test_data = test_data[:100]

    model = GaussianVariationalAutoencoder(
        x_dim=training_data.shape[1],
        z_dim=z_dim,
        encoder_hidden_sizes=hidden_sizes,
        decoder_hidden_sizes=hidden_sizes[::-1],
        w_init_mag=w_init_mag,
        binary_data=binary_x,
        hidden_activation=hidden_activation,
        optimizer=AdaMax(alpha=learning_rate),
        gaussian_min_var=gaussian_min_var,
        rng=seed,
        )

    training_fcn = model.train.compile()

    # For display, make functions to sample and represent the manifold.
    sampling_fcn = model.sample.compile()
    # Grid of latent points spanning the first two dimensions of z, with all remaining dimensions held at zero.
    z_manifold_grid = np.array([x.flatten() for x in np.meshgrid(
        np.linspace(-manifold_grid_span, manifold_grid_span, manifold_grid_size),
        np.linspace(-manifold_grid_span, manifold_grid_span, manifold_grid_size))]
        + [np.zeros(manifold_grid_size**2)] * (z_dim - 2)).T
    decoder_mean_fcn = model.decode.compile(fixed_args=dict(z=z_manifold_grid))
    lower_bound_fcn = model.compute_lower_bound.compile()

    for i, minibatch in enumerate(minibatch_iterate(training_data, minibatch_size=minibatch_size, n_epochs=n_epochs)):

        training_fcn(minibatch)

        if i % plot_interval == 0:
            samples = sampling_fcn(25).reshape(5, 5, 28, 28)
            dbplot(samples, 'Samples from Model')
            if binary_x:
                manifold_means = decoder_mean_fcn()
            else:
                manifold_means, _ = decoder_mean_fcn()
            dbplot(manifold_means.reshape(manifold_grid_size, manifold_grid_size, 28, 28), 'First 2-dimensions of manifold.')
        if i % calculation_interval == 0:
            training_lower_bound = lower_bound_fcn(training_data)
            test_lower_bound = lower_bound_fcn(test_data)
            print 'Epoch: %s, Training Lower Bound: %s, Test Lower bound: %s' % \
                (i*minibatch_size/float(len(training_data)), training_lower_bound, test_lower_bound)
def __init__(self, x_dim, z_dim, encoder_hidden_sizes=[100], decoder_hidden_sizes=[100],
             hidden_activation='tanh', w_init_mag=0.01, binary_data=False,
             optimizer=AdaMax(alpha=0.01), rng=None, gaussian_min_var=None):
    """
    :param x_dim: Dimensionality of the data
    :param z_dim: Dimensionality of the latent space
    :param encoder_hidden_sizes: A list of sizes of each hidden layer in the encoder (from X to Z)
    :param decoder_hidden_sizes: A list of sizes of each hidden layer in the decoder (from Z to X)
    :param hidden_activation: Activation function for all hidden layers
    :param w_init_mag: Magnitude of initial weights
    :param binary_data: Choose whether the data is binary.  You can also use this if the data is bounded
        in [0, 1] - then we can think of it as being the expected value.
    :param optimizer: An IGradientOptimizer object for doing parameter updates
        ... see plato.tools.optimization.optimizers
    :param rng: A random number generator or random seed.
    """
    np_rng = get_rng(rng)

    encoder_layer_sizes = [x_dim] + encoder_hidden_sizes
    self.encoder_hidden_layers = [
        Layer(w_init_mag * np_rng.randn(n_in, n_out), nonlinearity=hidden_activation)
        for n_in, n_out in zip(encoder_layer_sizes[:-1], encoder_layer_sizes[1:])
        ]
    self.encoder_mean_layer = Layer(w_init_mag * np_rng.randn(encoder_layer_sizes[-1], z_dim), nonlinearity='linear')
    self.encoder_log_var_layer = Layer(w_init_mag * np_rng.randn(encoder_layer_sizes[-1], z_dim), nonlinearity='linear')

    decoder_layer_sizes = [z_dim] + decoder_hidden_sizes
    self.decoder_hidden_layers = [
        Layer(w_init_mag * np_rng.randn(n_in, n_out), nonlinearity=hidden_activation)
        for n_in, n_out in zip(decoder_layer_sizes[:-1], decoder_layer_sizes[1:])
        ]
    if binary_data:
        self.decoder_mean_layer = Layer(w_init_mag * np_rng.randn(decoder_layer_sizes[-1], x_dim), nonlinearity='sigm')
    else:
        self.decoder_mean_layer = Layer(w_init_mag * np_rng.randn(decoder_layer_sizes[-1], x_dim), nonlinearity='linear')
        self.decoder_log_var_layer = Layer(w_init_mag * np_rng.randn(decoder_layer_sizes[-1], x_dim), nonlinearity='linear')

    self.rng = get_theano_rng(np_rng)
    self.binary_data = binary_data
    self.x_size = x_dim
    self.z_size = z_dim
    self.optimizer = optimizer
    self.gaussian_min_var = gaussian_min_var
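
# The encoder above ends in a mean layer and a log-variance layer for z.  Below is a small numpy
# sketch of the reparameterization step those two outputs support; it illustrates the technique only
# and is not the class's own symbolic sampling code.  `rng` stands for any np.random.RandomState.
def _reparameterize_sketch(z_mean, z_log_var, rng):
    # Draw z = mean + std * epsilon with epsilon ~ N(0, 1), so gradients can flow through mean and log-var.
    epsilon = rng.randn(*z_mean.shape)
    return z_mean + np.exp(0.5 * z_log_var) * epsilon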
def test_adamax_optimizer():
    _test_optimizer_on_simple_classification_problem(AdaMax(alpha=0.01))