def mnist_adamax_showdown(hidden_size = 300, n_epochs = 10, n_tests = 20):
    """
    Compare an MLP trained with plain stochastic gradient descent against the same MLP trained with AdaMax on MNIST.
    """
    dataset = get_mnist_dataset()

    if is_test_mode():
        dataset.shorten(200)
        n_epochs = 0.1
        n_tests = 3

    make_mlp = lambda optimizer: GradientBasedPredictor(
        function = MultiLayerPerceptron(
            layer_sizes=[hidden_size, dataset.n_categories],
            input_size = dataset.input_size,
            hidden_activation='sig',
            output_activation='lin',
            w_init = normal_w_init(mag = 0.01, seed = 5)
            ),
        cost_function = softmax_negative_log_likelihood,
        optimizer = optimizer,
        ).compile()

    return compare_predictors(
        dataset=dataset,
        online_predictors = {
            'sgd': make_mlp(SimpleGradientDescent(eta = 0.1)),
            'adamax': make_mlp(AdaMax(alpha = 1e-3)),
            },
        minibatch_size = 20,
        test_epochs = sqrtspace(0, n_epochs, n_tests),
        evaluation_function = percent_argmax_correct
        )
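
# For reference, a minimal numpy sketch of the AdaMax update rule (Kingma & Ba, 2014) that the
# 'adamax' predictor above is compared against SGD with.  This standalone helper is an illustrative
# assumption added for documentation; it is not the AdaMax optimizer class used in the demo.
def _adamax_update_sketch(param, grad, m, u, t, alpha=1e-3, beta_1=0.9, beta_2=0.999, eps=1e-8):
    # m: running mean of gradients, u: running infinity-norm of gradients, t: step counter (>= 1)
    m = beta_1 * m + (1 - beta_1) * grad      # Update biased first-moment estimate
    u = np.maximum(beta_2 * u, np.abs(grad))  # Update exponentially-weighted infinity norm
    param = param - (alpha / (1 - beta_1 ** t)) * m / (u + eps)  # Effective step size is bounded by ~alpha
    return param, m, u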
def get_training_fcn(n_gibbs=1, persistent=False, optimizer=SimpleGradientDescent(eta=0.01)):
    """
    Return a symbolic function that performs one step of (optionally persistent) contrastive divergence training.
    """
    @symbolic_updater
    def train(wake_visible):

        # Wake phase: propagate the data up to the hidden layer.
        wake_hidden = propup(wake_visible)

        # Sleep phase: start the negative chain either from a persistent state or from the wake hidden state.
        persistent_state = sleep_hidden = theano.shared(np.zeros(wake_hidden.tag.test_value.shape, dtype=theano.config.floatX),
            name='persistent_hidden_state') if persistent else wake_hidden

        for _ in xrange(n_gibbs):
            sleep_visible = propdown(sleep_hidden)
            sleep_hidden = propup(sleep_visible)

        # The cost is the difference in free energy between the wake (data) and sleep (model) configurations.
        wake_energy = bridge.free_energy(wake_visible) + hidden_layer.free_energy(bridge(wake_visible))
        sleep_energy = bridge.free_energy(sleep_visible) + hidden_layer.free_energy(bridge(sleep_visible))
        cost = tt.mean(wake_energy - sleep_energy)

        params = visible_layer.parameters + bridge.parameters + hidden_layer.parameters
        updates = optimizer(cost=cost, parameters=params, constants=[wake_visible, sleep_visible])

        if persistent:
            updates.append((persistent_state, sleep_hidden))

        return updates

    return train
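
# A usage sketch for the training function constructed above, following the pattern used in
# demo_rbm_mnist.  The helper name and its `rbm`/`dataset` arguments are placeholders for
# illustration, not part of the library.
def _rbm_training_loop_sketch(rbm, dataset, n_epochs=1, minibatch_size=20):
    train_function = rbm.get_training_fcn(n_gibbs=1, persistent=True,
                                          optimizer=SimpleGradientDescent(eta=0.01)).compile()
    for _, visible_data, _ in dataset.training_set.minibatch_iterator(
            minibatch_size=minibatch_size, epochs=n_epochs, single_channel=True):
        train_function(visible_data)  # One (persistent) contrastive-divergence parameter update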
def compare_example_predictors(n_epochs = 5, n_tests = 20, minibatch_size = 10, test_mode = False):
    """
    This demo shows how we can compare different online predictors.  The demo trains each predictor on the dataset,
    returning an object that contains the results.

    :param test_mode: Set this to True to just run the demo quickly (but not to completion) to see that it doesn't break.
    """
    dataset = get_mnist_dataset(flat = True)  # "Flatten" the 28x28 inputs to a 784-d vector

    if test_mode:
        # Shorten the dataset so we run through it quickly in test mode.
        dataset = dataset.shorten(200)
        n_epochs = 1
        n_tests = 3

    # Here we compare three predictors on MNIST - an MLP, a Perceptron, and a Random Forest.
    # - The MLP is defined using Plato's interfaces - we create a Symbolic Predictor (GradientBasedPredictor) and
    #   then compile it into an IPredictor object.
    # - The Perceptron directly implements the IPredictor interface.
    # - The Random Forest implements SciKit learn's predictor interface - that is, it has a fit(x, y) and a predict(x) method.
    learning_curve_data = compare_predictors(
        dataset = dataset,
        online_predictors = {
            'Perceptron': Perceptron(
                w = np.zeros((dataset.input_size, dataset.n_categories)),
                alpha = 0.001
                ).to_categorical(n_categories = dataset.n_categories),  # .to_categorical allows the perceptron to be trained on integer labels.
            'MLP': GradientBasedPredictor(
                function = MultiLayerPerceptron(
                    layer_sizes=[500, dataset.n_categories],
                    input_size = dataset.input_size,
                    hidden_activation='sig',  # Sigmoidal hidden units
                    output_activation='softmax',  # Softmax output unit, since we're doing multinomial classification
                    w_init = normal_w_init(mag = 0.01, seed = 5)
                    ),
                cost_function = negative_log_likelihood_dangerous,  # "Dangerous" because it doesn't check to see that output is normalized, but we know it is because it comes from softmax.
                optimizer = SimpleGradientDescent(eta = 0.1),
                ).compile(),  # .compile() returns an IPredictor
            },
        offline_predictors={
            'RF': RandomForestClassifier(n_estimators = 40)
            },
        minibatch_size = minibatch_size,
        test_epochs = sqrtspace(0, n_epochs, n_tests),
        evaluation_function = percent_argmax_correct  # Compares one-hot
        )
    # Result is a LearningCurveData object
    return learning_curve_data
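
# A hypothetical driver for the demo above (not in the original source): run the comparison quickly in
# test mode and return the LearningCurveData object, which records each predictor's score over training.
def _comparison_driver_sketch():
    learning_curves = compare_example_predictors(test_mode=True)
    return learning_curves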
def mlp_normalization(hidden_size = 300, n_epochs = 30, n_tests = 50, minibatch_size=20):
    """
    Compare MLPs with different schemes for normalizing input.

    regular: Regular vanilla MLP
    normalize: Mean-subtract/normalize over minibatch
    normalize and scale: Mean-subtract/normalize over minibatch AND multiply by a trainable (per-unit) scale parameter.

    Conclusions: No significant benefit to the scale parameter.  Normalizing gives a head start but incurs a small
    cost later on.  But really all classifiers are quite similar.

    :param hidden_size: Size of hidden layer
    """
    dataset = get_mnist_dataset()

    if is_test_mode():
        dataset.shorten(200)
        n_epochs = 0.1
        n_tests = 3

    make_mlp = lambda normalize, scale: GradientBasedPredictor(
        function = MultiLayerPerceptron(
            layer_sizes=[hidden_size, dataset.n_categories],
            input_size = dataset.input_size,
            hidden_activation='sig',
            output_activation='lin',
            normalize_minibatch=normalize,
            scale_param=scale,
            w_init = normal_w_init(mag = 0.01, seed = 5)
            ),
        cost_function = softmax_negative_log_likelihood,
        optimizer = SimpleGradientDescent(eta = 0.1),
        ).compile()

    return compare_predictors(
        dataset=dataset,
        online_predictors = {
            'regular': make_mlp(normalize = False, scale = False),
            'normalize': make_mlp(normalize=True, scale = False),
            'normalize and scale': make_mlp(normalize=True, scale = True),
            },
        minibatch_size = minibatch_size,
        test_epochs = sqrtspace(0, n_epochs, n_tests),
        evaluation_function = percent_argmax_correct
        )
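
# A hedged numpy sketch of the minibatch normalization scheme compared above: mean-subtract and
# variance-normalize each unit over the minibatch, then optionally multiply by a trainable per-unit
# scale.  This is only an illustration of the idea; the MultiLayerPerceptron's normalize_minibatch
# and scale_param options are implemented symbolically inside the library and may differ in detail.
def _normalize_minibatch_sketch(x, scale=None, eps=1e-7):
    # x: (minibatch_size, n_units) activations; scale: optional (n_units, ) trainable scale parameter
    normalized = (x - x.mean(axis=0)) / (x.std(axis=0) + eps)
    return normalized if scale is None else normalized * scale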
def test_mlp():
    """
    Check that a small MLP predictor can be constructed, compiled, and trained without breaking.
    """
    assert_online_predictor_not_broken(
        predictor_constructor = lambda n_dim_in, n_dim_out: GradientBasedPredictor(
            function = MultiLayerPerceptron(
                layer_sizes = [100, n_dim_out],
                input_size = n_dim_in,
                output_activation='softmax',
                w_init = lambda n_in, n_out, rng = np.random.RandomState(3252): 0.1*rng.randn(n_in, n_out)
                ),
            cost_function=negative_log_likelihood_dangerous,
            optimizer=SimpleGradientDescent(eta = 0.1),
            ).compile(),
        categorical_target=True,
        minibatch_size=10,
        n_epochs=2
        )
def demo_mnist_mlp(test_mode = False):
    """
    Train an MLP on MNIST and print the test scores as training progresses.
    """
    if test_mode:
        test_period = 200
        minibatch_size = 5
        n_epochs = 0.01
        dataset = get_mnist_dataset(n_training_samples=30, n_test_samples=30)
    else:
        test_period = 1000
        minibatch_size = 20
        n_epochs = 10
        dataset = get_mnist_dataset()

    # Setup the training and test functions
    classifier = MultiLayerPerceptron(
        layer_sizes=[500, 10],
        input_size = 784,
        hidden_activation='sig',
        output_activation='softmax',
        w_init = normal_w_init(mag = 0.01)
        )
    training_cost_function = normalized_negative_log_likelihood
    optimizer = SimpleGradientDescent(eta = 0.1)
    training_function = SupervisedTrainingFunction(classifier, training_cost_function, optimizer).compile()
    test_cost_function = percent_correct
    test_function = SupervisedTestFunction(classifier, test_cost_function).compile()

    def report_test(i):
        training_cost = test_function(dataset.training_set.input, dataset.training_set.target)
        print 'Training score at iteration %s: %s' % (i, training_cost)
        test_cost = test_function(dataset.test_set.input, dataset.test_set.target)
        print 'Test score at iteration %s: %s' % (i, test_cost)

    # Train and periodically report the test score.
    print 'Running MLP on MNIST Dataset...'
    for i, (_, image_minibatch, label_minibatch) in enumerate(dataset.training_set.minibatch_iterator(minibatch_size = minibatch_size, epochs = n_epochs, single_channel = True)):
        if i % test_period == 0:
            report_test(i)
        training_function(image_minibatch, label_minibatch)
    report_test('Final')
    print '...Done.'
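
# For intuition, a hedged numpy sketch of a percent-correct style score like the one printed above.
# This is an illustration only; the library's percent_correct / percent_argmax_correct functions may
# differ in how they interpret their arguments.
def _percent_argmax_correct_sketch(output, target):
    # output: (n_samples, n_classes) class scores; target: (n_samples, ) integer labels
    return 100.0 * np.mean(np.argmax(output, axis=1) == np.asarray(target))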
def test_param_serialization():
    """
    Test that trained parameters can be dumped to a string and then loaded into a freshly-constructed
    predictor, which then reproduces the trained predictor's score.
    """
    dataset = get_synthetic_clusters_dataset()

    predictor_constructor = lambda: GradientBasedPredictor(
        function=MultiLayerPerceptron(
            layer_sizes=[100, dataset.n_categories],
            input_size=dataset.input_shape[0],
            output_activation='softmax',
            w_init=lambda n_in, n_out, rng=np.random.RandomState(3252): 0.1 * rng.randn(n_in, n_out)
            ),
        cost_function=negative_log_likelihood_dangerous,
        optimizer=SimpleGradientDescent(eta=0.1),
        ).compile()

    evaluate = lambda pred: evaluate_predictor(pred, dataset.test_set, percent_argmax_correct)

    # Train up predictor and save params
    predictor = predictor_constructor()
    pre_training_score = evaluate(predictor)
    assert pre_training_score < 35
    train_online_predictor(predictor, dataset.training_set, minibatch_size=20, n_epochs=3)
    post_training_score = evaluate(predictor)
    assert post_training_score > 95
    trained_param_string = dumps_params(predictor)

    # Instantiate new predictor and load params
    new_predictor = predictor_constructor()
    new_pre_training_score = evaluate(new_predictor)
    assert new_pre_training_score < 35
    loads_params(new_predictor, trained_param_string)
    loaded_score = evaluate(new_predictor)
    assert loaded_score == post_training_score > 95
def demo_dbn_mnist(plot=True, test_mode=True):
    """
    In this demo we train a Deep Belief Net on the MNIST dataset in two stages.  First we train an RBM between the
    visible ('vis') and hidden ('hid') layers, plotting the weights and the state of the persistent Markov chain that
    is being simultaneously sampled from it.  Then we train an associative ('ass') layer on top of the hidden and
    label ('lab') layers, periodically reporting the classification score obtained by a forward pass from image to label.
    """
    set_enable_omniscence(True)
    minibatch_size = 20
    dataset = get_mnist_dataset().process_with(inputs_processor=lambda (x, ): (x.reshape(x.shape[0], -1), ))
    w_init = lambda n_in, n_out: 0.01 * np.random.randn(n_in, n_out)
    n_training_epochs_1 = 20
    n_training_epochs_2 = 20
    check_period = 300

    if test_mode:
        n_training_epochs_1 = 0.01
        n_training_epochs_2 = 0.01
        check_period = 100

    dbn = DeepBeliefNet(
        layers={
            'vis': StochasticLayer('bernoulli'),
            'hid': StochasticLayer('bernoulli'),
            'ass': StochasticLayer('bernoulli'),
            'lab': StochasticLayer('bernoulli'),
            },
        bridges={
            ('vis', 'hid'): FullyConnectedBridge(w=w_init(784, 500), b_rev=0),
            ('hid', 'ass'): FullyConnectedBridge(w=w_init(500, 500), b_rev=0),
            ('lab', 'ass'): FullyConnectedBridge(w=w_init(10, 500), b_rev=0)
            })

    # Compile the functions you're gonna use.
    train_first_layer = dbn.get_constrastive_divergence_function(visible_layers='vis', hidden_layers='hid',
        optimizer=SimpleGradientDescent(eta=0.01), n_gibbs=1, persistent=True).compile()
    free_energy_of_first_layer = dbn.get_free_energy_function(visible_layers='vis', hidden_layers='hid').compile()
    train_second_layer = dbn.get_constrastive_divergence_function(visible_layers=('hid', 'lab'), hidden_layers='ass',
        input_layers=('vis', 'lab'), n_gibbs=1, persistent=True).compile()
    predict_label = dbn.get_inference_function(input_layers='vis', output_layers='lab',
        path=[('vis', 'hid'), ('hid', 'ass'), ('ass', 'lab')], smooth=True).compile()

    encode_label = OneHotEncoding(n_classes=10)

    # Step 1: Train the first layer, plotting the weights and persistent chain state.
    if plot:
        train_first_layer.set_debug_variables(lambda: {
            'weights': dbn._bridges['vis', 'hid']._w.T.reshape((-1, 28, 28)),
            'smooth_vis_state': dbn.get_inference_function('hid', 'vis', smooth=True)
                .symbolic_stateless(*train_first_layer.locals()['initial_hidden']).reshape((-1, 28, 28))
            })
        plotter = LiveStream(train_first_layer.get_debug_values)

    for i, (n_samples, visible_data, label_data) in enumerate(dataset.training_set.minibatch_iterator(
            minibatch_size=minibatch_size, epochs=n_training_epochs_1, single_channel=True)):
        train_first_layer(visible_data)
        if i % check_period == 0:
            print 'Free Energy of Test Data: %s' % (free_energy_of_first_layer(dataset.test_set.input).mean())
            if plot:
                plotter.update()

    # Step 2: Train the second layer and simultaneously compute the classification error from forward passes.
    if plot:
        train_second_layer.set_debug_variables(lambda: {
            'w_vis_hid': dbn._bridges['vis', 'hid']._w.T.reshape((-1, 28, 28)),
            'w_hid_ass': dbn._bridges['hid', 'ass']._w,
            'w_lab_ass': dbn._bridges['lab', 'ass']._w,
            'associative_state': train_second_layer.locals()['sleep_hidden'][0].reshape((-1, 20, 25)),
            'hidden_state': train_second_layer.locals()['sleep_visible'][0].reshape((-1, 20, 25)),
            'smooth_vis_state': dbn.get_inference_function('hid', 'vis', smooth=True)
                .symbolic_stateless(train_second_layer.locals()['sleep_visible'][0]).reshape((-1, 28, 28))
            })
        plotter = LiveStream(train_second_layer.get_debug_values)

    for i, (n_samples, visible_data, label_data) in enumerate(dataset.training_set.minibatch_iterator(
            minibatch_size=minibatch_size, epochs=n_training_epochs_2, single_channel=True)):
        train_second_layer(visible_data, encode_label(label_data))
        if i % check_period == 0:
            out, = predict_label(dataset.test_set.input)
            score = percent_argmax_correct(actual=out, target=dataset.test_set.target)
            print 'Classification Score: %s' % score
            if plot:
                plotter.update()
def get_constrastive_divergence_function(self, visible_layers, hidden_layers, input_layers = None, up_path = None, n_gibbs = 1,
        persistent = False, optimizer = SimpleGradientDescent(eta = 0.1)):
    """
    Make a symbolic function that does one step of contrastive divergence given a minibatch of input data.

    :param visible_layers: The visible layers of the RBM to be trained
    :param hidden_layers: The hidden layers of the RBM to be trained
    :param input_layers: The input layers (if not the same as the visible), whose activations will have to be
        passed up to the visible layers before training.
    :param up_path: The path from the input_layers to the hidden_layers (in the future this should be found
        automatically - for now it is only computed automatically if there's a direct connection from input to visible)
    :param n_gibbs: Number of Gibbs block-sampling steps to do
    :param persistent: True for persistent CD, False for regular CD
    :param optimizer: An IGradientOptimizer object.
    :return: A symbolic function of update form:
        [(param_0, new_param_0), ..., (persistent_state_0, new_persistent_state_0), ...] = func(in_0, in_1, ...)
        That updates parameters in the specified RBM, and the persistent state if persistent=True.
    """
    visible_layers = visible_layers if isinstance(visible_layers, (list, tuple)) else (visible_layers, )
    hidden_layers = hidden_layers if isinstance(hidden_layers, (list, tuple)) else (hidden_layers, )

    if input_layers is None:
        assert set(visible_layers).issubset(self._graph.get_input_variables()), "If you don't specify input layers, "\
            "the visible layers must be inputs to the graph.  But they are not.  Visible layers: %s, Input layers: %s" \
            % (visible_layers, self._graph.get_input_variables().keys())
    elif up_path is None:
        up_path = self.get_inference_function(input_layers = input_layers, output_layers = visible_layers)
    else:
        up_path = self._graph.get_execution_path(up_path)

    propup = self.get_inference_function(visible_layers, hidden_layers)
    free_energy = self.get_free_energy_function(visible_layers, hidden_layers)

    @symbolic_updater
    def cd_function(*input_signals):

        # Wake phase: get the visible (and hidden) activations corresponding to the data.
        if input_layers is None:
            wake_visible = input_signals
        else:
            wake_visible, _ = up_path(*input_signals)
        wake_hidden, _ = propup(*wake_visible)

        # Sleep phase: start the negative chain from a persistent state or from the wake hidden state.
        initial_hidden = [theano.shared(np.zeros(wh.tag.test_value.shape, dtype = theano.config.floatX), name = 'persistent_hidden_state')
            for wh in wake_hidden] if persistent else wake_hidden

        gibbs_path = [(hidden_layers, visible_layers)] + [(visible_layers, hidden_layers), (hidden_layers, visible_layers)] * (n_gibbs-1)
        sleep_visible, _ = self.get_inference_function(hidden_layers, visible_layers, gibbs_path)(*initial_hidden)
        sleep_hidden, _ = propup(*sleep_visible)

        free_energy_difference = free_energy(*wake_visible).mean() - free_energy(*sleep_visible).mean()

        all_params = sum([x.parameters for x in (
            [self._layers[i] for i in visible_layers]
            + [self._layers[i] for i in hidden_layers]
            + [self._bridges[i, j] for i in visible_layers for j in hidden_layers])], [])

        updates = optimizer(cost = free_energy_difference, parameters = all_params, constants = wake_visible+sleep_visible)

        if persistent:
            updates += [(p, s) for p, s in zip(initial_hidden, sleep_hidden)]

        return updates

    return cd_function
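
# An illustrative usage sketch (an assumption added for documentation, not library code), mirroring how
# demo_dbn_mnist calls this method on a DeepBeliefNet `dbn`: compile a persistent-CD trainer for the
# ('vis' -> 'hid') RBM and apply one update to a minibatch of visible data.
def _dbn_cd_usage_sketch(dbn, visible_minibatch):
    train_rbm = dbn.get_constrastive_divergence_function(
        visible_layers='vis', hidden_layers='hid', n_gibbs=1, persistent=True,
        optimizer=SimpleGradientDescent(eta=0.01)).compile()
    train_rbm(visible_minibatch)  # Updates the RBM parameters (and the persistent chain state)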
def demo_rbm_mnist(plot=True, test_mode=False):
    """
    In this demo we train an RBM on the MNIST input data (labels are ignored).  We plot the state of a Markov chain
    that is being simultaneously sampled from the RBM, and the parameters of the RBM.

    What you see:
    A plot will appear with 6 subplots.  The subplots are as follows:
    hidden-neg-chain: The activity of the hidden layer for each of the persistent CD chains for drawing negative samples.
    visible-neg-chain: The probabilities of the visible activations corresponding to the state of hidden-neg-chain.
    w: A subset of the weight vectors, reshaped to the shape of the input.
    b: The bias of the hidden units.
    b_rev: The bias of the visible units.
    visible-sample: The probabilities of the visible samples drawn from an independent free-sampling chain (outside
        the training function).

    As learning progresses, visible-neg-chain and visible-sample should increasingly resemble the data.
    """
    set_enable_omniscence(True)
    minibatch_size = 9
    n_epochs = 0.01 if test_mode else 10
    dataset = get_mnist_dataset().process_with(inputs_processor=lambda (x, ): (x.reshape(x.shape[0], -1), ))

    rbm = simple_rbm(
        visible_layer=StochasticLayer('bernoulli'),
        bridge=FullyConnectedBridge(w=0.001 * np.random.randn(28 * 28, 500).astype(theano.config.floatX), b=0, b_rev=0),
        hidden_layer=StochasticLayer('bernoulli')
        )

    train_function = rbm.get_training_fcn(n_gibbs=4, persistent=True, optimizer=SimpleGradientDescent(eta=0.01)).compile()
    sampling_function = rbm.get_free_sampling_fcn(init_visible_state=np.random.randn(9, 28 * 28), return_smooth_visible=True).compile()

    if plot:
        def debug_variable_setter():
            lv = train_function.symbolic.locals()
            return {
                'hidden-neg-chain': lv.sleep_hidden.reshape((-1, 25, 20)),
                'visible-neg-chain': lv.hidden_layer.smooth(lv.bridge.reverse(lv.sleep_hidden)).reshape((-1, 28, 28)),
                'w': lv.bridge.parameters[0].T[:25].reshape((-1, 28, 28)),
                'b': lv.bridge.parameters[1].reshape((25, 20)),
                'b_rev': lv.bridge.parameters[2].reshape((28, 28)),
                }
        train_function.set_debug_variables(debug_variable_setter)

        stream = LiveStream(lambda: dict(train_function.get_debug_values().items()
            + [('visible-sample', visible_samples.reshape((-1, 28, 28)))]), update_every=10)

    for _, visible_data, _ in dataset.training_set.minibatch_iterator(minibatch_size=minibatch_size, epochs=n_epochs, single_channel=True):
        visible_samples, _ = sampling_function()
        train_function(visible_data)
        if plot:
            stream.update()